@@ -174,3 +174,179 @@ defmodule String.Unicode do
174174 [ ]
175175 end
176176end
177+
178+ defmodule String.Graphemes do
179+ @ moduledoc false
180+
181+ cluster_path = Path . join ( __DIR__ , "GraphemeBreakProperty.txt" )
182+ regex = % r / ( ?: ^ ( [ 0 - 9 A- F ] + ) ( ?: \.\. ( [ 0 - 9 A- F ] +) ) ?) \s+; \s( \w+) /m
183+
184+ to_range = fn
185+ first , "" ->
186+ [ << binary_to_integer ( first , 16 ) :: utf8 >> ]
187+ first , last ->
188+ range = binary_to_integer ( first , 16 ) .. binary_to_integer ( last , 16 )
189+ Enum . map ( range , fn ( int ) -> << int :: utf8 >> end )
190+ end
191+
192+ cluster = Enum . reduce File . stream! ( cluster_path ) , HashDict . new , fn ( line , dict ) ->
193+ [ _full , first , last , class ] = Regex . run ( regex , line )
194+
195+ # Skip surrogates
196+ if first == "D800" and last == "DFFF" do
197+ dict
198+ else
199+ list = to_range . ( first , last )
200+ Dict . update ( dict , class , list , & ( & 1 ++ list ) )
201+ end
202+ end
203+
204+ # There is no codepoint marked as Prepend by Unicode 6.3.0
205+ if cluster [ "Prepend" ] do
206+ raise "It seems this new unicode version has added Prepend items. " <>
207+ "Please remove this error and uncomment the code below."
208+ end
209+
210+ # Don't break CRLF
211+ def next_grapheme( << ?\n , ?\r , rest :: binary > > ) do
212+ { "\n \r " , rest }
213+ end
214+
215+ # Break on control
216+ lc codepoint inlist cluster [ "CR" ] ++ cluster [ "LF" ] ++ cluster [ "Control" ] do
217+ def next_grapheme ( << unquote ( codepoint ) , rest :: binary >> = string ) do
218+ { :binary . part ( string , 0 , unquote ( size ( codepoint ) ) ) , rest }
219+ end
220+ end
221+
222+ # Break on Prepend*
223+ # lc codepoint inlist cluster["Prepend"] do
224+ # def next_grapheme(<< unquote(codepoint), rest :: binary >> = string) do
225+ # next_prepend(rest, string, unquote(size(codepoint)))
226+ # end
227+ # end
228+
229+ # Handle Hangul L
230+ lc codepoint inlist cluster[ "L" ] do
231+ def next_grapheme( << unquote( codepoint) , rest :: binary > > = string) do
232+ next_hangul_l ( rest , string , unquote ( size ( codepoint ) ) )
233+ end
234+ end
235+
236+ # Handle Hangul T
237+ lc codepoint inlist cluster [ "T" ] do
238+ def next_grapheme ( << unquote ( codepoint ) , rest :: binary >> = string ) do
239+ next_hangul_t ( rest , string , unquote ( size ( codepoint ) ) )
240+ end
241+ end
242+
243+ # Handle Regional
244+ lc codepoint inlist cluster[ "Regional_Indicator" ] do
245+ def next_grapheme( << unquote( codepoint) , rest :: binary > > = string) do
246+ next_regional ( rest , string , unquote ( size ( codepoint ) ) )
247+ end
248+ end
249+
250+ # Handle extended entries
251+ def next_grapheme ( << cp :: utf8 , rest :: binary >> = string ) do
252+ next_extend ( rest , string , byte_size ( << cp :: utf8 >> ) )
253+ end
254+
255+ def next_grapheme( << cp, rest :: binary > > ) do
256+ { << cp >> , rest }
257+ end
258+
259+ def next_grapheme ( << >> ) do
260+ nil
261+ end
262+
263+ # Handle Hangul L
264+ lc codepoint inlist cluster [ "L" ] do
265+ defp next_hangul_l ( << unquote ( codepoint ) , rest :: binary >> , string , size ) do
266+ next_hangul_l ( rest , string , size + unquote ( size ( codepoint ) ) )
267+ end
268+ end
269+
270+ lc codepoint inlist cluster[ "LV" ] do
271+ defp next_hangul_l ( << unquote ( codepoint ) , rest :: binary >> , string , size ) do
272+ next_hangul_v ( rest , string , size + unquote ( size ( codepoint ) ) )
273+ end
274+ end
275+
276+ lc codepoint inlist cluster[ "LVT" ] do
277+ defp next_hangul_l ( << unquote ( codepoint ) , rest :: binary >> , string , size ) do
278+ next_hangul_t ( rest , string , size + unquote ( size ( codepoint ) ) )
279+ end
280+ end
281+
282+ defp next_hangul_l ( rest , string , size ) do
283+ next_hangul_v ( rest , string , size )
284+ end
285+
286+ # Handle Hangul V
287+ lc codepoint inlist cluster [ "V" ] do
288+ defp next_hangul_v ( << unquote ( codepoint ) , rest :: binary >> , string , size ) do
289+ next_hangul_v ( rest , string , size + unquote ( size ( codepoint ) ) )
290+ end
291+ end
292+
293+ defp next_hangul_v( rest, string , size ) do
294+ next_hangul_t ( rest , string , size )
295+ end
296+
297+ # Handle Hangul T
298+ lc codepoint inlist cluster [ "T" ] do
299+ defp next_hangul_t ( << unquote ( codepoint ) , rest :: binary >> , string , size ) do
300+ next_hangul_t ( rest , string , size + unquote ( size ( codepoint ) ) )
301+ end
302+ end
303+
304+ defp next_hangul_t( rest, string , size ) do
305+ next_extend ( rest , string , size )
306+ end
307+
308+ # Handle regional
309+ lc codepoint inlist cluster [ "Regional_Indicator" ] do
310+ defp next_regional ( << unquote ( codepoint ) , rest :: binary >> , string , size ) do
311+ next_regional ( rest , string , size + unquote ( size ( codepoint ) ) )
312+ end
313+ end
314+
315+ defp next_regional( rest, string , size ) do
316+ next_extend ( rest , string , size )
317+ end
318+
319+ # Handle Extend+SpacingMark
320+ lc codepoint inlist cluster [ "Extend" ] ++ cluster [ "SpacingMark" ] do
321+ defp next_extend ( << unquote ( codepoint ) , rest :: binary >> , string , size ) do
322+ next_extend ( rest , string , size + unquote ( size ( codepoint ) ) )
323+ end
324+ end
325+
326+ defp next_extend( rest, string , size ) do
327+ { :binary . part ( string , 0 , size ) , rest }
328+ end
329+
330+ # Handle Prepend
331+ # lc codepoint inlist cluster["Prepend"] do
332+ # defp next_prepend(<< unquote(codepoint), rest :: binary >>, string, size) do
333+ # next_prepend(rest, string, size + unquote(size(codepoint)))
334+ # end
335+ # end
336+ #
337+ # defp next_prepend(rest, string, size) do
338+ # { :binary.part(string, 0, size), rest }
339+ # end
340+
341+ def graphemes ( binary ) when is_binary ( binary ) do
342+ do_graphemes ( next_grapheme ( binary ) )
343+ end
344+
345+ defp do_graphemes ( { c , rest } ) do
346+ [ c | do_graphemes ( next_grapheme ( rest ) ) ]
347+ end
348+
349+ defp do_graphemes ( nil ) do
350+ [ ]
351+ end
352+ end
0 commit comments