|
1 | | -# This file has its own compilation step because it needs to parse |
2 | | -# String.Unicode and String.Graphemes data and compile digested modules. |
3 | | - |
4 | 1 | defmodule String.Unicode do |
5 | 2 | @moduledoc false |
6 | 3 | def version, do: {6,3,0} |
@@ -177,179 +174,3 @@ defmodule String.Unicode do |
177 | 174 | [] |
178 | 175 | end |
179 | 176 | end |
180 | | - |
defmodule String.Graphemes do
  @moduledoc false

  # Grapheme cluster segmentation.
  #
  # Everything between here and the first `def` runs at compile time: the
  # break-property table is read from GraphemeBreakProperty.txt (next to
  # this file) and one function clause is generated per listed codepoint.
  #
  # Public functions:
  #
  #   * next_grapheme(binary) - returns { grapheme, rest } or :no_grapheme
  #     when the binary is empty.
  #   * graphemes(binary)     - returns the list of all graphemes.

  cluster_path = Path.join(__DIR__, "GraphemeBreakProperty.txt")

  # Captures: first codepoint (hex), optional last codepoint of a range
  # (hex), and the break class name.
  regex = %r/(?:^([0-9A-F]+)(?:\.\.([0-9A-F]+))?)\s+;\s(\w+)/m

  # Expands a hex codepoint, or an inclusive hex range, into a list of
  # UTF-8 encoded binaries. The first clause handles lines that carry a
  # single codepoint, where the optional range-end capture is "".
  to_range = fn
    first, "" ->
      [<< binary_to_integer(first, 16) :: utf8 >>]
    first, last ->
      range = binary_to_integer(first, 16)..binary_to_integer(last, 16)
      Enum.map(range, fn(int) -> << int :: utf8 >> end)
  end

  # Builds a dict mapping each break class name to the list of encoded
  # codepoints that carry it.
  # NOTE(review): the pattern match below raises on any line that does not
  # match `regex`, so the data file is presumably stripped of comment and
  # blank lines -- confirm against the data file.
  cluster = Enum.reduce File.stream!(cluster_path), HashDict.new, fn(line, dict) ->
    [ _full, first, last, class ] = Regex.run(regex, line)

    # Skip surrogates
    if first == "D800" and last == "DFFF" do
      dict
    else
      list = to_range.(first, last)
      Dict.update(dict, class, list, &(&1 ++ list))
    end
  end

  # There is no codepoint marked as Prepend by Unicode 6.3.0
  if cluster["Prepend"] do
    raise "It seems this new unicode version has added Prepend items. " <>
          "Please remove this error and uncomment the code below."
  end

  # Don't break CRLF: a carriage return immediately followed by a line feed
  # is a single grapheme. The reverse order (LF then CR) is not a pair; each
  # of those is emitted on its own by the control clauses below.
  def next_grapheme(<< ?\r, ?\n, rest :: binary >>) do
    { "\r\n", rest }
  end

  # Break on control: a CR, LF or Control codepoint is always a grapheme on
  # its own, so nothing following it is attached.
  lc codepoint inlist cluster["CR"] ++ cluster["LF"] ++ cluster["Control"] do
    def next_grapheme(<< unquote(codepoint), rest :: binary >> = string) do
      { :binary.part(string, 0, unquote(size(codepoint))), rest }
    end
  end

  # Break on Prepend*
  # lc codepoint inlist cluster["Prepend"] do
  #   def next_grapheme(<< unquote(codepoint), rest :: binary >> = string) do
  #     next_prepend(rest, string, unquote(size(codepoint)))
  #   end
  # end

  # Handle Hangul L
  lc codepoint inlist cluster["L"] do
    def next_grapheme(<< unquote(codepoint), rest :: binary >> = string) do
      next_hangul_l(rest, string, unquote(size(codepoint)))
    end
  end

  # Handle Hangul T
  lc codepoint inlist cluster["T"] do
    def next_grapheme(<< unquote(codepoint), rest :: binary >> = string) do
      next_hangul_t(rest, string, unquote(size(codepoint)))
    end
  end

  # Handle Regional
  lc codepoint inlist cluster["Regional_Indicator"] do
    def next_grapheme(<< unquote(codepoint), rest :: binary >> = string) do
      next_regional(rest, string, unquote(size(codepoint)))
    end
  end

  # Handle extended entries: any other valid UTF-8 codepoint starts a
  # grapheme that absorbs the Extend/SpacingMark codepoints following it.
  def next_grapheme(<< cp :: utf8, rest :: binary >> = string) do
    next_extend(rest, string, byte_size(<< cp :: utf8 >>))
  end

  # A byte that is not the start of a valid UTF-8 codepoint is returned
  # as a one-byte grapheme.
  def next_grapheme(<< cp, rest :: binary >>) do
    { <<cp>>, rest }
  end

  def next_grapheme(<<>>) do
    :no_grapheme
  end

  # The private helpers below all take (rest, string, size): `string` is the
  # binary the current grapheme starts at, `size` is the number of bytes of
  # it consumed so far, and `rest` is what remains after those bytes.

  # Handle Hangul L: after an L, accept further L, or an LV / LVT syllable,
  # then continue with the V stage.
  lc codepoint inlist cluster["L"] do
    defp next_hangul_l(<< unquote(codepoint), rest :: binary >>, string, size) do
      next_hangul_l(rest, string, size + unquote(size(codepoint)))
    end
  end

  lc codepoint inlist cluster["LV"] do
    defp next_hangul_l(<< unquote(codepoint), rest :: binary >>, string, size) do
      next_hangul_v(rest, string, size + unquote(size(codepoint)))
    end
  end

  lc codepoint inlist cluster["LVT"] do
    defp next_hangul_l(<< unquote(codepoint), rest :: binary >>, string, size) do
      next_hangul_t(rest, string, size + unquote(size(codepoint)))
    end
  end

  defp next_hangul_l(rest, string, size) do
    next_hangul_v(rest, string, size)
  end

  # Handle Hangul V: consume any run of V, then continue with the T stage.
  lc codepoint inlist cluster["V"] do
    defp next_hangul_v(<< unquote(codepoint), rest :: binary >>, string, size) do
      next_hangul_v(rest, string, size + unquote(size(codepoint)))
    end
  end

  defp next_hangul_v(rest, string, size) do
    next_hangul_t(rest, string, size)
  end

  # Handle Hangul T: consume any run of T, then continue with extenders.
  lc codepoint inlist cluster["T"] do
    defp next_hangul_t(<< unquote(codepoint), rest :: binary >>, string, size) do
      next_hangul_t(rest, string, size + unquote(size(codepoint)))
    end
  end

  defp next_hangul_t(rest, string, size) do
    next_extend(rest, string, size)
  end

  # Handle regional: consume any run of Regional_Indicator codepoints, then
  # continue with extenders.
  lc codepoint inlist cluster["Regional_Indicator"] do
    defp next_regional(<< unquote(codepoint), rest :: binary >>, string, size) do
      next_regional(rest, string, size + unquote(size(codepoint)))
    end
  end

  defp next_regional(rest, string, size) do
    next_extend(rest, string, size)
  end

  # Handle Extend+SpacingMark: consume any run of them, then emit the
  # grapheme as the first `size` bytes of `string`.
  lc codepoint inlist cluster["Extend"] ++ cluster["SpacingMark"] do
    defp next_extend(<< unquote(codepoint), rest :: binary >>, string, size) do
      next_extend(rest, string, size + unquote(size(codepoint)))
    end
  end

  defp next_extend(rest, string, size) do
    { :binary.part(string, 0, size), rest }
  end

  # Handle Prepend
  # lc codepoint inlist cluster["Prepend"] do
  #   defp next_prepend(<< unquote(codepoint), rest :: binary >>, string, size) do
  #     next_prepend(rest, string, size + unquote(size(codepoint)))
  #   end
  # end
  #
  # defp next_prepend(rest, string, size) do
  #   { :binary.part(string, 0, size), rest }
  # end

  # Splits `binary` into the list of its graphemes by calling
  # next_grapheme/1 repeatedly until it returns :no_grapheme.
  def graphemes(binary) when is_binary(binary) do
    do_graphemes(next_grapheme(binary))
  end

  defp do_graphemes({ c, rest }) do
    [c|do_graphemes(next_grapheme(rest))]
  end

  defp do_graphemes(:no_grapheme) do
    []
  end
end
0 commit comments