|
248 | 248 | data_path = Path.join(__DIR__, "UnicodeData.txt") |
249 | 249 |
|
250 | 250 | {codes, non_breakable, decompositions, combining_classes} = |
251 | | - Enum.reduce File.stream!(data_path), {[], [], %{}, []}, fn line, {cacc, wacc, dacc, kacc} -> |
| 251 | + Enum.reduce File.stream!(data_path), {[], [], %{}, %{}}, fn line, {cacc, wacc, dacc, kacc} -> |
252 | 252 | [codepoint, _name, _category, |
253 | 253 | class, _bidi, decomposition, |
254 | 254 | _numeric_1, _numeric_2, _numeric_3, |
@@ -276,16 +276,16 @@ data_path = Path.join(__DIR__, "UnicodeData.txt") |
276 | 276 | decomposition = |
277 | 277 | decomposition |
278 | 278 | |> :binary.split(" ", [:global]) |
279 | | - |> Enum.map(&<<String.to_integer(&1, 16)::utf8>>) |
280 | | - Map.put(dacc, to_binary.(codepoint), decomposition) |
| 279 | + |> Enum.map(&String.to_integer(&1, 16)) |
| 280 | + Map.put(dacc, String.to_integer(codepoint, 16), decomposition) |
281 | 281 | _ -> |
282 | 282 | dacc |
283 | 283 | end |
284 | 284 |
|
285 | 285 | kacc = |
286 | 286 | case Integer.parse(class) do |
287 | 287 | {0, ""} -> kacc |
288 | | - {n, ""} -> [{String.to_integer(codepoint, 16), n}|kacc] |
| 288 | + {n, ""} -> Map.put(kacc, String.to_integer(codepoint, 16), n) |
289 | 289 | end |
290 | 290 |
|
291 | 291 | {cacc, wacc, dacc, kacc} |
@@ -459,7 +459,7 @@ defmodule String.Normalizer do |
459 | 459 | compositions = Enum.reduce File.stream!(exclusions_path), decompositions, fn |
460 | 460 | <<h, _::binary>> = line, acc when h in ?0..?9 or h in ?A..?F -> |
461 | 461 | [codepoint, _] = :binary.split(line, " ") |
462 | | - Map.delete(acc, to_binary.(codepoint)) |
| 462 | + Map.delete(acc, String.to_integer(codepoint, 16)) |
463 | 463 | _, acc -> |
464 | 464 | acc |
465 | 465 | end |
@@ -514,9 +514,10 @@ defmodule String.Normalizer do |
514 | 514 | end |
515 | 515 | end |
516 | 516 |
|
517 | | - for {binary, decomposition} <- decompositions do |
518 | | - defp canonical_order(unquote(binary) <> rest, acc) do |
519 | | - canonical_order(unquote(IO.iodata_to_binary(decomposition)) <> rest, acc) |
| 517 | + for {cp, decomposition} <- decompositions do |
| 518 | + decomposition = decomposition |> Enum.map(&<<&1::utf8>>) |> IO.iodata_to_binary() |
| 519 | + defp canonical_order(unquote(<<cp::utf8>>) <> rest, acc) do |
| 520 | + canonical_order(unquote(decomposition) <> rest, acc) |
520 | 521 | end |
521 | 522 | end |
522 | 523 | defp canonical_order(<<h::utf8, t::binary>>, acc) do |
@@ -566,8 +567,12 @@ defmodule String.Normalizer do |
566 | 567 | end |
567 | 568 | end |
568 | 569 |
|
569 | | - for {composition, [_, _] = binary} <- compositions do |
570 | | - defp compose_one(unquote(IO.iodata_to_binary(binary))), do: unquote(composition) |
| 570 | + # Compositions: |
| 571 | + # 1. We must exclude compositions with a single codepoint |
| 572 | + # 2. We must exclude compositions whose first codepoint has a non-zero combining class |
| 573 | + for {cp, [fst, snd]} <- compositions, |
| 574 | + Map.get(combining_classes, fst, 0) == 0 do |
| 575 | + defp compose_one(unquote(<<fst::utf8, snd::utf8>>)), do: unquote(<<cp::utf8>>) |
571 | 576 | end |
572 | 577 |
|
573 | 578 | defp compose_one(_), do: nil |
|
0 commit comments