Skip to content

Commit 0538e46

Browse files
author
José Valim
committed
Compile unicode modules in parallel
1 parent ebf8403 commit 0538e46

File tree

3 files changed

+177
-181
lines changed

3 files changed

+177
-181
lines changed

Makefile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,10 +73,10 @@ $(KERNEL): lib/elixir/lib/*.ex lib/elixir/lib/*/*.ex
7373
$(Q) cd lib/elixir && ../../$(REBAR) compile
7474

7575
unicode: $(UNICODE)
76-
$(UNICODE): lib/elixir/unicode/unicode.ex lib/elixir/unicode/UnicodeData.txt lib/elixir/unicode/GraphemeBreakProperty.txt
76+
$(UNICODE): lib/elixir/unicode/*
7777
@ echo "==> unicode (compile)";
7878
@ echo "This step can take up to a minute to compile in order to embed the Unicode database"
79-
$(Q) cd lib/elixir && ../../$(ELIXIRC) unicode/unicode.ex -o ebin;
79+
$(Q) cd lib/elixir && ../../$(ELIXIRC) unicode/graphemes.ex unicode/unicode.ex -o ebin;
8080

8181
$(eval $(call APP_TEMPLATE,ex_unit,ExUnit))
8282
$(eval $(call APP_TEMPLATE,eex,EEx))

lib/elixir/unicode/graphemes.ex

Lines changed: 175 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,175 @@
1+
defmodule String.Graphemes do
  @moduledoc false

  # Splits binaries into grapheme clusters.
  #
  # The module body runs at compile time: it reads GraphemeBreakProperty.txt
  # (located next to this file), groups every listed codepoint by its
  # grapheme break class and then generates one function clause per
  # codepoint for the classes that need special treatment. Nothing is read
  # from disk at runtime.
  #
  # Public functions:
  #
  #   * next_grapheme(binary) - returns { grapheme, rest } or :no_grapheme
  #     when the binary is empty
  #   * graphemes(binary) - returns the list of all graphemes in the binary

  cluster_path = Path.join(__DIR__, "GraphemeBreakProperty.txt")

  # Captures: first codepoint (hex), optional last codepoint of a range
  # (hex, "" when the line describes a single codepoint) and the class name.
  regex = %r/(?:^([0-9A-F]+)(?:\.\.([0-9A-F]+))?)\s+;\s(\w+)/m

  # Expands a "first[..last]" hex pair into a list of UTF-8 encoded binaries,
  # one per codepoint.
  to_range = fn
    first, "" ->
      [<< binary_to_integer(first, 16) :: utf8 >>]
    first, last ->
      range = binary_to_integer(first, 16)..binary_to_integer(last, 16)
      Enum.map(range, fn(int) -> << int :: utf8 >> end)
  end

  # Builds a dict of class name => list of UTF-8 encoded codepoints.
  # NOTE(review): every streamed line must match the regex, otherwise the
  # pattern match below fails at compile time. This assumes the bundled data
  # file contains only data lines (no comments or blank lines) - confirm
  # before replacing the file with an upstream copy.
  cluster = Enum.reduce File.stream!(cluster_path), HashDict.new, fn(line, dict) ->
    [ _full, first, last, class ] = Regex.run(regex, line)

    # Skip surrogates (they cannot be encoded as utf8)
    if first == "D800" and last == "DFFF" do
      dict
    else
      list = to_range.(first, last)
      Dict.update(dict, class, list, &(&1 ++ list))
    end
  end

  # There is no codepoint marked as Prepend by Unicode 6.3.0
  if cluster["Prepend"] do
    raise "It seems this new unicode version has added Prepend items. " <>
          "Please remove this error and uncomment the code below."
  end

  # Don't break CRLF. CR is "\r" and LF is "\n", so the sequence that must
  # be kept together is "\r\n". This clause has to be defined before the
  # control clauses below, since "\r" on its own is a break.
  def next_grapheme(<< ?\r, ?\n, rest :: binary >>) do
    { "\r\n", rest }
  end

  # Break on control: a CR, LF or Control codepoint is always a grapheme
  # on its own.
  lc codepoint inlist cluster["CR"] ++ cluster["LF"] ++ cluster["Control"] do
    def next_grapheme(<< unquote(codepoint), rest :: binary >> = string) do
      { :binary.part(string, 0, unquote(size(codepoint))), rest }
    end
  end

  # Break on Prepend*
  # lc codepoint inlist cluster["Prepend"] do
  #   def next_grapheme(<< unquote(codepoint), rest :: binary >> = string) do
  #     next_prepend(rest, string, unquote(size(codepoint)))
  #   end
  # end

  # Handle Hangul L
  lc codepoint inlist cluster["L"] do
    def next_grapheme(<< unquote(codepoint), rest :: binary >> = string) do
      next_hangul_l(rest, string, unquote(size(codepoint)))
    end
  end

  # Handle Hangul T
  lc codepoint inlist cluster["T"] do
    def next_grapheme(<< unquote(codepoint), rest :: binary >> = string) do
      next_hangul_t(rest, string, unquote(size(codepoint)))
    end
  end

  # Handle Regional
  lc codepoint inlist cluster["Regional_Indicator"] do
    def next_grapheme(<< unquote(codepoint), rest :: binary >> = string) do
      next_regional(rest, string, unquote(size(codepoint)))
    end
  end

  # Handle extended entries: any other valid codepoint starts a grapheme
  # that only absorbs trailing Extend/SpacingMark codepoints.
  # NOTE(review): graphemes *starting* with a V, LV or LVT codepoint also
  # end up here, so they never reach the Hangul helpers below.
  def next_grapheme(<< cp :: utf8, rest :: binary >> = string) do
    next_extend(rest, string, byte_size(<< cp :: utf8 >>))
  end

  # A byte that did not match as utf8 above is returned on its own.
  def next_grapheme(<< cp, rest :: binary >>) do
    { <<cp>>, rest }
  end

  def next_grapheme(<<>>) do
    :no_grapheme
  end

  # The private helpers below all share the same arguments: the binary still
  # to be inspected, the original string and the number of bytes of the
  # original string consumed so far by the current grapheme.

  # Handle Hangul L
  lc codepoint inlist cluster["L"] do
    defp next_hangul_l(<< unquote(codepoint), rest :: binary >>, string, size) do
      next_hangul_l(rest, string, size + unquote(size(codepoint)))
    end
  end

  lc codepoint inlist cluster["LV"] do
    defp next_hangul_l(<< unquote(codepoint), rest :: binary >>, string, size) do
      next_hangul_v(rest, string, size + unquote(size(codepoint)))
    end
  end

  lc codepoint inlist cluster["LVT"] do
    defp next_hangul_l(<< unquote(codepoint), rest :: binary >>, string, size) do
      next_hangul_t(rest, string, size + unquote(size(codepoint)))
    end
  end

  defp next_hangul_l(rest, string, size) do
    next_hangul_v(rest, string, size)
  end

  # Handle Hangul V
  lc codepoint inlist cluster["V"] do
    defp next_hangul_v(<< unquote(codepoint), rest :: binary >>, string, size) do
      next_hangul_v(rest, string, size + unquote(size(codepoint)))
    end
  end

  defp next_hangul_v(rest, string, size) do
    next_hangul_t(rest, string, size)
  end

  # Handle Hangul T
  lc codepoint inlist cluster["T"] do
    defp next_hangul_t(<< unquote(codepoint), rest :: binary >>, string, size) do
      next_hangul_t(rest, string, size + unquote(size(codepoint)))
    end
  end

  defp next_hangul_t(rest, string, size) do
    next_extend(rest, string, size)
  end

  # Handle regional
  lc codepoint inlist cluster["Regional_Indicator"] do
    defp next_regional(<< unquote(codepoint), rest :: binary >>, string, size) do
      next_regional(rest, string, size + unquote(size(codepoint)))
    end
  end

  defp next_regional(rest, string, size) do
    next_extend(rest, string, size)
  end

  # Handle Extend+SpacingMark
  lc codepoint inlist cluster["Extend"] ++ cluster["SpacingMark"] do
    defp next_extend(<< unquote(codepoint), rest :: binary >>, string, size) do
      next_extend(rest, string, size + unquote(size(codepoint)))
    end
  end

  # Nothing else to absorb: the grapheme is the first `size` bytes of the
  # original string.
  defp next_extend(rest, string, size) do
    { :binary.part(string, 0, size), rest }
  end

  # Handle Prepend
  # lc codepoint inlist cluster["Prepend"] do
  #   defp next_prepend(<< unquote(codepoint), rest :: binary >>, string, size) do
  #     next_prepend(rest, string, size + unquote(size(codepoint)))
  #   end
  # end
  #
  # defp next_prepend(rest, string, size) do
  #   { :binary.part(string, 0, size), rest }
  # end

  # Returns the list of graphemes in the given binary, in order.
  def graphemes(binary) when is_binary(binary) do
    do_graphemes(next_grapheme(binary))
  end

  defp do_graphemes({ c, rest }) do
    [c|do_graphemes(next_grapheme(rest))]
  end

  defp do_graphemes(:no_grapheme) do
    []
  end
end

lib/elixir/unicode/unicode.ex

Lines changed: 0 additions & 179 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,3 @@
1-
# This file has its own compilation step because it needs to parse
2-
# String.Unicode and String.Graphemes data and compile digested modules.
3-
41
defmodule String.Unicode do
52
@moduledoc false
63
def version, do: {6,3,0}
@@ -177,179 +174,3 @@ defmodule String.Unicode do
177174
[]
178175
end
179176
end
180-
181-
defmodule String.Graphemes do
182-
@moduledoc false
183-
184-
cluster_path = Path.join(__DIR__, "GraphemeBreakProperty.txt")
185-
regex = %r/(?:^([0-9A-F]+)(?:\.\.([0-9A-F]+))?)\s+;\s(\w+)/m
186-
187-
to_range = fn
188-
first, "" ->
189-
[<< binary_to_integer(first, 16) :: utf8 >>]
190-
first, last ->
191-
range = binary_to_integer(first, 16)..binary_to_integer(last, 16)
192-
Enum.map(range, fn(int) -> << int :: utf8 >> end)
193-
end
194-
195-
cluster = Enum.reduce File.stream!(cluster_path), HashDict.new, fn(line, dict) ->
196-
[ _full, first, last, class ] = Regex.run(regex, line)
197-
198-
# Skip surrogates
199-
if first == "D800" and last == "DFFF" do
200-
dict
201-
else
202-
list = to_range.(first, last)
203-
Dict.update(dict, class, list, &(&1 ++ list))
204-
end
205-
end
206-
207-
# There is no codepoint marked as Prepend by Unicode 6.3.0
208-
if cluster["Prepend"] do
209-
raise "It seems this new unicode version has added Prepend items. " <>
210-
"Please remove this error and uncomment the code below."
211-
end
212-
213-
# Don't break CRLF
214-
def next_grapheme(<< ?\n, ?\r, rest :: binary >>) do
215-
{ "\n\r", rest }
216-
end
217-
218-
# Break on control
219-
lc codepoint inlist cluster["CR"] ++ cluster["LF"] ++ cluster["Control"] do
220-
def next_grapheme(<< unquote(codepoint), rest :: binary >> = string) do
221-
{ :binary.part(string, 0, unquote(size(codepoint))), rest }
222-
end
223-
end
224-
225-
# Break on Prepend*
226-
# lc codepoint inlist cluster["Prepend"] do
227-
# def next_grapheme(<< unquote(codepoint), rest :: binary >> = string) do
228-
# next_prepend(rest, string, unquote(size(codepoint)))
229-
# end
230-
# end
231-
232-
# Handle Hangul L
233-
lc codepoint inlist cluster["L"] do
234-
def next_grapheme(<< unquote(codepoint), rest :: binary >> = string) do
235-
next_hangul_l(rest, string, unquote(size(codepoint)))
236-
end
237-
end
238-
239-
# Handle Hangul T
240-
lc codepoint inlist cluster["T"] do
241-
def next_grapheme(<< unquote(codepoint), rest :: binary >> = string) do
242-
next_hangul_t(rest, string, unquote(size(codepoint)))
243-
end
244-
end
245-
246-
# Handle Regional
247-
lc codepoint inlist cluster["Regional_Indicator"] do
248-
def next_grapheme(<< unquote(codepoint), rest :: binary >> = string) do
249-
next_regional(rest, string, unquote(size(codepoint)))
250-
end
251-
end
252-
253-
# Handle extended entries
254-
def next_grapheme(<< cp :: utf8, rest :: binary >> = string) do
255-
next_extend(rest, string, byte_size(<< cp :: utf8 >>))
256-
end
257-
258-
def next_grapheme(<< cp, rest :: binary >>) do
259-
{ <<cp>>, rest }
260-
end
261-
262-
def next_grapheme(<<>>) do
263-
:no_grapheme
264-
end
265-
266-
# Handle Hangul L
267-
lc codepoint inlist cluster["L"] do
268-
defp next_hangul_l(<< unquote(codepoint), rest :: binary >>, string, size) do
269-
next_hangul_l(rest, string, size + unquote(size(codepoint)))
270-
end
271-
end
272-
273-
lc codepoint inlist cluster["LV"] do
274-
defp next_hangul_l(<< unquote(codepoint), rest :: binary >>, string, size) do
275-
next_hangul_v(rest, string, size + unquote(size(codepoint)))
276-
end
277-
end
278-
279-
lc codepoint inlist cluster["LVT"] do
280-
defp next_hangul_l(<< unquote(codepoint), rest :: binary >>, string, size) do
281-
next_hangul_t(rest, string, size + unquote(size(codepoint)))
282-
end
283-
end
284-
285-
defp next_hangul_l(rest, string, size) do
286-
next_hangul_v(rest, string, size)
287-
end
288-
289-
# Handle Hangul V
290-
lc codepoint inlist cluster["V"] do
291-
defp next_hangul_v(<< unquote(codepoint), rest :: binary >>, string, size) do
292-
next_hangul_v(rest, string, size + unquote(size(codepoint)))
293-
end
294-
end
295-
296-
defp next_hangul_v(rest, string, size) do
297-
next_hangul_t(rest, string, size)
298-
end
299-
300-
# Handle Hangul T
301-
lc codepoint inlist cluster["T"] do
302-
defp next_hangul_t(<< unquote(codepoint), rest :: binary >>, string, size) do
303-
next_hangul_t(rest, string, size + unquote(size(codepoint)))
304-
end
305-
end
306-
307-
defp next_hangul_t(rest, string, size) do
308-
next_extend(rest, string, size)
309-
end
310-
311-
# Handle regional
312-
lc codepoint inlist cluster["Regional_Indicator"] do
313-
defp next_regional(<< unquote(codepoint), rest :: binary >>, string, size) do
314-
next_regional(rest, string, size + unquote(size(codepoint)))
315-
end
316-
end
317-
318-
defp next_regional(rest, string, size) do
319-
next_extend(rest, string, size)
320-
end
321-
322-
# Handle Extend+SpacingMark
323-
lc codepoint inlist cluster["Extend"] ++ cluster["SpacingMark"] do
324-
defp next_extend(<< unquote(codepoint), rest :: binary >>, string, size) do
325-
next_extend(rest, string, size + unquote(size(codepoint)))
326-
end
327-
end
328-
329-
defp next_extend(rest, string, size) do
330-
{ :binary.part(string, 0, size), rest }
331-
end
332-
333-
# Handle Prepend
334-
# lc codepoint inlist cluster["Prepend"] do
335-
# defp next_prepend(<< unquote(codepoint), rest :: binary >>, string, size) do
336-
# next_prepend(rest, string, size + unquote(size(codepoint)))
337-
# end
338-
# end
339-
#
340-
# defp next_prepend(rest, string, size) do
341-
# { :binary.part(string, 0, size), rest }
342-
# end
343-
344-
def graphemes(binary) when is_binary(binary) do
345-
do_graphemes(next_grapheme(binary))
346-
end
347-
348-
defp do_graphemes({ c, rest }) do
349-
[c|do_graphemes(next_grapheme(rest))]
350-
end
351-
352-
defp do_graphemes(:no_grapheme) do
353-
[]
354-
end
355-
end

0 commit comments

Comments
 (0)