|
52 | 52 | # feature extraction |
53 | 53 | # ------------------------------------------------------------------------------------------------------------------------- |
54 | 54 | # ngrams from arrays |
55 | | -@expect ngrams(["a", "b", "c"], order = 3) == ["a", "a b", "a b c"] |
| 55 | +@expect ngrams(["a", "b", "c"], order = 3) == ["a", "b", "c", "a b", "b c", "a b c"] |
56 | 56 | @expect ngrams(["a", "b", "c"], order = 3, truncated_start = true) == ["a b c"] |
57 | 57 |
|
58 | | -@expect ngrams(["a", "b", "c"], order = 2) == ["a", "a b", "b c"] |
| 58 | +@expect ngrams(["a", "b", "c"], order = 2) == ["a", "b", "c", "a b", "b c"] |
59 | 59 | @expect ngrams(["a", "b", "c"], order = 2, truncated_start = true) == ["a b", "b c"] |
60 | 60 |
|
61 | 61 | @expect ngrams(["a", "b", "c"], order = 1) == ["a", "b", "c"] |
|
65 | 65 | @expect ngrams(["a"], order = 3, truncated_start = true) == [] |
66 | 66 |
|
67 | 67 | # ngrams from strings |
68 | | -@expect ngrams("abc", order = 3) == ["a", "ab", "abc"] |
| 68 | +@expect ngrams("abc", order = 3) == ["a", "b", "c", "ab", "bc", "abc"] |
69 | 69 | @expect ngrams("abc", order = 3, truncated_start = true) == ["abc"] |
70 | 70 |
|
71 | | -@expect ngrams("abc", order = 2) == ["a", "ab", "bc"] |
| 71 | +@expect ngrams("abc", order = 2) == ["a", "b", "c", "ab", "bc"] |
72 | 72 | @expect ngrams("abc", order = 2, truncated_start = true) == ["ab", "bc"] |
73 | 73 |
|
74 | 74 | @expect ngrams("abc", order = 1) == ["a", "b", "c"] |
75 | 75 | @expect ngrams("abc", order = 1, truncated_start = true) == ["a", "b", "c"] |
76 | 76 |
|
77 | 77 | @expect ngrams("a", order = 3) == ["a"] |
78 | | -@expect ngrams("ab", order = 3) == ["a", "ab"] |
79 | | -@expect ngrams("abcd", order = 3) == ["a", "ab", "abc", "bcd"] |
| 78 | +@expect ngrams("ab", order = 3) == ["a", "b", "ab"] |
| 79 | +@expect ngrams("abcd", order = 3) == ["a", "b", "c", "d", "ab", "bc", "cd", "abc", "bcd"] |
80 | 80 | @expect ngrams("a", order = 3, truncated_start = true) == [] |
81 | 81 | @expect ngrams("ab", order = 3, truncated_start = true) == [] |
82 | 82 | @expect ngrams("abcd", order = 3, truncated_start = true) == ["abc", "bcd"] |
83 | 83 |
|
84 | 84 | @expect ngrams("是的", order = 1) == ["是", "的"] |
85 | | -@expect ngrams("是的", order = 2) == ["是", "是的"] |
86 | | -@expect ngrams("是的", order = 3) == ["是", "是的"] |
| 85 | +@expect ngrams("是的", order = 2) == ["是", "的", "是的"] |
| 86 | +@expect ngrams("是的", order = 3) == ["是", "的", "是的"] |
87 | 87 | @expect ngrams("是的", order = 3, truncated_start = true) == [] |
88 | 88 |
|
89 | 89 | @expect ngrams("陇陇*", order = 1) == ["陇", "陇", "*"] |
90 | | -@expect ngrams("陇陇*", order = 2) == ["陇", "陇陇", "陇*"] |
91 | | -@expect ngrams("陇陇*", order = 3) == ["陇", "陇陇", "陇陇*"] |
| 90 | +@expect ngrams("陇陇*", order = 2) == ["陇", "陇", "*", "陇陇", "陇*"] |
| 91 | +@expect ngrams("陇陇*", order = 3) == ["陇", "陇", "*", "陇陇", "陇*", "陇陇*"] |
92 | 92 | @expect ngrams("陇陇*", order = 3, truncated_start = true) == ["陇陇*"] |
93 | 93 |
|
94 | 94 | @expect ngrams("", order = 1) == [] |
95 | 95 |
|
96 | 96 | # ngram iterator |
97 | | -@expect collect(ngram_iterator("abc", order = 3)) == ["a", "ab", "abc"] |
| 97 | +@expect collect(ngram_iterator("abc", order = 3)) == ["a", "b", "c", "ab", "bc", "abc"] |
98 | 98 | @expect collect(ngram_iterator("abc", order = 3, truncated_start = true)) == ["abc"] |
99 | 99 |
|
100 | | -@expect collect(ngram_iterator("abc", order = 2)) == ["a", "ab", "bc"] |
| 100 | +@expect collect(ngram_iterator("abc", order = 2)) == ["a", "b", "c", "ab", "bc"] |
101 | 101 | @expect collect(ngram_iterator("abc", order = 2, truncated_start = true)) == ["ab", "bc"] |
102 | 102 |
|
103 | 103 | @expect collect(ngram_iterator("abc", order = 1)) == ["a", "b", "c"] |
104 | 104 | @expect collect(ngram_iterator("abc", order = 1, truncated_start = true)) == ["a", "b", "c"] |
105 | 105 |
|
106 | 106 | @expect collect(ngram_iterator("a", order = 3)) == ["a"] |
107 | | -@expect collect(ngram_iterator("ab", order = 3)) == ["a", "ab"] |
108 | | -@expect collect(ngram_iterator("abcd", order = 3)) == ["a", "ab", "abc", "bcd"] |
| 107 | +@expect collect(ngram_iterator("ab", order = 3)) == ["a", "b", "ab"] |
| 108 | +@expect collect(ngram_iterator("abcd", order = 3)) == ["a", "b", "c", "d", "ab", "bc", "cd", "abc", "bcd"] |
109 | 109 | @expect collect(ngram_iterator("a", order = 3, truncated_start = true)) == [] |
110 | 110 | @expect collect(ngram_iterator("ab", order = 3, truncated_start = true)) == [] |
111 | 111 | @expect collect(ngram_iterator("abcd", order = 3, truncated_start = true)) == ["abc", "bcd"] |
112 | 112 |
|
113 | 113 | @expect collect(ngram_iterator("是的", order = 1)) == ["是", "的"] |
114 | | -@expect collect(ngram_iterator("是的", order = 2)) == ["是", "是的"] |
115 | | -@expect collect(ngram_iterator("是的", order = 3)) == ["是", "是的"] |
| 114 | +@expect collect(ngram_iterator("是的", order = 2)) == ["是", "的", "是的"] |
| 115 | +@expect collect(ngram_iterator("是的", order = 3)) == ["是", "的", "是的"] |
116 | 116 | @expect collect(ngram_iterator("是的", order = 3, truncated_start = true)) == [] |
117 | 117 |
|
118 | 118 | @expect collect(ngram_iterator("陇陇*", order = 1)) == ["陇", "陇", "*"] |
119 | | -@expect collect(ngram_iterator("陇陇*", order = 2)) == ["陇", "陇陇", "陇*"] |
120 | | -@expect collect(ngram_iterator("陇陇*", order = 3)) == ["陇", "陇陇", "陇陇*"] |
| 119 | +@expect collect(ngram_iterator("陇陇*", order = 2)) == ["陇", "陇", "*", "陇陇", "陇*"] |
| 120 | +@expect collect(ngram_iterator("陇陇*", order = 3)) == ["陇", "陇", "*", "陇陇", "陇*", "陇陇*"] |
121 | 121 | @expect collect(ngram_iterator("陇陇*", order = 3, truncated_start = true)) == ["陇陇*"] |
122 | 122 |
|
123 | 123 | @expect collect(ngram_iterator("", order = 1)) == [] |
@@ -150,5 +150,3 @@ bkg = make_background(lines, mincount = 2) |
150 | 150 |
|
151 | 151 | include("lid.jl") |
152 | 152 | include("topic.jl") |
153 | | - |
154 | | - |
0 commit comments