Skip to content

Commit 921719d

Browse files
committed
- applied ngrams change to Strings, updated tests
1 parent a2708ce commit 921719d

File tree

3 files changed

+35
-32
lines changed

3 files changed

+35
-32
lines changed

src/features.jl

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -20,19 +20,19 @@ export ngrams, count, tfnorm, sparse_count, norm, znorm, ngram_iterator, ngrams!
2020

2121
immutable NgramStringIterator
2222
string :: String
23-
order :: Int32
23+
order :: Int32
2424
truncated_start :: Bool
2525
end
2626
type StringPosition
27-
start :: Int32
28-
fin :: Int32
29-
nth :: Int32
27+
start :: Int32
28+
fin :: Int32
29+
nth :: Int32
3030
end
3131

3232
function start(ngi :: NgramStringIterator)
3333
if ngi.truncated_start
3434
idx = 1
35-
for i = 1:(ngi.order-1)
35+
for i = 1:(ngi.order-1) # necessary because strings are indexed by bytes, not characters
3636
idx = nextind(ngi.string, idx)
3737
end
3838
return StringPosition(1, idx, ngi.order)
@@ -41,14 +41,20 @@ function start(ngi :: NgramStringIterator)
4141
end
4242
end
4343

44-
done(ngi :: NgramStringIterator, position) = position.fin > endof(ngi.string)
44+
done(ngi :: NgramStringIterator, position) = position.nth > ngi.order || position.fin > endof(ngi.string)
4545
function next(ngi :: NgramStringIterator, position)
4646
str = make_string(ngi.string, position.start, position.fin)
47-
if position.nth >= ngi.order
48-
position.start = nextind(ngi.string, position.start)
47+
48+
if position.fin >= endof(ngi.string)
49+
position.start = 0
50+
position.fin = 1
51+
for i = 1:position.nth-1
52+
position.fin = nextind(ngi.string, position.fin)
53+
end
54+
position.nth += 1
4955
end
50-
position.nth += 1
51-
position.fin = nextind(ngi.string, position.fin)
56+
position.start = nextind(ngi.string, position.start)
57+
position.fin = nextind(ngi.string, position.fin)
5258
return str, position
5359
end
5460

@@ -100,7 +106,6 @@ function sparse_count(text, bkg)
100106
return vec
101107
end
102108

103-
104109
function dict_count(tokens)
105110
map = DefaultDict{String,Int32}()
106111
for w in tokens

test/lid.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ confmat = DefaultDict(String, DefaultDict{String, Int32}, () -> DefaultDict(Stri
1919
res = test_classification(model, lazy_map(x -> fextractor(lid_iterating_tokenizer(x)), test), test_truth, record = (t, h) -> confmat[t][h] += 1) * 100.0
2020
@info @sprintf("mira test set error rate: %7.3f", res)
2121
print_confusion_matrix(confmat)
22-
@expect abs(res - 0.596) < 0.01
22+
@expect abs(res - 0.700) < 0.01
2323

2424
# List specific errors
2525
# for (text, t) in zip(test, test_truth)

test/runtests.jl

Lines changed: 18 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -52,10 +52,10 @@ end
5252
# feature extraction
5353
# -------------------------------------------------------------------------------------------------------------------------
5454
# ngrams from arrays
55-
@expect ngrams(["a", "b", "c"], order = 3) == ["a", "a b", "a b c"]
55+
@expect ngrams(["a", "b", "c"], order = 3) == ["a", "b", "c", "a b", "b c", "a b c"]
5656
@expect ngrams(["a", "b", "c"], order = 3, truncated_start = true) == ["a b c"]
5757

58-
@expect ngrams(["a", "b", "c"], order = 2) == ["a", "a b", "b c"]
58+
@expect ngrams(["a", "b", "c"], order = 2) == ["a", "b", "c", "a b", "b c"]
5959
@expect ngrams(["a", "b", "c"], order = 2, truncated_start = true) == ["a b", "b c"]
6060

6161
@expect ngrams(["a", "b", "c"], order = 1) == ["a", "b", "c"]
@@ -65,59 +65,59 @@ end
6565
@expect ngrams(["a"], order = 3, truncated_start = true) == []
6666

6767
# ngrams from strings
68-
@expect ngrams("abc", order = 3) == ["a", "ab", "abc"]
68+
@expect ngrams("abc", order = 3) == ["a", "b", "c", "ab", "bc", "abc"]
6969
@expect ngrams("abc", order = 3, truncated_start = true) == ["abc"]
7070

71-
@expect ngrams("abc", order = 2) == ["a", "ab", "bc"]
71+
@expect ngrams("abc", order = 2) == ["a", "b", "c", "ab", "bc"]
7272
@expect ngrams("abc", order = 2, truncated_start = true) == ["ab", "bc"]
7373

7474
@expect ngrams("abc", order = 1) == ["a", "b", "c"]
7575
@expect ngrams("abc", order = 1, truncated_start = true) == ["a", "b", "c"]
7676

7777
@expect ngrams("a", order = 3) == ["a"]
78-
@expect ngrams("ab", order = 3) == ["a", "ab"]
79-
@expect ngrams("abcd", order = 3) == ["a", "ab", "abc", "bcd"]
78+
@expect ngrams("ab", order = 3) == ["a", "b", "ab"]
79+
@expect ngrams("abcd", order = 3) == ["a", "b", "c", "d", "ab", "bc", "cd", "abc", "bcd"]
8080
@expect ngrams("a", order = 3, truncated_start = true) == []
8181
@expect ngrams("ab", order = 3, truncated_start = true) == []
8282
@expect ngrams("abcd", order = 3, truncated_start = true) == ["abc", "bcd"]
8383

8484
@expect ngrams("是的", order = 1) == ["是", "的"]
85-
@expect ngrams("是的", order = 2) == ["是", "是的"]
86-
@expect ngrams("是的", order = 3) == ["是", "是的"]
85+
@expect ngrams("是的", order = 2) == ["是", "的", "是的"]
86+
@expect ngrams("是的", order = 3) == ["是", "的", "是的"]
8787
@expect ngrams("是的", order = 3, truncated_start = true) == []
8888

8989
@expect ngrams("陇陇*", order = 1) == ["陇", "陇", "*"]
90-
@expect ngrams("陇陇*", order = 2) == ["陇", "陇陇", "陇*"]
91-
@expect ngrams("陇陇*", order = 3) == ["陇", "陇陇", "陇陇*"]
90+
@expect ngrams("陇陇*", order = 2) == ["陇", "陇", "*", "陇陇", "陇*"]
91+
@expect ngrams("陇陇*", order = 3) == ["陇", "陇", "*", "陇陇", "陇*", "陇陇*"]
9292
@expect ngrams("陇陇*", order = 3, truncated_start = true) == ["陇陇*"]
9393

9494
@expect ngrams("", order = 1) == []
9595

9696
# ngram iterator
97-
@expect collect(ngram_iterator("abc", order = 3)) == ["a", "ab", "abc"]
97+
@expect collect(ngram_iterator("abc", order = 3)) == ["a", "b", "c", "ab", "bc", "abc"]
9898
@expect collect(ngram_iterator("abc", order = 3, truncated_start = true)) == ["abc"]
9999

100-
@expect collect(ngram_iterator("abc", order = 2)) == ["a", "ab", "bc"]
100+
@expect collect(ngram_iterator("abc", order = 2)) == ["a", "b", "c", "ab", "bc"]
101101
@expect collect(ngram_iterator("abc", order = 2, truncated_start = true)) == ["ab", "bc"]
102102

103103
@expect collect(ngram_iterator("abc", order = 1)) == ["a", "b", "c"]
104104
@expect collect(ngram_iterator("abc", order = 1, truncated_start = true)) == ["a", "b", "c"]
105105

106106
@expect collect(ngram_iterator("a", order = 3)) == ["a"]
107-
@expect collect(ngram_iterator("ab", order = 3)) == ["a", "ab"]
108-
@expect collect(ngram_iterator("abcd", order = 3)) == ["a", "ab", "abc", "bcd"]
107+
@expect collect(ngram_iterator("ab", order = 3)) == ["a", "b", "ab"]
108+
@expect collect(ngram_iterator("abcd", order = 3)) == ["a", "b", "c", "d", "ab", "bc", "cd", "abc", "bcd"]
109109
@expect collect(ngram_iterator("a", order = 3, truncated_start = true)) == []
110110
@expect collect(ngram_iterator("ab", order = 3, truncated_start = true)) == []
111111
@expect collect(ngram_iterator("abcd", order = 3, truncated_start = true)) == ["abc", "bcd"]
112112

113113
@expect collect(ngram_iterator("是的", order = 1)) == ["是", "的"]
114-
@expect collect(ngram_iterator("是的", order = 2)) == ["是", "是的"]
115-
@expect collect(ngram_iterator("是的", order = 3)) == ["是", "是的"]
114+
@expect collect(ngram_iterator("是的", order = 2)) == ["是", "的", "是的"]
115+
@expect collect(ngram_iterator("是的", order = 3)) == ["是", "的", "是的"]
116116
@expect collect(ngram_iterator("是的", order = 3, truncated_start = true)) == []
117117

118118
@expect collect(ngram_iterator("陇陇*", order = 1)) == ["陇", "陇", "*"]
119-
@expect collect(ngram_iterator("陇陇*", order = 2)) == ["陇", "陇陇", "陇*"]
120-
@expect collect(ngram_iterator("陇陇*", order = 3)) == ["陇", "陇陇", "陇陇*"]
119+
@expect collect(ngram_iterator("陇陇*", order = 2)) == ["陇", "陇", "*", "陇陇", "陇*"]
120+
@expect collect(ngram_iterator("陇陇*", order = 3)) == ["陇", "陇", "*", "陇陇", "陇*", "陇陇*"]
121121
@expect collect(ngram_iterator("陇陇*", order = 3, truncated_start = true)) == ["陇陇*"]
122122

123123
@expect collect(ngram_iterator("", order = 1)) == []
@@ -150,5 +150,3 @@ bkg = make_background(lines, mincount = 2)
150150

151151
include("lid.jl")
152152
include("topic.jl")
153-
154-

0 commit comments

Comments
 (0)