Skip to content

Commit 326f5fa

Browse files
committed
Corrected q-gram similarity and distance
1 parent a61d6ba commit 326f5fa

File tree

4 files changed

+126
-48
lines changed

4 files changed

+126
-48
lines changed

README.md

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ Currently implemeted:
66
- Levenshtein edit distance;
77
- Jaro-Winkler similarity;
88
- Longest Common Subsequence edit distance;
9-
- Q-Gram (Jaccard index);
9+
- Q-Gram (Ukkonen);
1010
- n-Gram distance (Kondrak).
1111

1212
## Download
@@ -90,7 +90,10 @@ public class MyApp {
9090

9191
## Q-Gram
9292

93-
Q-Gram similarity, not to confuse with N-Gram distance defined by Kondrak (below), is the relative number of n-grams both strings have in common. It is thus the Jaccard index between the strings considered as sets of n-grams. The computed similarity and distance are relative value (between 0 and 1).
93+
Q-gram similarity and distance, as defined by Ukkonen in "Approximate string-matching with q-grams and maximal matches":
94+
http://www.sciencedirect.com/science/article/pii/0304397592901434
95+
96+
The distance between two strings is defined as the L1 norm of the difference of their profiles (the number of occurrences of each k-shingle). Q-gram distance is a lower bound on Levenshtein distance, but can be computed in O(|A| + |B|), where Levenshtein requires O(|A|.|B|).
9497

9598
```java
9699
import info.debatty.java.stringsimilarity.*;
@@ -99,11 +102,14 @@ public class MyApp {
99102

100103
public static void main(String[] args) {
101104
QGram dig = new QGram(2);
102-
103-
// Should be 2: CD and CE
105+
106+
// AB BC CD CE
107+
// 1 1 1 0
108+
// 1 1 0 1
109+
// Total: 2
104110
System.out.println(dig.absoluteDistance("ABCD", "ABCE"));
105-
106-
// Should be 0.5 (2 / 4)
111+
112+
// 2 / (3 + 3) = 0.33333
107113
System.out.println(dig.distance("ABCD", "ABCE"));
108114
}
109115
}

pom.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
<modelVersion>4.0.0</modelVersion>
88
<groupId>info.debatty</groupId>
99
<artifactId>java-string-similarity</artifactId>
10-
<version>0.4</version>
10+
<version>0.5</version>
1111
<packaging>jar</packaging>
1212

1313
<name>${project.artifactId}</name>

src/main/java/info/debatty/java/stringsimilarity/KShingling.java

Lines changed: 78 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22

33
import java.io.Serializable;
44
import java.security.InvalidParameterException;
5-
import java.util.ArrayList;
65
import java.util.HashSet;
76
import java.util.regex.Pattern;
87
import java.util.Set;
@@ -30,17 +29,33 @@ public static void main(String[] args) {
3029
ks.parse(s2);
3130
System.out.println(ks.toString());
3231

33-
for (boolean b : ks.booleanVectorOf(s1)) {
34-
System.out.print(b ? "1" : "0");
35-
}
36-
System.out.print("\n");
32+
printArray(ks.booleanVectorOf(s1));
33+
printArray(ks.booleanVectorOf(s2));
34+
printArray(ks.profileOf(s1));
35+
36+
ks.add("This should trigger an exception!");
37+
}
38+
39+
public static int countOccurences(String substring, String str){
40+
return (str.length() - str.replace(substring, "").length()) / substring.length();
41+
}
42+
43+
public static void printArray(boolean[] a) {
3744

38-
for (boolean b : ks.booleanVectorOf(s2)) {
45+
System.out.print("[");
46+
for (boolean b : a) {
3947
System.out.print(b ? "1" : "0");
4048
}
41-
System.out.print("\n");
49+
System.out.println("]");
50+
}
51+
52+
public static void printArray(int[] a) {
4253

43-
ks.add("This should trigger an exception!");
54+
System.out.print("[");
55+
for (int i : a) {
56+
System.out.print("" + i + "\t");
57+
}
58+
System.out.println("]");
4459
}
4560

4661
protected int k = 5;
@@ -74,7 +89,17 @@ public final void setK(int k) {
7489
this.k = k;
7590
}
7691

92+
/**
93+
* Pattern matching one or more consecutive whitespace characters
94+
*/
7795
private static final Pattern spaceReg = Pattern.compile("\\s+");
96+
97+
/**
98+
* Extract all k-shingles from string s and add them to the list of possible
99+
* shingles
100+
* @param s
101+
* @return true
102+
*/
78103
public boolean parse(String s) {
79104
s = spaceReg.matcher(s).replaceAll(" ");
80105
for (int i = 0; i < (s.length() - k + 1); i++) {
@@ -83,6 +108,11 @@ public boolean parse(String s) {
83108
return true;
84109
}
85110

111+
/**
112+
* Add a k-shingle s to the list of possible shingles
113+
* @param s
114+
* @return
115+
*/
86116
@Override
87117
public boolean add(String s) {
88118
if (s.length() != k) {
@@ -93,6 +123,14 @@ public boolean add(String s) {
93123
return super.add(s);
94124
}
95125

126+
/**
127+
* Compute and return the boolean vector representation of string s.
128+
* E.g. if this set contains the shingles [AB, BC, CD, DE]
129+
* and s is ABCD
130+
* This will return [true, true, true, false]
131+
* @param s
132+
* @return
133+
*/
96134
public boolean[] booleanVectorOf(String s) {
97135
boolean[] r = new boolean[this.size()];
98136

@@ -105,6 +143,15 @@ public boolean[] booleanVectorOf(String s) {
105143
return r;
106144
}
107145

146+
/**
147+
* Compute the boolean representation of string s, returned as a set of
148+
* position integers.
149+
* E.g. if this set contains the shingles [AB, BC, CD, DE]
150+
* and s is ABCD
151+
* This will return (0, 1, 2)
152+
* @param s
153+
* @return
154+
*/
108155
public Set<Integer> integerSetOf(String s) {
109156
Set<Integer> set = new HashSet<Integer>();
110157
int i = 0;
@@ -118,4 +165,27 @@ public Set<Integer> integerSetOf(String s) {
118165
return set;
119166
}
120167

168+
/**
169+
* Compute and return the profile of s, as defined by Ukkonen "Approximate
170+
* string-matching with q-grams and maximal matches".
171+
* https://www.cs.helsinki.fi/u/ukkonen/TCS92.pdf
172+
* The profile is the number of occurences of k-shingles, and is used to
173+
* compute q-gram similarity.
174+
* E.g. if this set contains the shingles [AB, BC, CD, DE]
175+
* and s is ABCDAB
176+
* This will return [2, 1, 1, 0]
177+
* @param s
178+
* @return
179+
*/
180+
public int[] profileOf(String s) {
181+
int[] p = new int[this.size()];
182+
int i = 0;
183+
for (String shingle : this) {
184+
p[i] = countOccurences(shingle, s);
185+
i++;
186+
}
187+
188+
return p;
189+
}
190+
121191
}
Lines changed: 35 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1,28 +1,29 @@
11
package info.debatty.java.stringsimilarity;
22

33
/**
4-
* QGram similarity is the relative number of n-grams both strings have in
5-
* common. It is thus the Jaccard index between strings, considered as sets
6-
* of n-grams. The computed similarity and distance are relative value (between
7-
* 0 and 1).
8-
*
4+
* Q-gram similarity and distance.
5+
* Defined by Ukkonen in "Approximate string-matching with q-grams and maximal
6+
* matches", http://www.sciencedirect.com/science/article/pii/0304397592901434
7+
* The distance between two strings is defined as the L1 norm of the difference
8+
* of their profiles (the number of occurrences of each k-shingle).
9+
* Q-gram distance is a lower bound on Levenshtein distance, but can be computed
10+
* in O(|A| + |B|), where Levenshtein requires O(|A|.|B|)
911
* @author Thibault Debatty
1012
*/
1113
public class QGram implements StringSimilarityInterface {
1214

1315
public static void main(String[] args) {
1416
QGram dig = new QGram(2);
1517

16-
// Should be 2: CD and CE
18+
// AB BC CD CE
19+
// 1 1 1 0
20+
// 1 1 0 1
21+
// Total: 2
1722
System.out.println(dig.absoluteDistance("ABCD", "ABCE"));
1823

19-
// Should be 0.5 (2 / 4)
24+
// 2 / (3 + 3) = 0.33333
2025
System.out.println(dig.distance("ABCD", "ABCE"));
2126

22-
// AB BC CD DE BX XB CE
23-
// 2 / 7
24-
System.out.println(dig.similarity("ABCDE", "ABXBCE"));
25-
2627
System.out.println(dig.similarity(
2728
"High Qua1ityMedications Discount On All Reorders = Best Deal Ever! Viagra50/100mg - $1.85 071",
2829
"High Qua1ityMedications Discount On All Reorders = Best Deal Ever! Viagra50/100mg - $1.85 7z3"));
@@ -45,6 +46,14 @@ public double similarity(String s1, String s2) {
4546

4647
@Override
4748
public double distance(String s1, String s2) {
49+
return dist(s1, s2, false);
50+
}
51+
52+
public int absoluteDistance(String s1, String s2) {
53+
return (int) dist(s1, s2, true);
54+
}
55+
56+
protected double dist(String s1, String s2, boolean abs) {
4857
if (s1.length() < n || s2.length() < n) {
4958
return 0;
5059
}
@@ -53,35 +62,28 @@ public double distance(String s1, String s2) {
5362
sh.parse(s1);
5463
sh.parse(s2);
5564

56-
boolean[] b1 = sh.booleanVectorOf(s1);
57-
boolean[] b2 = sh.booleanVectorOf(s2);
65+
int[] p1 = sh.profileOf(s1);
66+
int[] p2 = sh.profileOf(s2);
67+
5868

5969
int d = 0;
60-
for (int i = 0; i < b1.length; i++) {
61-
if (b1[i] != b2[i]) {
62-
d++;
63-
}
70+
for (int i = 0; i < p1.length; i++) {
71+
d += Math.abs(p1[i] - p2[i]);
6472
}
6573

66-
return ((double) d) / sh.size();
67-
}
68-
69-
public int absoluteDistance(String s1, String s2) {
70-
KShingling sh = new KShingling(n);
71-
sh.parse(s1);
72-
sh.parse(s2);
73-
74-
boolean[] b1 = sh.booleanVectorOf(s1);
75-
boolean[] b2 = sh.booleanVectorOf(s2);
74+
if (abs) {
75+
return d;
76+
}
7677

77-
int d = 0;
78-
for (int i = 0; i < b1.length; i++) {
79-
if (b1[i] != b2[i]) {
80-
d++;
81-
}
78+
int sum = 0;
79+
for (int i : p1) {
80+
sum += i;
81+
}
82+
for (int i : p2) {
83+
sum += i;
8284
}
8385

84-
return d;
86+
return (double) d / sum;
8587
}
8688

8789
}

0 commit comments

Comments
 (0)