Skip to content

Commit f867e70

Browse files
committed
Added Jaccard similarity
1 parent 326f5fa commit f867e70

File tree

2 files changed

+94
-1
lines changed

2 files changed

+94
-1
lines changed
Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
/*
2+
* The MIT License
3+
*
4+
* Copyright 2015 tibo.
5+
*
6+
* Permission is hereby granted, free of charge, to any person obtaining a copy
7+
* of this software and associated documentation files (the "Software"), to deal
8+
* in the Software without restriction, including without limitation the rights
9+
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10+
* copies of the Software, and to permit persons to whom the Software is
11+
* furnished to do so, subject to the following conditions:
12+
*
13+
* The above copyright notice and this permission notice shall be included in
14+
* all copies or substantial portions of the Software.
15+
*
16+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19+
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20+
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21+
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22+
* THE SOFTWARE.
23+
*/
24+
25+
package info.debatty.java.stringsimilarity;
26+
27+
/**
28+
*
29+
* @author Thibault Debatty
30+
*/
31+
public class Jaccard implements StringSimilarityInterface {
32+
33+
/**
34+
* @param args the command line arguments
35+
*/
36+
public static void main(String[] args) {
37+
Jaccard j2 = new Jaccard(2);
38+
39+
// AB BC CD DE DF
40+
// 1 1 1 1 0
41+
// 1 1 1 0 1
42+
// => 3 / 5 = 0.6
43+
System.out.println(j2.similarity("ABCDE", "ABCDF"));
44+
}
45+
46+
private int k;
47+
48+
/**
49+
* The strings are first transformed into sets of k-shingles (sequences of k
50+
* characters), then Jaccard index is computed as |A inter B| / |A union B|.
51+
* The default value of k is 3.
52+
*
53+
* @param k
54+
*/
55+
public Jaccard(int k) {
56+
this.k = k;
57+
}
58+
59+
public Jaccard() {
60+
this.k = 3;
61+
}
62+
63+
public double similarity(String s1, String s2) {
64+
KShingling ks = new KShingling(this.k);
65+
ks.parse(s1);
66+
ks.parse(s2);
67+
68+
boolean[] v1 = ks.booleanVectorOf(s1);
69+
boolean[] v2 = ks.booleanVectorOf(s2);
70+
71+
int inter = 0;
72+
int union = 0;
73+
for (int i = 0; i < v1.length; i++) {
74+
if (v1[i] || v2[i]) {
75+
union++;
76+
77+
if (v1[i] && v2[i]) {
78+
inter++;
79+
}
80+
}
81+
}
82+
83+
return (double) inter / union;
84+
85+
}
86+
87+
public double distance(String s1, String s2) {
88+
return 1.0 - similarity(s1, s2);
89+
}
90+
91+
}

src/main/java/info/debatty/java/stringsimilarity/QGram.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@ public static void main(String[] args) {
2424
// 2 / (3 + 3) = 0.33333
2525
System.out.println(dig.distance("ABCD", "ABCE"));
2626

27+
System.out.println(dig.similarity("", "QSDFGHJKLM"));
28+
2729
System.out.println(dig.similarity(
2830
"High Qua1ityMedications Discount On All Reorders = Best Deal Ever! Viagra50/100mg - $1.85 071",
2931
"High Qua1ityMedications Discount On All Reorders = Best Deal Ever! Viagra50/100mg - $1.85 7z3"));
@@ -55,7 +57,7 @@ public int absoluteDistance(String s1, String s2) {
5557

5658
protected double dist(String s1, String s2, boolean abs) {
5759
if (s1.length() < n || s2.length() < n) {
58-
return 0;
60+
return 1;
5961
}
6062

6163
KShingling sh = new KShingling(n);

0 commit comments

Comments
 (0)