Skip to content

Commit 44dd2b8

Browse files
committed
Added SorensenDice coefficient
1 parent f867e70 commit 44dd2b8

File tree

3 files changed

+108
-6
lines changed

3 files changed

+108
-6
lines changed

src/main/java/info/debatty/java/stringsimilarity/Jaccard.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ public static void main(String[] args) {
4343
System.out.println(j2.similarity("ABCDE", "ABCDF"));
4444
}
4545

46-
private int k;
46+
private final int k;
4747

4848
/**
4949
* The strings are first transformed into sets of k-shingles (sequences of k

src/main/java/info/debatty/java/stringsimilarity/QGram.java

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -31,14 +31,14 @@ public static void main(String[] args) {
3131
"High Qua1ityMedications Discount On All Reorders = Best Deal Ever! Viagra50/100mg - $1.85 7z3"));
3232
}
3333

34-
private int n;
34+
private final int k;
3535

3636
public QGram(int n) {
37-
this.n = n;
37+
this.k = n;
3838
}
3939

4040
public QGram() {
41-
this.n = 3;
41+
this.k = 3;
4242
}
4343

4444
@Override
@@ -56,11 +56,11 @@ public int absoluteDistance(String s1, String s2) {
5656
}
5757

5858
protected double dist(String s1, String s2, boolean abs) {
59-
if (s1.length() < n || s2.length() < n) {
59+
if (s1.length() < k || s2.length() < k) {
6060
return 1;
6161
}
6262

63-
KShingling sh = new KShingling(n);
63+
KShingling sh = new KShingling(k);
6464
sh.parse(s1);
6565
sh.parse(s2);
6666

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
/*
2+
* The MIT License
3+
*
4+
* Copyright 2015 tibo.
5+
*
6+
* Permission is hereby granted, free of charge, to any person obtaining a copy
7+
* of this software and associated documentation files (the "Software"), to deal
8+
* in the Software without restriction, including without limitation the rights
9+
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10+
* copies of the Software, and to permit persons to whom the Software is
11+
* furnished to do so, subject to the following conditions:
12+
*
13+
* The above copyright notice and this permission notice shall be included in
14+
* all copies or substantial portions of the Software.
15+
*
16+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19+
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20+
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21+
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22+
* THE SOFTWARE.
23+
*/
24+
25+
package info.debatty.java.stringsimilarity;
26+
27+
/**
28+
* Sorensen-Dice coefficien, aka Sørensen index, Dice's coefficient or
29+
* Czekanowski's binary (non-quantitative) index.
30+
*
31+
* The strings are first converted to boolean sets of k-shingles (strings of k
32+
* characters), then the similarity is computed as 2 |A inter B| / (|A| + |B|).
33+
* Attention: Sorensen-Dice distance (and similarity) does not satisfy
34+
* triangle inequality.
35+
*
36+
* @author Thibault Debatty
37+
*/
38+
public class SorensenDice implements StringSimilarityInterface {
39+
40+
/**
41+
* @param args the command line arguments
42+
*/
43+
public static void main(String[] args) {
44+
SorensenDice sd = new SorensenDice(2);
45+
46+
// AB BC CD DE DF FG
47+
// 1 1 1 1 0 0
48+
// 1 1 1 0 1 1
49+
// => 2 x 3 / (4 + 5) = 6/9 = 0.6666
50+
System.out.println(sd.similarity("ABCDE", "ABCDFG"));
51+
}
52+
53+
private final int k;
54+
55+
public SorensenDice(int k) {
56+
this.k = k;
57+
}
58+
59+
public SorensenDice() {
60+
this.k = 3;
61+
}
62+
63+
/**
64+
* Compute Sorensen-Dice coefficient 2 |A inter B| / (|A| + |B|).
65+
* @param s1
66+
* @param s2
67+
* @return
68+
*/
69+
public double similarity(String s1, String s2) {
70+
KShingling ks = new KShingling(this.k);
71+
72+
ks.parse(s1);
73+
ks.parse(s2);
74+
75+
boolean[] v1 = ks.booleanVectorOf(s1);
76+
boolean[] v2 = ks.booleanVectorOf(s2);
77+
78+
int inter = 0;
79+
int sum = 0;
80+
for (int i = 0; i < v1.length; i++) {
81+
if (v1[i] && v2[i]) {
82+
inter++;
83+
}
84+
85+
if (v1[i]) {
86+
sum++;
87+
}
88+
89+
if (v2[i]) {
90+
sum++;
91+
}
92+
}
93+
94+
return 2.0 * inter / sum;
95+
96+
}
97+
98+
public double distance(String s1, String s2) {
99+
return 1.0 - similarity(s1, s2);
100+
}
101+
102+
}

0 commit comments

Comments
 (0)