From a8a5b32abb0807d3eff4ba8a32feb90bb2ebe6a2 Mon Sep 17 00:00:00 2001 From: Jeffery Painter Date: Tue, 30 Jan 2024 09:17:20 -0500 Subject: [PATCH 1/7] Update the computation of max-depth to initialize the root concept to a depth of zero --- .../kernel/IntrinsicInfoContentEvaluatorImpl.java | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/IntrinsicInfoContentEvaluatorImpl.java b/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/IntrinsicInfoContentEvaluatorImpl.java index 515cb82a..73248760 100644 --- a/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/IntrinsicInfoContentEvaluatorImpl.java +++ b/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/IntrinsicInfoContentEvaluatorImpl.java @@ -460,8 +460,16 @@ private Set getSubsumers(ConcRel concept, } } } - if (calcDepth) - depthArray[concept.getNodeIndex()] = (short) (parentMaxDepth + 1); + + // Compute the current depth of the concept node + if (calcDepth) { + // Dummy concept has no length + if ( concept.getConceptID().contentEquals("C0000000") ) { + depthArray[concept.getNodeIndex()] = (short) (0); + } else { + depthArray[concept.getNodeIndex()] = (short) (parentMaxDepth + 1); + } + } // add the concept itself to the set of subsumers subsumers.add(concept.getConceptID()); // add this to the cache - copy the key so that this can be gc'ed as From a6cb7e1b4ebfe769d435458b2e3cf8a6248f30e8 Mon Sep 17 00:00:00 2001 From: Jeffery Painter Date: Tue, 30 Jan 2024 09:18:09 -0500 Subject: [PATCH 2/7] Insure that the Jaccard metric has a positive denominator --- .../ctakes/ytex/kernel/metric/JaccardMetric.java | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/metric/JaccardMetric.java b/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/metric/JaccardMetric.java index 7c980d3d..a2d3499e 100644 --- 
a/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/metric/JaccardMetric.java +++ b/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/metric/JaccardMetric.java @@ -42,7 +42,15 @@ public double similarity(String concept1, String concept2, return 0d; double ic1 = simSvc.getIC(concept1, true); double ic2 = simSvc.getIC(concept2, true); - return lcsIC / (ic1 + ic2 - lcsIC); + + // + // Test that we get a positive denominator + // + if ( ic1 + ic2 > lcsIC ) { + return lcsIC / (ic1 + ic2 - lcsIC); + } else { + return 0d; + } } } From 8b56d8a6cd7ffc3d3a26b673bef6976ba071dc09 Mon Sep 17 00:00:00 2001 From: Jeffery Painter Date: Tue, 30 Jan 2024 09:19:51 -0500 Subject: [PATCH 3/7] Update Wu-Palmer to match outputs generated by Perl UMLS::Similarity wup.pm module --- .../ytex/kernel/metric/WuPalmerMetric.java | 27 ++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/metric/WuPalmerMetric.java b/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/metric/WuPalmerMetric.java index d29c097c..396ab523 100644 --- a/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/metric/WuPalmerMetric.java +++ b/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/metric/WuPalmerMetric.java @@ -20,20 +20,41 @@ import java.util.Map; +/** + * Wu-Palmer metric matches results as found in the CPAN UMLS-Similarity::wup module + * + * @author vijay + * @author painter + * + */ public class WuPalmerMetric extends BaseSimilarityMetric { @Override public double similarity(String concept1, String concept2, Map conceptFilter, SimilarityInfo simInfo) { initLCSes(concept1, concept2, simInfo); + if (simInfo.getLcses().size() > 0) { int lcsDepth = 0; + + // Test for the LCS with the greatest depth + // to find the least common subsumer for (String lcs : simInfo.getLcses()) { - int d = simSvc.getDepth(lcs); + + // The depth of the LCS is off by 1 + int d = simSvc.getDepth(lcs) + 1; + // Find the max 
depth of the LCS if (d > lcsDepth) lcsDepth = d; } - double lcsDepth2 = (double) (lcsDepth * 2); - return lcsDepth2 / (lcsDepth2 + (double) (simInfo.getLcsDist()-1)); + + // + // Compute Wu-Palmer Similarity: + // + double lcsDist = simInfo.getLcsDist().doubleValue(); + double c1Depth = simSvc.getDepth(concept1) + lcsDist; + double c2Depth = simSvc.getDepth(concept2) + lcsDist ; + double score = ( 2.0 * (lcsDepth) / ( c1Depth + c2Depth ) ); + return score; } return 0d; } From 1e31ac4d06cf4777583036ebfefde6452b9cdf90 Mon Sep 17 00:00:00 2001 From: Jeffery Painter Date: Tue, 30 Jan 2024 09:21:47 -0500 Subject: [PATCH 4/7] LCH Metric was impacted by inflated max-depth, corrected here --- .../ctakes/ytex/kernel/metric/LCHMetric.java | 33 ++++++++++++++----- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/metric/LCHMetric.java b/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/metric/LCHMetric.java index 3b2cc5e6..8ac3dd07 100644 --- a/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/metric/LCHMetric.java +++ b/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/metric/LCHMetric.java @@ -20,22 +20,36 @@ import java.util.Map; +/** + * + * This metric is an implementation of the semantic relatedness measure described + * by Leacock and Chodorow (1998). 
+ * + * See reference paper: https://aclanthology.org/J06-1003.pdf + * Page 19, Sec 2.5.3 (7) + * + * sim(c1,c2) = -log( len(c1,c2) / (2 * max_depth) ) + * + */ public class LCHMetric extends BaseSimilarityMetric { /** - * log(max depth * 2) + * max depth of the concept graph, excluding the dummy root node */ - double logdm = 0d; + double maxDepth = 0d; @Override public double similarity(String concept1, String concept2, Map conceptFilter, SimilarityInfo simInfo) { - if (logdm != 0d) { + if (maxDepth != 0d) { initLCSes(concept1, concept2, simInfo); if (simInfo.getLcsDist() > 0) { - // double lch = logdm - Math.log((double) simInfo.getLcsDist()); - // // scale to depth - // return lch / logdm; - return 1 - (Math.log((double) simInfo.getLcsDist()) / logdm); + + double length = simInfo.getLcsDist(); + + // Compute the LCH similarity from the length between the concepts + double lch = Math.log(length / (double)(2 * maxDepth)) * -1.0d; + return lch; + } } return 0d; @@ -44,7 +58,10 @@ public double similarity(String concept1, String concept2, public LCHMetric(ConceptSimilarityService simSvc, Integer maxDepth) { super(simSvc); if (maxDepth != null) { - this.logdm = Math.log(2 * maxDepth); + // The cTakes YTEX concept graph adds a dummy node C0000000, + // so the depth is reduced by 1 to compute the max depth + // correctly + this.maxDepth = maxDepth - 1; } } From ce6db4a3761c172d116f56d7cb4e6f4156e3f2e3 Mon Sep 17 00:00:00 2001 From: Jeffery Painter Date: Tue, 30 Jan 2024 09:23:16 -0500 Subject: [PATCH 5/7] Code format to be consistent with other metrics, fixed computational error in the similarity metric where the log value was being computed as 1 - rather than multiplied by -1 --- .../kernel/metric/IntrinsicLCHMetric.java | 36 ++++++++++++------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/metric/IntrinsicLCHMetric.java b/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/metric/IntrinsicLCHMetric.java index cc42404e..144f8b2d 100644 --- 
a/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/metric/IntrinsicLCHMetric.java +++ b/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/metric/IntrinsicLCHMetric.java @@ -24,33 +24,43 @@ * compute intrinsic LCH as in eqn 28 from * http://dx.doi.org/10.1016/j.jbi.2011.03.013 * - * Scale to unit interval + * This version is NOT scaled to the unit interval * * @author vijay * */ public class IntrinsicLCHMetric extends BaseSimilarityMetric { - double logMaxIC2 = 0d; - - public IntrinsicLCHMetric(ConceptSimilarityService simSvc, Double maxIC) { - super(simSvc); - if (maxIC != null) - this.logMaxIC2 = Math.log(2 * maxIC.doubleValue()) + 1d; - } + + double maxIC2 = 0d; @Override public double similarity(String concept1, String concept2, Map conceptFilter, SimilarityInfo simInfo) { - double sim = 0d; - if (logMaxIC2 != 0d) { + + if (maxIC2 != 0d) { + double ic1 = simSvc.getIC(concept1, true); double ic2 = simSvc.getIC(concept2, true); double lcsIC = initLcsIC(concept1, concept2, conceptFilter, simInfo, true); - sim = 1 - (Math.log(ic1 + ic2 - 2 * (lcsIC) + 1) / logMaxIC2); - + + // Compute the Intrinsic LCH metric + double sim = Math.log( (ic1 + ic2 - (2d * lcsIC) + 1d) / maxIC2) * -1.0d; + return sim; + } - return sim; + return 0d; + } + + public IntrinsicLCHMetric(ConceptSimilarityService simSvc, Double maxIC) { + super(simSvc); + if (maxIC != null) { + // Compute the denominator of the Intrinsic LCH metric + this.maxIC2 = 2.0d * maxIC.doubleValue(); + } + } + + } From 864d5c7fad00a413ef8254fb6b1553c2961ff493 Mon Sep 17 00:00:00 2001 From: Jeffery Painter Date: Tue, 30 Jan 2024 09:24:27 -0500 Subject: [PATCH 6/7] Cleanup on the Lin metric code --- .../ctakes/ytex/kernel/metric/LinMetric.java | 33 ++++++++++++------- 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/metric/LinMetric.java b/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/metric/LinMetric.java index fb02d25e..5347ff28 
100644 --- a/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/metric/LinMetric.java +++ b/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/metric/LinMetric.java @@ -46,30 +46,39 @@ public void setIntrinsicIC(boolean intrinsicIC) { @Override public double similarity(String concept1, String concept2, Map conceptFilter, SimilarityInfo simInfo) { - // don't bother if the concept graph is null + + // Test that there is a valid concept graph if (!validCG) return 0d; - // get lcs - double lcsIC = initLcsIC(concept1, concept2, conceptFilter, simInfo, - this.intrinsicIC); - if (lcsIC == 0d) { - return 0d; - } - // get ic of concepts + + // Compute the IC values for each concept double ic1 = simSvc.getIC(concept1, this.intrinsicIC); double ic2 = simSvc.getIC(concept2, this.intrinsicIC); + + // Get the LCS with the lowest IC score + double lcsIC = initLcsIC(concept1, concept2, conceptFilter, simInfo, + this.intrinsicIC); + // if the corpus IC is 0 and the concept is not the root, then we don't // have any IC on the concept and can't measure similarity - return 0 if (!intrinsicIC && ic1 == 0 && !rootConcept.equals(concept1)) return 0d; + if (!intrinsicIC && ic2 == 0 && !rootConcept.equals(concept2)) return 0d; - double denom = ic1 + ic2; - if (denom == 0) - return 0d; - return 2 * lcsIC / denom; + + // Compute the Lin score + double sim = (2d * lcsIC) / ( ic1 + ic2 ); + return sim; + } + /** + * This constructor allows us to specify if we want the standard Lin + * metric or the Intrinsic Lin by passing a boolean flag + * @param simSvc + * @param intrinsicIC if true, then compute the intrinsic Lin metric + */ public LinMetric(ConceptSimilarityService simSvc, boolean intrinsicIC) { super(simSvc); this.intrinsicIC = intrinsicIC; From bc835b8683c8c6a9d90a4fa14904643c3a2254b0 Mon Sep 17 00:00:00 2001 From: Jeffery Painter Date: Wed, 14 Feb 2024 14:42:10 -0500 Subject: [PATCH 7/7] Minor fix to correct depth calculation in WuPalmer --- 
.../apache/ctakes/ytex/kernel/metric/WuPalmerMetric.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/metric/WuPalmerMetric.java b/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/metric/WuPalmerMetric.java index 396ab523..ef09c4d2 100644 --- a/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/metric/WuPalmerMetric.java +++ b/ctakes-ytex/src/main/java/org/apache/ctakes/ytex/kernel/metric/WuPalmerMetric.java @@ -51,8 +51,10 @@ public double similarity(String concept1, String concept2, // Compute Wu-Palmer Similarity: // double lcsDist = simInfo.getLcsDist().doubleValue(); - double c1Depth = simSvc.getDepth(concept1) + lcsDist; - double c2Depth = simSvc.getDepth(concept2) + lcsDist ; + + // Adjust depth by 1 due to fake root node + double c1Depth = simSvc.getDepth(concept1) + 1; + double c2Depth = simSvc.getDepth(concept2) + 1; double score = ( 2.0 * (lcsDepth) / ( c1Depth + c2Depth ) ); return score; }