Skip to content

Commit 5a41f13

Browse files
Condensing cluster hierarchies
Co-authored-by: Ioannis Panagiotas <ioannis.panagiotas@neotechnology.com>
1 parent 09ecbf7 commit 5a41f13

File tree

4 files changed

+312
-4
lines changed

4 files changed

+312
-4
lines changed

algo/src/main/java/org/neo4j/gds/hdbscan/ClusterHierarchy.java

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,9 @@ final class ClusterHierarchy {
3333
private final HugeLongArray size;
3434
private final long nodeCount;
3535

36-
private ClusterHierarchy(
37-
long root, HugeLongArray left,
36+
ClusterHierarchy(
37+
long root,
38+
HugeLongArray left,
3839
HugeLongArray right,
3940
HugeDoubleArray lambda,
4041
HugeLongArray size,
@@ -72,9 +73,9 @@ static ClusterHierarchy create(long nodeCount, HugeObjectArray<Edge> edges) {
7273
lambda.set(adaptedIndex, edge.distance());
7374

7475
var leftSize = sizeFn.apply(l);
75-
var rigthSize = sizeFn.apply(r);
76+
var rightSize = sizeFn.apply(r);
7677

77-
size.set(adaptedIndex, leftSize + rigthSize);
78+
size.set(adaptedIndex, leftSize + rightSize);
7879
}
7980

8081
return new ClusterHierarchy(currentRoot, left, right, lambda, size, nodeCount);
Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
/*
2+
* Copyright (c) "Neo4j"
3+
* Neo4j Sweden AB [http://neo4j.com]
4+
*
5+
* This file is part of Neo4j.
6+
*
7+
* Neo4j is free software: you can redistribute it and/or modify
8+
* it under the terms of the GNU General Public License as published by
9+
* the Free Software Foundation, either version 3 of the License, or
10+
* (at your option) any later version.
11+
*
12+
* This program is distributed in the hope that it will be useful,
13+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
14+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15+
* GNU General Public License for more details.
16+
*
17+
* You should have received a copy of the GNU General Public License
18+
* along with this program. If not, see <http://www.gnu.org/licenses/>.
19+
*/
20+
package org.neo4j.gds.hdbscan;
21+
22+
import org.neo4j.gds.collections.ha.HugeDoubleArray;
23+
import org.neo4j.gds.collections.ha.HugeLongArray;
24+
import org.neo4j.gds.core.utils.paged.HugeAtomicBitSet;
25+
import org.neo4j.gds.core.utils.paged.HugeLongArrayQueue;
26+
27+
class CondenseStep {
28+
private final long nodeCount;
29+
30+
CondenseStep(long nodeCount) {
31+
this.nodeCount = nodeCount;
32+
}
33+
34+
CondensedTree condense(ClusterHierarchy clusterHierarchy, long minClusterSize) {
35+
36+
// Walk through the hierarchy
37+
// at each split if one of the clusters created by the split has fewer points than the minimum cluster size
38+
// if it is the case that we have fewer points than the minimum cluster size we declare it to be
39+
// ‘points falling out of a cluster’
40+
// the larger cluster retain the cluster identity of the parent,
41+
// marking down which points ‘fell out of the cluster’ and at what distance value that happened.
42+
//
43+
// If the split is into two clusters each at least as large as the minimum cluster size
44+
// then we consider that a true cluster split and let that split persist in the tree.
45+
//
46+
// After walking through the whole hierarchy and doing this we end up
47+
// with a much smaller tree with a small number of nodes,
48+
// each of which has data about how the size of the cluster at that node decreases over varying distance.
49+
50+
var clusterHierarchyRoot = clusterHierarchy.root();
51+
var parent = HugeLongArray.newArray(clusterHierarchyRoot + 1);
52+
var lambda = HugeDoubleArray.newArray(clusterHierarchyRoot + 1);
53+
54+
var relabel = HugeLongArray.newArray(nodeCount);
55+
var currentCondensedRoot = nodeCount;
56+
relabel.set(clusterHierarchyRoot - nodeCount, currentCondensedRoot);
57+
58+
var currentCondensedMaxClusterId = nodeCount;
59+
var bfsQueue = HugeLongArrayQueue.newQueue(nodeCount);
60+
var visited = HugeAtomicBitSet.create(clusterHierarchyRoot + 1);
61+
62+
for (var i = clusterHierarchyRoot; i >= nodeCount; i--) {
63+
if (visited.get(i)) {
64+
continue;
65+
}
66+
67+
var left = clusterHierarchy.left(i);
68+
var leftSize = clusterHierarchy.size(left);
69+
var right = clusterHierarchy.right(i);
70+
var rightSize = clusterHierarchy.size(right);
71+
72+
var currentReLabel = relabel.get(i - nodeCount);
73+
var fallingOutLambda = clusterHierarchy.lambda(i);
74+
if (leftSize < minClusterSize && rightSize < minClusterSize) { // both fall out of cluster
75+
fallOut(clusterHierarchy, left, parent, currentReLabel, lambda, fallingOutLambda, bfsQueue, visited);
76+
fallOut(clusterHierarchy, right, parent, currentReLabel, lambda, fallingOutLambda, bfsQueue, visited);
77+
78+
} else if (leftSize < minClusterSize && rightSize >= minClusterSize) { // left falls out, right retains parent cluster id
79+
fallOut(clusterHierarchy, left, parent, currentReLabel, lambda, fallingOutLambda, bfsQueue, visited);
80+
relabel.set(right - nodeCount, currentReLabel);
81+
} else if (leftSize >= minClusterSize && rightSize < minClusterSize) { // left retains parent cluster id, right falls out
82+
relabel.set(left - nodeCount, currentReLabel);
83+
84+
fallOut(clusterHierarchy, right, parent, currentReLabel, lambda, fallingOutLambda, bfsQueue, visited);
85+
86+
} else { // none fall out, both get new cluster ids
87+
var leftClusterId = ++currentCondensedMaxClusterId;
88+
relabel.set(left - nodeCount, leftClusterId);
89+
parent.set(leftClusterId, currentReLabel);
90+
lambda.set(leftClusterId, fallingOutLambda);
91+
92+
var rightClusterId = ++currentCondensedMaxClusterId;
93+
relabel.set(right - nodeCount, rightClusterId);
94+
parent.set(rightClusterId, currentReLabel);
95+
lambda.set(rightClusterId, fallingOutLambda);
96+
}
97+
}
98+
99+
return new CondensedTree(currentCondensedRoot, parent, lambda, currentCondensedMaxClusterId);
100+
}
101+
102+
private void fallOut(
103+
ClusterHierarchy clusterHierarchy,
104+
long nodeToFallOut,
105+
HugeLongArray parent,
106+
long clusterToFallOutFrom,
107+
HugeDoubleArray lambda,
108+
double fallingOutLambda,
109+
HugeLongArrayQueue bfsQueue,
110+
HugeAtomicBitSet visited
111+
) {
112+
if (nodeToFallOut < nodeCount) {
113+
parent.set(nodeToFallOut, clusterToFallOutFrom);
114+
lambda.set(nodeToFallOut, fallingOutLambda);
115+
} else {
116+
// for each descendant of nodeToFallOut - find the leaf (original node) and mark it as fallen out
117+
bfsQueue.add(nodeToFallOut);
118+
while (!bfsQueue.isEmpty()) {
119+
var currentNode = bfsQueue.remove();
120+
var left = clusterHierarchy.left(currentNode);
121+
var right = clusterHierarchy.right(currentNode);
122+
123+
if (left < nodeCount) {
124+
parent.set(left, clusterToFallOutFrom);
125+
lambda.set(left, fallingOutLambda);
126+
} else {
127+
bfsQueue.add(left);
128+
}
129+
130+
if (right < nodeCount) {
131+
parent.set(right, clusterToFallOutFrom);
132+
lambda.set(right, fallingOutLambda);
133+
} else {
134+
bfsQueue.add(right);
135+
}
136+
visited.set(currentNode);
137+
}
138+
}
139+
140+
}
141+
}
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
/*
2+
* Copyright (c) "Neo4j"
3+
* Neo4j Sweden AB [http://neo4j.com]
4+
*
5+
* This file is part of Neo4j.
6+
*
7+
* Neo4j is free software: you can redistribute it and/or modify
8+
* it under the terms of the GNU General Public License as published by
9+
* the Free Software Foundation, either version 3 of the License, or
10+
* (at your option) any later version.
11+
*
12+
* This program is distributed in the hope that it will be useful,
13+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
14+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15+
* GNU General Public License for more details.
16+
*
17+
* You should have received a copy of the GNU General Public License
18+
* along with this program. If not, see <http://www.gnu.org/licenses/>.
19+
*/
20+
package org.neo4j.gds.hdbscan;
21+
22+
import org.neo4j.gds.collections.ha.HugeDoubleArray;
23+
import org.neo4j.gds.collections.ha.HugeLongArray;
24+
25+
class CondensedTree {
26+
27+
private final long root;
28+
private final HugeLongArray parent;
29+
private final HugeDoubleArray lambda;
30+
private final long maximumClusterId;
31+
32+
CondensedTree(long root, HugeLongArray parent, HugeDoubleArray lambda, long maximumClusterId) {
33+
this.root = root;
34+
this.parent = parent;
35+
this.lambda = lambda;
36+
this.maximumClusterId = maximumClusterId;
37+
}
38+
39+
long root() {
40+
return root;
41+
}
42+
43+
long parent(long node) {
44+
return parent.get(node);
45+
}
46+
47+
long fellOutOf(long node) {
48+
return parent.get(node);
49+
}
50+
51+
long maximumClusterId() {
52+
return maximumClusterId;
53+
}
54+
55+
double lambda(long node) {
56+
return lambda.get(node);
57+
}
58+
}
Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
/*
2+
* Copyright (c) "Neo4j"
3+
* Neo4j Sweden AB [http://neo4j.com]
4+
*
5+
* This file is part of Neo4j.
6+
*
7+
* Neo4j is free software: you can redistribute it and/or modify
8+
* it under the terms of the GNU General Public License as published by
9+
* the Free Software Foundation, either version 3 of the License, or
10+
* (at your option) any later version.
11+
*
12+
* This program is distributed in the hope that it will be useful,
13+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
14+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15+
* GNU General Public License for more details.
16+
*
17+
* You should have received a copy of the GNU General Public License
18+
* along with this program. If not, see <http://www.gnu.org/licenses/>.
19+
*/
20+
package org.neo4j.gds.hdbscan;
21+
22+
import org.junit.jupiter.api.Test;
23+
import org.neo4j.gds.collections.ha.HugeDoubleArray;
24+
import org.neo4j.gds.collections.ha.HugeLongArray;
25+
26+
import static org.assertj.core.api.Assertions.assertThat;
27+
28+
class CondenseStepTest {
29+
30+
@Test
31+
void minClusterSizeTwo() {
32+
var nodeCount = 7L;
33+
var root = 12L;
34+
var left = HugeLongArray.of(5, 4, 2, 9, 0, 11);
35+
var right = HugeLongArray.of(6, 7, 3, 8, 1, 10);
36+
var lambda = HugeDoubleArray.of(7d, 8d, 9d, 10d, 11d, 12d);
37+
var size = HugeLongArray.of(2, 3, 2, 5, 2, 7);
38+
39+
var clusterHierarchy = new ClusterHierarchy(root, left, right, lambda, size, nodeCount);
40+
41+
var condensedTree = new CondenseStep(nodeCount).condense(clusterHierarchy, 2L);
42+
43+
assertThat(condensedTree.root()).isEqualTo(7L);
44+
assertThat(condensedTree.maximumClusterId()).isEqualTo(11L);
45+
46+
47+
assertThat(condensedTree.parent(8L)).isEqualTo(7L);
48+
assertThat(condensedTree.lambda(8L)).isEqualTo(12d);
49+
assertThat(condensedTree.parent(9L)).isEqualTo(7L);
50+
assertThat(condensedTree.lambda(9L)).isEqualTo(12d);
51+
52+
assertThat(condensedTree.parent(10L)).isEqualTo(9L);
53+
assertThat(condensedTree.lambda(10L)).isEqualTo(10d);
54+
assertThat(condensedTree.parent(11L)).isEqualTo(9L);
55+
assertThat(condensedTree.lambda(11L)).isEqualTo(10d);
56+
57+
assertThat(condensedTree.fellOutOf(0L)).isEqualTo(8L);
58+
assertThat(condensedTree.lambda(0L)).isEqualTo(11d);
59+
assertThat(condensedTree.fellOutOf(1L)).isEqualTo(8L);
60+
assertThat(condensedTree.lambda(1L)).isEqualTo(11d);
61+
62+
assertThat(condensedTree.fellOutOf(2L)).isEqualTo(10L);
63+
assertThat(condensedTree.lambda(2L)).isEqualTo(9d);
64+
assertThat(condensedTree.fellOutOf(3L)).isEqualTo(10L);
65+
assertThat(condensedTree.lambda(3L)).isEqualTo(9d);
66+
67+
assertThat(condensedTree.fellOutOf(4L)).isEqualTo(11L);
68+
assertThat(condensedTree.lambda(4L)).isEqualTo(8d);
69+
assertThat(condensedTree.fellOutOf(5L)).isEqualTo(11L);
70+
assertThat(condensedTree.lambda(5L)).isEqualTo(7d);
71+
assertThat(condensedTree.fellOutOf(6L)).isEqualTo(11L);
72+
assertThat(condensedTree.lambda(6L)).isEqualTo(7d);
73+
}
74+
75+
@Test
76+
void minClusterSizeThree() {
77+
var nodeCount = 7L;
78+
var root = 12L;
79+
var left = HugeLongArray.of(5, 4, 2, 9, 0, 11);
80+
var right = HugeLongArray.of(6, 7, 3, 8, 1, 10);
81+
var lambda = HugeDoubleArray.of(7d, 8d, 9d, 10d, 11d, 12d);
82+
var size = HugeLongArray.of(2, 3, 2, 5, 2, 7);
83+
84+
var clusterHierarchy = new ClusterHierarchy(root, left, right, lambda, size, nodeCount);
85+
86+
var condensedTree = new CondenseStep(nodeCount).condense(clusterHierarchy, 3L);
87+
88+
assertThat(condensedTree.root()).isEqualTo(7L);
89+
assertThat(condensedTree.maximumClusterId()).isEqualTo(7L);
90+
91+
assertThat(condensedTree.fellOutOf(0L)).isEqualTo(7L);
92+
assertThat(condensedTree.lambda(0L)).isEqualTo(12d);
93+
assertThat(condensedTree.fellOutOf(1L)).isEqualTo(7L);
94+
assertThat(condensedTree.lambda(1L)).isEqualTo(12);
95+
96+
assertThat(condensedTree.fellOutOf(2L)).isEqualTo(7L);
97+
assertThat(condensedTree.lambda(2L)).isEqualTo(10d);
98+
assertThat(condensedTree.fellOutOf(3L)).isEqualTo(7L);
99+
assertThat(condensedTree.lambda(3L)).isEqualTo(10d);
100+
101+
assertThat(condensedTree.fellOutOf(4L)).isEqualTo(7L);
102+
assertThat(condensedTree.lambda(4L)).isEqualTo(8d);
103+
assertThat(condensedTree.fellOutOf(5L)).isEqualTo(7L);
104+
assertThat(condensedTree.lambda(5L)).isEqualTo(8d);
105+
assertThat(condensedTree.fellOutOf(6L)).isEqualTo(7L);
106+
assertThat(condensedTree.lambda(6L)).isEqualTo(8d);
107+
}
108+
}

0 commit comments

Comments
 (0)