Skip to content

Commit 43aa57b

Browse files
Create cluster hierarchy
Co-authored-by: Ioannis Panagiotas <ioannis.panagiotas@neotechnology.com>
1 parent 54cbde9 commit 43aa57b

File tree

4 files changed

+345
-0
lines changed

4 files changed

+345
-0
lines changed
Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
/*
2+
* Copyright (c) "Neo4j"
3+
* Neo4j Sweden AB [http://neo4j.com]
4+
*
5+
* This file is part of Neo4j.
6+
*
7+
* Neo4j is free software: you can redistribute it and/or modify
8+
* it under the terms of the GNU General Public License as published by
9+
* the Free Software Foundation, either version 3 of the License, or
10+
* (at your option) any later version.
11+
*
12+
* This program is distributed in the hope that it will be useful,
13+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
14+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15+
* GNU General Public License for more details.
16+
*
17+
* You should have received a copy of the GNU General Public License
18+
* along with this program. If not, see <http://www.gnu.org/licenses/>.
19+
*/
20+
package org.neo4j.gds.hdbscan;
21+
22+
import org.neo4j.gds.collections.ha.HugeDoubleArray;
23+
import org.neo4j.gds.collections.ha.HugeLongArray;
24+
import org.neo4j.gds.collections.ha.HugeObjectArray;
25+
26+
import java.util.function.Function;
27+
28+
final class ClusterHierarchy {
29+
private final long root;
30+
private final HugeLongArray left;
31+
private final HugeLongArray right;
32+
private final HugeDoubleArray lambda;
33+
private final HugeLongArray size;
34+
private final long nodeCount;
35+
36+
private ClusterHierarchy(
37+
long root, HugeLongArray left,
38+
HugeLongArray right,
39+
HugeDoubleArray lambda,
40+
HugeLongArray size,
41+
long nodeCount
42+
) {
43+
this.root = root;
44+
this.left = left;
45+
this.right = right;
46+
this.lambda = lambda;
47+
this.size = size;
48+
this.nodeCount = nodeCount;
49+
}
50+
51+
static ClusterHierarchy create(long nodeCount, HugeObjectArray<Edge> edges) {
52+
var left = HugeLongArray.newArray(nodeCount);
53+
var right = HugeLongArray.newArray(nodeCount);
54+
var lambda = HugeDoubleArray.newArray(nodeCount);
55+
var size = HugeLongArray.newArray(nodeCount);
56+
57+
var unionFind = new ClusterHierarchyUnionFind(nodeCount);
58+
59+
long currentRoot = -1L;
60+
61+
var sizeFn = (Function<Long, Long>) n -> n < nodeCount ? 1L : size.get(n - nodeCount);
62+
63+
for (var i = 0; i < edges.size(); i++) {
64+
var edge = edges.get(i);
65+
var l = unionFind.find(edge.source());
66+
var r = unionFind.find(edge.target());
67+
68+
currentRoot = unionFind.union(l, r);
69+
var adaptedIndex = currentRoot - nodeCount;
70+
left.set(adaptedIndex, l);
71+
right.set(adaptedIndex, r);
72+
lambda.set(adaptedIndex, edge.distance());
73+
74+
var leftSize = sizeFn.apply(l);
75+
var rigthSize = sizeFn.apply(r);
76+
77+
size.set(adaptedIndex, leftSize + rigthSize);
78+
}
79+
80+
return new ClusterHierarchy(currentRoot, left, right, lambda, size, nodeCount);
81+
}
82+
83+
long root() {
84+
return this.root;
85+
}
86+
87+
long left(long node) {
88+
return left.get(node - nodeCount);
89+
}
90+
91+
long right(long node) {
92+
return right.get(node - nodeCount);
93+
}
94+
95+
long size(long node) {
96+
return node < nodeCount
97+
? 1L
98+
: size.get(node - nodeCount);
99+
}
100+
101+
double lambda(long node) {
102+
return lambda.get(node - nodeCount);
103+
}
104+
105+
}
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
/*
2+
* Copyright (c) "Neo4j"
3+
* Neo4j Sweden AB [http://neo4j.com]
4+
*
5+
* This file is part of Neo4j.
6+
*
7+
* Neo4j is free software: you can redistribute it and/or modify
8+
* it under the terms of the GNU General Public License as published by
9+
* the Free Software Foundation, either version 3 of the License, or
10+
* (at your option) any later version.
11+
*
12+
* This program is distributed in the hope that it will be useful,
13+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
14+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15+
* GNU General Public License for more details.
16+
*
17+
* You should have received a copy of the GNU General Public License
18+
* along with this program. If not, see <http://www.gnu.org/licenses/>.
19+
*/
20+
package org.neo4j.gds.hdbscan;
21+
22+
import org.neo4j.gds.collections.ha.HugeLongArray;
23+
24+
class ClusterHierarchyUnionFind {
25+
26+
private static final long DEFAULT_PARENT_VALUE = -1L;
27+
28+
private final HugeLongArray parents;
29+
private long nextLabel;
30+
31+
public ClusterHierarchyUnionFind(long nodeCount) {
32+
var size = (2 * nodeCount) - 1;
33+
this.parents = HugeLongArray.newArray(size);
34+
this.parents.fill(DEFAULT_PARENT_VALUE);
35+
this.nextLabel = nodeCount;
36+
}
37+
38+
long union(long x, long y) {
39+
parents.set(x, nextLabel);
40+
parents.set(y, nextLabel);
41+
return nextLabel++;
42+
}
43+
44+
long find(long x) {
45+
while (parents.get(x) != DEFAULT_PARENT_VALUE) {
46+
var parent = parents.get(x);
47+
var grandParent = parents.get(parent);
48+
if (grandParent == DEFAULT_PARENT_VALUE) {
49+
return parent;
50+
}
51+
52+
parents.set(x, grandParent);
53+
x = grandParent;
54+
}
55+
return x;
56+
}
57+
}
Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
/*
2+
* Copyright (c) "Neo4j"
3+
* Neo4j Sweden AB [http://neo4j.com]
4+
*
5+
* This file is part of Neo4j.
6+
*
7+
* Neo4j is free software: you can redistribute it and/or modify
8+
* it under the terms of the GNU General Public License as published by
9+
* the Free Software Foundation, either version 3 of the License, or
10+
* (at your option) any later version.
11+
*
12+
* This program is distributed in the hope that it will be useful,
13+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
14+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15+
* GNU General Public License for more details.
16+
*
17+
* You should have received a copy of the GNU General Public License
18+
* along with this program. If not, see <http://www.gnu.org/licenses/>.
19+
*/
20+
package org.neo4j.gds.hdbscan;
21+
22+
import org.assertj.core.api.SoftAssertions;
23+
import org.assertj.core.api.junit.jupiter.SoftAssertionsExtension;
24+
import org.assertj.core.data.Offset;
25+
import org.junit.jupiter.api.Test;
26+
import org.junit.jupiter.api.extension.ExtendWith;
27+
import org.neo4j.gds.collections.ha.HugeObjectArray;
28+
29+
@ExtendWith(SoftAssertionsExtension.class)
30+
class ClusterHierarchyTest {
31+
32+
@Test
33+
void shouldWorkWithLineGraph(SoftAssertions assertions) {
34+
var edges = HugeObjectArray.of(
35+
new Edge(1, 2, 3.),
36+
new Edge(0, 1, 5.)
37+
);
38+
39+
var clusterHierarchy = ClusterHierarchy.create(3, edges);
40+
41+
// 1. `1` and `2` are joined and create new id = 3 --> first set
42+
// 2. `0` and `3` are joined and create new id = 4 --> second set
43+
44+
assertions.assertThat(clusterHierarchy.root()).isEqualTo(4L);
45+
46+
assertions.assertThat(clusterHierarchy.left(4)).isEqualTo(0);
47+
assertions.assertThat(clusterHierarchy.right(4)).isEqualTo(3);
48+
assertions.assertThat(clusterHierarchy.lambda(4)).isEqualTo(5.);
49+
assertions.assertThat(clusterHierarchy.size(4)).isEqualTo(3);
50+
51+
assertions.assertThat(clusterHierarchy.left(3)).isEqualTo(1);
52+
assertions.assertThat(clusterHierarchy.right(3)).isEqualTo(2);
53+
assertions.assertThat(clusterHierarchy.lambda(3)).isEqualTo(3.);
54+
assertions.assertThat(clusterHierarchy.size(3)).isEqualTo(2);
55+
56+
assertions.assertThat(clusterHierarchy.size(0)).isEqualTo(1);
57+
}
58+
59+
@Test
60+
void shouldWorkOnMoreComplexTree(SoftAssertions assertions) {
61+
var edges = HugeObjectArray.of(
62+
new Edge(2, 0, 0.24862277),
63+
new Edge(4, 0, 0.24862277),
64+
new Edge(3, 8, 0.28702033),
65+
new Edge(10, 8, 0.28702033),
66+
new Edge(5, 8, 0.31202177),
67+
new Edge(6, 2, 0.412653),
68+
new Edge(1, 3, 0.51812741),
69+
new Edge(9, 2, 0.55225731),
70+
new Edge(7, 6, 0.65362267),
71+
new Edge(5, 7, 1.42823558)
72+
);
73+
74+
var clusterHierarchy = ClusterHierarchy.create(edges.size() + 1, edges);
75+
76+
77+
assertions.assertThat(clusterHierarchy.left(11)).isEqualTo(2);
78+
assertions.assertThat(clusterHierarchy.right(11)).isEqualTo(0);
79+
assertions.assertThat(clusterHierarchy.lambda(11)).isCloseTo(0.24862277, Offset.offset(1e-9));
80+
assertions.assertThat(clusterHierarchy.size(11)).isEqualTo(2);
81+
82+
assertions.assertThat(clusterHierarchy.left(12)).isEqualTo(4);
83+
assertions.assertThat(clusterHierarchy.right(12)).isEqualTo(11);
84+
assertions.assertThat(clusterHierarchy.lambda(12)).isCloseTo(0.24862277, Offset.offset(1e-9));
85+
assertions.assertThat(clusterHierarchy.size(12)).isEqualTo(3);
86+
87+
assertions.assertThat(clusterHierarchy.left(13)).isEqualTo(3);
88+
assertions.assertThat(clusterHierarchy.right(13)).isEqualTo(8);
89+
assertions.assertThat(clusterHierarchy.lambda(13)).isCloseTo(0.28702033, Offset.offset(1e-9));
90+
assertions.assertThat(clusterHierarchy.size(13)).isEqualTo(2);
91+
92+
assertions.assertThat(clusterHierarchy.left(14)).isEqualTo(10);
93+
assertions.assertThat(clusterHierarchy.right(14)).isEqualTo(13);
94+
assertions.assertThat(clusterHierarchy.lambda(14)).isCloseTo(0.28702033, Offset.offset(1e-9));
95+
assertions.assertThat(clusterHierarchy.size(14)).isEqualTo(3);
96+
97+
assertions.assertThat(clusterHierarchy.left(15)).isEqualTo(5);
98+
assertions.assertThat(clusterHierarchy.right(15)).isEqualTo(14);
99+
assertions.assertThat(clusterHierarchy.lambda(15)).isCloseTo(0.31202177, Offset.offset(1e-9));
100+
assertions.assertThat(clusterHierarchy.size(15)).isEqualTo(4);
101+
102+
assertions.assertThat(clusterHierarchy.left(16)).isEqualTo(6);
103+
assertions.assertThat(clusterHierarchy.right(16)).isEqualTo(12);
104+
assertions.assertThat(clusterHierarchy.lambda(16)).isCloseTo(0.412653, Offset.offset(1e-9));
105+
assertions.assertThat(clusterHierarchy.size(16)).isEqualTo(4);
106+
107+
assertions.assertThat(clusterHierarchy.left(17)).isEqualTo(1);
108+
assertions.assertThat(clusterHierarchy.right(17)).isEqualTo(15);
109+
assertions.assertThat(clusterHierarchy.lambda(17)).isCloseTo(0.51812741, Offset.offset(1e-9));
110+
assertions.assertThat(clusterHierarchy.size(17)).isEqualTo(5);
111+
112+
assertions.assertThat(clusterHierarchy.left(18)).isEqualTo(9);
113+
assertions.assertThat(clusterHierarchy.right(18)).isEqualTo(16);
114+
assertions.assertThat(clusterHierarchy.lambda(18)).isCloseTo(0.55225731, Offset.offset(1e-9));
115+
assertions.assertThat(clusterHierarchy.size(18)).isEqualTo(5);
116+
117+
assertions.assertThat(clusterHierarchy.left(19)).isEqualTo(7);
118+
assertions.assertThat(clusterHierarchy.right(19)).isEqualTo(18);
119+
assertions.assertThat(clusterHierarchy.lambda(19)).isCloseTo(0.65362267, Offset.offset(1e-9));
120+
assertions.assertThat(clusterHierarchy.size(19)).isEqualTo(6);
121+
122+
assertions.assertThat(clusterHierarchy.left(20)).isEqualTo(17);
123+
assertions.assertThat(clusterHierarchy.right(20)).isEqualTo(19);
124+
assertions.assertThat(clusterHierarchy.lambda(20)).isCloseTo(1.42823558, Offset.offset(1e-9));
125+
assertions.assertThat(clusterHierarchy.size(20)).isEqualTo(11);
126+
}
127+
}
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
/*
2+
* Copyright (c) "Neo4j"
3+
* Neo4j Sweden AB [http://neo4j.com]
4+
*
5+
* This file is part of Neo4j.
6+
*
7+
* Neo4j is free software: you can redistribute it and/or modify
8+
* it under the terms of the GNU General Public License as published by
9+
* the Free Software Foundation, either version 3 of the License, or
10+
* (at your option) any later version.
11+
*
12+
* This program is distributed in the hope that it will be useful,
13+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
14+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15+
* GNU General Public License for more details.
16+
*
17+
* You should have received a copy of the GNU General Public License
18+
* along with this program. If not, see <http://www.gnu.org/licenses/>.
19+
*/
20+
package org.neo4j.gds.hdbscan;
21+
22+
import org.assertj.core.api.SoftAssertions;
23+
import org.assertj.core.api.junit.jupiter.SoftAssertionsExtension;
24+
import org.junit.jupiter.api.Test;
25+
import org.junit.jupiter.api.extension.ExtendWith;
26+
27+
@ExtendWith(SoftAssertionsExtension.class)
28+
class ClusterHierarchyUnionFindTest {
29+
30+
@Test
31+
void union(SoftAssertions assertions) {
32+
var u = new ClusterHierarchyUnionFind(5L);
33+
34+
assertions.assertThat(u.union(3L, 4L)).isEqualTo(5L);
35+
assertions.assertThat(u.union(1L, 2L)).isEqualTo(6L);
36+
assertions.assertThat(u.union(0L, 2L)).isEqualTo(7L);
37+
}
38+
39+
@Test
40+
void find(SoftAssertions assertions) {
41+
var u = new ClusterHierarchyUnionFind(5L);
42+
43+
assertions.assertThat(u.union(3L, 4L)).isEqualTo(5L);
44+
45+
46+
assertions.assertThat(u.find(3L)).isEqualTo(5L);
47+
assertions.assertThat(u.find(4L)).isEqualTo(5L);
48+
49+
assertions.assertThat(u.union(3L, 2L)).isEqualTo(6L);
50+
assertions.assertThat(u.find(3L)).isEqualTo(6L);
51+
assertions.assertThat(u.find(4L)).isEqualTo(5L);
52+
53+
assertions.assertThat(u.find(7L)).isEqualTo(7L);
54+
}
55+
56+
}

0 commit comments

Comments
 (0)