|
| 1 | +/* |
| 2 | + * Copyright (c) "Neo4j" |
| 3 | + * Neo4j Sweden AB [http://neo4j.com] |
| 4 | + * |
| 5 | + * This file is part of Neo4j. |
| 6 | + * |
| 7 | + * Neo4j is free software: you can redistribute it and/or modify |
| 8 | + * it under the terms of the GNU General Public License as published by |
| 9 | + * the Free Software Foundation, either version 3 of the License, or |
| 10 | + * (at your option) any later version. |
| 11 | + * |
| 12 | + * This program is distributed in the hope that it will be useful, |
| 13 | + * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 14 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 15 | + * GNU General Public License for more details. |
| 16 | + * |
| 17 | + * You should have received a copy of the GNU General Public License |
| 18 | + * along with this program. If not, see <http://www.gnu.org/licenses/>. |
| 19 | + */ |
| 20 | +package org.neo4j.gds.hdbscan; |
| 21 | + |
| 22 | +import org.neo4j.gds.collections.ha.HugeDoubleArray; |
| 23 | +import org.neo4j.gds.collections.ha.HugeLongArray; |
| 24 | +import org.neo4j.gds.core.utils.paged.HugeAtomicBitSet; |
| 25 | +import org.neo4j.gds.core.utils.paged.HugeLongArrayQueue; |
| 26 | + |
| 27 | +class CondenseStep { |
| 28 | + private final long nodeCount; |
| 29 | + |
| 30 | + CondenseStep(long nodeCount) { |
| 31 | + this.nodeCount = nodeCount; |
| 32 | + } |
| 33 | + |
| 34 | + CondensedTree condense(ClusterHierarchy clusterHierarchy, long minClusterSize) { |
| 35 | + |
| 36 | + // Walk through the hierarchy |
| 37 | + // at each split if one of the clusters created by the split has fewer points than the minimum cluster size |
| 38 | + // if it is the case that we have fewer points than the minimum cluster size we declare it to be |
| 39 | + // ‘points falling out of a cluster’ |
| 40 | + // the larger cluster retain the cluster identity of the parent, |
| 41 | + // marking down which points ‘fell out of the cluster’ and at what distance value that happened. |
| 42 | + // |
| 43 | + // If the split is into two clusters each at least as large as the minimum cluster size |
| 44 | + // then we consider that a true cluster split and let that split persist in the tree. |
| 45 | + // |
| 46 | + // After walking through the whole hierarchy and doing this we end up |
| 47 | + // with a much smaller tree with a small number of nodes, |
| 48 | + // each of which has data about how the size of the cluster at that node decreases over varying distance. |
| 49 | + |
| 50 | + var clusterHierarchyRoot = clusterHierarchy.root(); |
| 51 | + var parent = HugeLongArray.newArray(clusterHierarchyRoot + 1); |
| 52 | + var lambda = HugeDoubleArray.newArray(clusterHierarchyRoot + 1); |
| 53 | + |
| 54 | + var relabel = HugeLongArray.newArray(nodeCount); |
| 55 | + var currentCondensedRoot = nodeCount; |
| 56 | + relabel.set(clusterHierarchyRoot - nodeCount, currentCondensedRoot); |
| 57 | + |
| 58 | + var currentCondensedMaxClusterId = nodeCount; |
| 59 | + var bfsQueue = HugeLongArrayQueue.newQueue(nodeCount); |
| 60 | + var visited = HugeAtomicBitSet.create(clusterHierarchyRoot + 1); |
| 61 | + |
| 62 | + for (var i = clusterHierarchyRoot; i >= nodeCount; i--) { |
| 63 | + if (visited.get(i)) { |
| 64 | + continue; |
| 65 | + } |
| 66 | + |
| 67 | + var left = clusterHierarchy.left(i); |
| 68 | + var leftSize = clusterHierarchy.size(left); |
| 69 | + var right = clusterHierarchy.right(i); |
| 70 | + var rightSize = clusterHierarchy.size(right); |
| 71 | + |
| 72 | + var currentReLabel = relabel.get(i - nodeCount); |
| 73 | + var fallingOutLambda = clusterHierarchy.lambda(i); |
| 74 | + if (leftSize < minClusterSize && rightSize < minClusterSize) { // both fall out of cluster |
| 75 | + fallOut(clusterHierarchy, left, parent, currentReLabel, lambda, fallingOutLambda, bfsQueue, visited); |
| 76 | + fallOut(clusterHierarchy, right, parent, currentReLabel, lambda, fallingOutLambda, bfsQueue, visited); |
| 77 | + |
| 78 | + } else if (leftSize < minClusterSize && rightSize >= minClusterSize) { // left falls out, right retains parent cluster id |
| 79 | + fallOut(clusterHierarchy, left, parent, currentReLabel, lambda, fallingOutLambda, bfsQueue, visited); |
| 80 | + relabel.set(right - nodeCount, currentReLabel); |
| 81 | + } else if (leftSize >= minClusterSize && rightSize < minClusterSize) { // left retains parent cluster id, right falls out |
| 82 | + relabel.set(left - nodeCount, currentReLabel); |
| 83 | + |
| 84 | + fallOut(clusterHierarchy, right, parent, currentReLabel, lambda, fallingOutLambda, bfsQueue, visited); |
| 85 | + |
| 86 | + } else { // none fall out, both get new cluster ids |
| 87 | + var leftClusterId = ++currentCondensedMaxClusterId; |
| 88 | + relabel.set(left - nodeCount, leftClusterId); |
| 89 | + parent.set(leftClusterId, currentReLabel); |
| 90 | + lambda.set(leftClusterId, fallingOutLambda); |
| 91 | + |
| 92 | + var rightClusterId = ++currentCondensedMaxClusterId; |
| 93 | + relabel.set(right - nodeCount, rightClusterId); |
| 94 | + parent.set(rightClusterId, currentReLabel); |
| 95 | + lambda.set(rightClusterId, fallingOutLambda); |
| 96 | + } |
| 97 | + } |
| 98 | + |
| 99 | + return new CondensedTree(currentCondensedRoot, parent, lambda, currentCondensedMaxClusterId); |
| 100 | + } |
| 101 | + |
| 102 | + private void fallOut( |
| 103 | + ClusterHierarchy clusterHierarchy, |
| 104 | + long nodeToFallOut, |
| 105 | + HugeLongArray parent, |
| 106 | + long clusterToFallOutFrom, |
| 107 | + HugeDoubleArray lambda, |
| 108 | + double fallingOutLambda, |
| 109 | + HugeLongArrayQueue bfsQueue, |
| 110 | + HugeAtomicBitSet visited |
| 111 | + ) { |
| 112 | + if (nodeToFallOut < nodeCount) { |
| 113 | + parent.set(nodeToFallOut, clusterToFallOutFrom); |
| 114 | + lambda.set(nodeToFallOut, fallingOutLambda); |
| 115 | + } else { |
| 116 | + // for each descendant of nodeToFallOut - find the leaf (original node) and mark it as fallen out |
| 117 | + bfsQueue.add(nodeToFallOut); |
| 118 | + while (!bfsQueue.isEmpty()) { |
| 119 | + var currentNode = bfsQueue.remove(); |
| 120 | + var left = clusterHierarchy.left(currentNode); |
| 121 | + var right = clusterHierarchy.right(currentNode); |
| 122 | + |
| 123 | + if (left < nodeCount) { |
| 124 | + parent.set(left, clusterToFallOutFrom); |
| 125 | + lambda.set(left, fallingOutLambda); |
| 126 | + } else { |
| 127 | + bfsQueue.add(left); |
| 128 | + } |
| 129 | + |
| 130 | + if (right < nodeCount) { |
| 131 | + parent.set(right, clusterToFallOutFrom); |
| 132 | + lambda.set(right, fallingOutLambda); |
| 133 | + } else { |
| 134 | + bfsQueue.add(right); |
| 135 | + } |
| 136 | + visited.set(currentNode); |
| 137 | + } |
| 138 | + } |
| 139 | + |
| 140 | + } |
| 141 | +} |
0 commit comments