@@ -44,25 +44,25 @@ class BinarizeTask implements Runnable {
4444 private final double [][] propertyEmbeddings ;
4545
4646 private final double threshold ;
47- private final BinarizeFeaturesConfig binarizationConfig ;
47+ private final int dimension ;
4848 private final ProgressTracker progressTracker ;
49- private long totalNumFeatures ;
49+ private long totalFeatureCount ;
5050
5151 private double scalarProductSum ;
5252
5353 private double scalarProductSumOfSquares ;
5454
5555 BinarizeTask (
5656 Partition partition ,
57- HashGNNConfig config ,
57+ BinarizeFeaturesConfig config ,
5858 HugeObjectArray <HugeAtomicBitSet > truncatedFeatures ,
5959 List <FeatureExtractor > featureExtractors ,
6060 double [][] propertyEmbeddings ,
6161 ProgressTracker progressTracker
6262 ) {
6363 this .partition = partition ;
64- this .binarizationConfig = config .binarizeFeatures (). orElseThrow ();
65- this .threshold = binarizationConfig .threshold ();
64+ this .dimension = config .dimension ();
65+ this .threshold = config .threshold ();
6666 this .truncatedFeatures = truncatedFeatures ;
6767 this .featureExtractors = featureExtractors ;
6868 this .propertyEmbeddings = propertyEmbeddings ;
@@ -76,7 +76,7 @@ static HugeObjectArray<HugeAtomicBitSet> compute(
7676 SplittableRandom rng ,
7777 ProgressTracker progressTracker ,
7878 TerminationFlag terminationFlag ,
79- MutableLong totalNumFeaturesOutput
79+ MutableLong totalFeatureCountOutput
8080 ) {
8181 progressTracker .beginSubTask ("Binarize node property features" );
8282
@@ -88,14 +88,14 @@ static HugeObjectArray<HugeAtomicBitSet> compute(
8888 );
8989
9090 var inputDimension = FeatureExtraction .featureCount (featureExtractors );
91- var propertyEmbeddings = embedProperties (config , rng , inputDimension );
91+ var propertyEmbeddings = embedProperties (binarizationConfig . dimension () , rng , inputDimension );
9292
9393 var truncatedFeatures = HugeObjectArray .newArray (HugeAtomicBitSet .class , graph .nodeCount ());
9494
9595 var tasks = partition .stream ()
9696 .map (p -> new BinarizeTask (
9797 p ,
98- config ,
98+ binarizationConfig ,
9999 truncatedFeatures ,
100100 featureExtractors ,
101101 propertyEmbeddings ,
@@ -108,7 +108,7 @@ static HugeObjectArray<HugeAtomicBitSet> compute(
108108 .terminationFlag (terminationFlag )
109109 .run ();
110110
111- totalNumFeaturesOutput .add (tasks .stream ().mapToLong (BinarizeTask ::totalNumFeatures ).sum ());
111+ totalFeatureCountOutput .add (tasks .stream ().mapToLong (BinarizeTask ::totalFeatureCount ).sum ());
112112
113113 var squaredSum = tasks .stream ().mapToDouble (BinarizeTask ::scalarProductSumOfSquares ).sum ();
114114 var sum = tasks .stream ().mapToDouble (BinarizeTask ::scalarProductSum ).sum ();
@@ -118,38 +118,47 @@ static HugeObjectArray<HugeAtomicBitSet> compute(
118118 var variance = (squaredSum - exampleCount * avg * avg ) / exampleCount ;
119119 var std = Math .sqrt (variance );
120120
121- progressTracker .logInfo (formatWithLocale ("Hyperplane scalar products have mean %.4f and standard deviation %.4f. A threshold for binarization may be set to the average plus a few standard deviations." , avg , std ));
121+ progressTracker .logInfo (formatWithLocale (
122+ "Hyperplane scalar products have mean %.4f and standard deviation %.4f. A threshold for binarization may be set to the mean plus a few standard deviations." ,
123+ avg ,
124+ std
125+ ));
122126
123127 progressTracker .endSubTask ("Binarize node property features" );
124128
125129 return truncatedFeatures ;
126130 }
131+
127132 // creates a random projection vector for each feature
128133 // (input features vector for each node is the concatenation of the node's properties)
129134 // this array is used embed the properties themselves from inputDimension to embeddingDimension dimensions.
130- public static double [][] embedProperties (HashGNNConfig config , SplittableRandom rng , int inputDimension ) {
131- var binarizationConfig = config .binarizeFeatures ().orElseThrow ();
135+ public static double [][] embedProperties (int vectorDimension , SplittableRandom rng , int inputDimension ) {
132136 var propertyEmbeddings = new double [inputDimension ][];
133137
134138 for (int inputFeature = 0 ; inputFeature < inputDimension ; inputFeature ++) {
135- propertyEmbeddings [inputFeature ] = new double [binarizationConfig .dimension ()];
136- for (int feature = 0 ; feature < binarizationConfig .dimension (); feature ++) {
137- // Box-muller transformation to generate gaussian
138- double matrixValue = Math .sqrt (-2 *Math .log (rng .nextDouble (0.0 , 1.0 ))) * Math .cos (2 *Math .PI * rng .nextDouble (0.0 , 1.0 ));
139- propertyEmbeddings [inputFeature ][feature ] = matrixValue ;
139+ propertyEmbeddings [inputFeature ] = new double [vectorDimension ];
140+ for (int feature = 0 ; feature < vectorDimension ; feature ++) {
141+ propertyEmbeddings [inputFeature ][feature ] = boxMullerGaussianRandom (rng );
140142 }
141143 }
142144 return propertyEmbeddings ;
143145 }
144146
147+ private static double boxMullerGaussianRandom (SplittableRandom rng ) {
148+ return Math .sqrt (-2 * Math .log (rng .nextDouble (
149+ 0.0 ,
150+ 1.0
151+ ))) * Math .cos (2 * Math .PI * rng .nextDouble (0.0 , 1.0 ));
152+ }
153+
145154 @ Override
146155 public void run () {
147156 partition .consume (nodeId -> {
148- var featureVector = new float [binarizationConfig . dimension () ];
157+ var featureVector = new float [dimension ];
149158 FeatureExtraction .extract (nodeId , -1 , featureExtractors , new FeatureConsumer () {
150159 @ Override
151160 public void acceptScalar (long nodeOffset , int offset , double value ) {
152- for (int feature = 0 ; feature < binarizationConfig . dimension () ; feature ++) {
161+ for (int feature = 0 ; feature < dimension ; feature ++) {
153162 double featureValue = propertyEmbeddings [offset ][feature ];
154163 featureVector [feature ] += value * featureValue ;
155164 }
@@ -159,7 +168,7 @@ public void acceptScalar(long nodeOffset, int offset, double value) {
159168 public void acceptArray (long nodeOffset , int offset , double [] values ) {
160169 for (int inputFeatureOffset = 0 ; inputFeatureOffset < values .length ; inputFeatureOffset ++) {
161170 double value = values [inputFeatureOffset ];
162- for (int feature = 0 ; feature < binarizationConfig . dimension () ; feature ++) {
171+ for (int feature = 0 ; feature < dimension ; feature ++) {
163172 double featureValue = propertyEmbeddings [offset + inputFeatureOffset ][feature ];
164173 featureVector [feature ] += value * featureValue ;
165174 }
@@ -168,7 +177,7 @@ public void acceptArray(long nodeOffset, int offset, double[] values) {
168177 });
169178
170179 var featureSet = round (featureVector );
171- totalNumFeatures += featureSet .cardinality ();
180+ totalFeatureCount += featureSet .cardinality ();
172181 truncatedFeatures .set (nodeId , featureSet );
173182 });
174183
@@ -188,13 +197,14 @@ private HugeAtomicBitSet round(float[] floatVector) {
188197 return bitset ;
189198 }
190199
191- public long totalNumFeatures () {
192- return totalNumFeatures ;
200+ public long totalFeatureCount () {
201+ return totalFeatureCount ;
193202 }
194203
195204 public double scalarProductSum () {
196205 return scalarProductSum ;
197206 }
207+
198208 public double scalarProductSumOfSquares () {
199209 return scalarProductSumOfSquares ;
200210 }
0 commit comments