Skip to content

Commit 3fcd660

Browse files
Neural-Link Team and tensorflow-copybara
authored and committed
Adds a new --lsh_bits flag to nsl.tools.build_graph().
If --lsh_bits > 0, all the input points are first bucketed into at most 2^n buckets, where 'n' is the --lsh_bits value. Each bucket is then considered independently, vastly reducing the number of pairs of points that need to be compared for similarity. Thus, the running time of the program can be reduced dramatically when the number of input points is large, at the potential expense of missing some graph edges.

This also introduces a new configuration class and a new top-level function:
- nsl.configs.GraphBuilderConfig: graph building configuration object.
- nsl.tools.build_graph_from_config(...): parameterized by GraphBuilderConfig.

PiperOrigin-RevId: 318376805
1 parent 55cfd84 commit 3fcd660

File tree

6 files changed

+263
-80
lines changed

6 files changed

+263
-80
lines changed

neural_structured_learning/configs/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from neural_structured_learning.configs.configs import DEFAULT_DISTANCE_PARAMS
1111
from neural_structured_learning.configs.configs import DistanceConfig
1212
from neural_structured_learning.configs.configs import DistanceType
13+
from neural_structured_learning.configs.configs import GraphBuilderConfig
1314
from neural_structured_learning.configs.configs import GraphNeighborConfig
1415
from neural_structured_learning.configs.configs import GraphRegConfig
1516
from neural_structured_learning.configs.configs import IntegrationConfig
@@ -31,6 +32,7 @@
3132
'DEFAULT_DISTANCE_PARAMS',
3233
'DistanceConfig',
3334
'DistanceType',
35+
'GraphBuilderConfig',
3436
'GraphNeighborConfig',
3537
'GraphRegConfig',
3638
'IntegrationConfig',

neural_structured_learning/configs/configs.py

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,9 @@
1717
from __future__ import division
1818
from __future__ import print_function
1919

20-
import attr
2120
import enum
21+
22+
import attr
2223
import tensorflow as tf
2324

2425

@@ -291,6 +292,36 @@ class VirtualAdvConfig(object):
291292
approx_difference = attr.ib(default=1e-6)
292293

293294

295+
@attr.s
296+
class GraphBuilderConfig(object):
297+
"""Encapsulates configuration parameters for building a graph.
298+
299+
For more information, see `nsl.tools.build_graph_from_config`.
300+
301+
Attributes:
302+
id_feature_name: The name of the feature in the input `tf.train.Example`
303+
objects representing the ID of examples.
304+
embedding_feature_name: The name of the feature in the input
305+
`tf.train.Example` objects representing the embedding of examples.
306+
similarity_threshold: Threshold used to determine which edges to retain in
307+
the resulting graph.
308+
lsh_bits: Determines the maximum number of LSH buckets into which input data
309+
points will be bucketed by the graph builder. See the
310+
`nsl.tools.build_graph_from_config` documentation for details. This
311+
defaults to 0, in which case all pairs of inputs will be compared,
312+
probably resulting in slow running times on larger input sets.
313+
random_seed: Value used to seed the random number generator used to perform
314+
randomized LSH bucketing of the inputs when `lsh_bits > 0`. By default,
315+
the generator will be initialized randomly, but setting this to any
316+
integer will initialize it deterministically.
317+
"""
318+
id_feature_name = attr.ib(default='id')
319+
embedding_feature_name = attr.ib(default='embedding')
320+
similarity_threshold = attr.ib(default=0.8)
321+
lsh_bits = attr.ib(default=0)
322+
random_seed = attr.ib(default=None)
323+
324+
294325
@attr.s
295326
class GraphNeighborConfig(object):
296327
"""Specifies neighbor attributes for graph regularization.

neural_structured_learning/tools/BUILD

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ py_library(
6363
# package absl:app
6464
# package absl/flags
6565
# package absl/logging
66+
"//neural_structured_learning/configs",
6667
# package numpy
6768
# package six
6869
# package tensorflow

neural_structured_learning/tools/__init__.py

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,28 @@
11
"""Tools and APIs for preparing data for Neural Structured Learning.
22
33
In addition to the functions exported here, two of the modules can be invoked
4-
from the command-line as follows:
4+
from the command-line.
5+
6+
Sample usage for running the graph builder:
7+
8+
`python -m neural_structured_learning.tools.build_graph` [*flags*]
9+
*embedding_file.tfr... output_graph.tsv*
10+
11+
Sample usage for preparing input for graph-based NSL:
12+
13+
`python -m neural_structured_learning.tools.pack_nbrs` [*flags*]
14+
*labeled.tfr unlabeled.tfr graph.tsv output.tfr*
15+
16+
For details about these programs' flags, run these commands:
517
618
```sh
7-
$ python -m neural_structured_learning.tools.build_graph ...
8-
$ python -m neural_structured_learning.tools.pack_nbrs ...
19+
$ python -m neural_structured_learning.tools.build_graph --help
20+
$ python -m neural_structured_learning.tools.pack_nbrs --help
921
```
10-
11-
For details on the command-line usage for these programs, see the
12-
`nsl.tools.build_graph` and `nsl.tools.pack_nbrs` documentation.
1322
"""
1423

1524
from neural_structured_learning.tools.build_graph import build_graph
25+
from neural_structured_learning.tools.build_graph import build_graph_from_config
1626
from neural_structured_learning.tools.graph_utils import add_edge
1727
from neural_structured_learning.tools.graph_utils import add_undirected_edges
1828
from neural_structured_learning.tools.graph_utils import read_tsv_graph

0 commit comments

Comments
 (0)