Skip to content

Commit f34ea03

Browse files
arjung authored and tensorflow-copybara committed
Add a library for preparing graph input with unit tests.
This is in line with the rest of the modules in NSL (and in TF), and also allows us to write unit tests. PiperOrigin-RevId: 272758407
1 parent 74745fc commit f34ea03

File tree

5 files changed

+500
-112
lines changed

5 files changed

+500
-112
lines changed

neural_structured_learning/tools/BUILD

Lines changed: 24 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ py_library(
3131
deps = [
3232
":graph_builder",
3333
":graph_utils",
34-
":pack_nbrs_lib",
34+
":input_maker_lib",
3535
],
3636
)
3737

@@ -94,24 +94,40 @@ py_binary(
9494
)
9595

9696
py_library(
97-
name = "pack_nbrs_lib",
98-
srcs = ["pack_nbrs.py"],
97+
name = "input_maker_lib",
98+
srcs = ["input_maker_lib.py"],
9999
srcs_version = "PY2AND3",
100100
deps = [
101101
":graph_utils",
102-
# package absl:app
103-
# package absl/flags
104102
# package absl/logging
105103
# package six
106104
# package tensorflow
107105
],
108106
)
109107

108+
py_test(
109+
name = "input_maker_lib_test",
110+
srcs = ["input_maker_lib_test.py"],
111+
srcs_version = "PY2AND3",
112+
deps = [
113+
":graph_utils",
114+
":input_maker_lib",
115+
# package protobuf
116+
# package absl/testing:absltest
117+
# package tensorflow
118+
],
119+
)
120+
110121
py_binary(
111-
name = "pack_nbrs",
112-
srcs = ["pack_nbrs.py"],
122+
name = "input_maker",
123+
srcs = ["input_maker.py"],
113124
python_version = "PY3",
114-
deps = [":pack_nbrs_lib"],
125+
deps = [
126+
":input_maker_lib",
127+
# package absl:app
128+
# package absl/flags
129+
# package tensorflow
130+
],
115131
)
116132

117133
py_binary(

neural_structured_learning/tools/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,4 @@
55
from neural_structured_learning.tools.graph_utils import add_undirected_edges
66
from neural_structured_learning.tools.graph_utils import read_tsv_graph
77
from neural_structured_learning.tools.graph_utils import write_tsv_graph
8-
import neural_structured_learning.tools.pack_nbrs
8+
from neural_structured_learning.tools.input_maker_lib import pack_nbrs
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
# Copyright 2019 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
r"""Tool that prepares input for graph-based Neural Structured Learning.
15+
16+
This is a wrapper around the `nsl.tools.pack_nbrs` API. See its documentation
17+
for more details.
18+
19+
USAGE:
20+
21+
`python input_maker.py` [*flags*] *labeled.tfr unlabeled.tfr graph.tsv
22+
output.tfr*
23+
24+
For details about this program's flags, run `python input_maker.py --help`.
25+
"""
26+
27+
from __future__ import absolute_import
28+
from __future__ import division
29+
from __future__ import print_function
30+
31+
from absl import app
32+
from absl import flags
33+
from neural_structured_learning.tools import input_maker_lib
34+
import tensorflow as tf
35+
36+
37+
def _main(argv):
38+
"""Main function for running the input_maker program."""
39+
flag = flags.FLAGS
40+
flag.showprefixforinfo = False
41+
# Check that the correct number of arguments have been provided.
42+
if len(argv) != 5:
43+
raise app.UsageError('Invalid number of arguments; expected 4, got %d' %
44+
(len(argv) - 1))
45+
46+
input_maker_lib.pack_nbrs(argv[1], argv[2], argv[3], argv[4],
47+
flag.add_undirected_edges, flag.max_nbrs,
48+
flag.id_feature_name)
49+
50+
51+
if __name__ == '__main__':
52+
flags.DEFINE_integer(
53+
'max_nbrs', None,
54+
'The maximum number of neighbors to merge into each labeled Example.')
55+
flags.DEFINE_string(
56+
'id_feature_name', 'id',
57+
"""Name of the singleton bytes_list feature in each input Example
58+
whose value is the Example's ID.""")
59+
flags.DEFINE_bool(
60+
'add_undirected_edges', False,
61+
"""By default, the set of neighbors of a node S are
62+
only those nodes T such that there is an edge S-->T in the input graph. If
63+
this flag is True, all edges of the graph will be made symmetric before
64+
determining each node's neighbors (and in the case where edges S-->T and
65+
T-->S exist in the input graph with weights w1 and w2, respectively, the
66+
weight of the symmetric edge will be max(w1, w2)).""")
67+
68+
# Ensure TF 2.0 behavior even if TF 1.X is installed.
69+
tf.compat.v1.enable_v2_behavior()
70+
app.run(_main)

neural_structured_learning/tools/pack_nbrs.py renamed to neural_structured_learning/tools/input_maker_lib.py

Lines changed: 90 additions & 103 deletions
Original file line numberDiff line numberDiff line change
@@ -12,63 +12,10 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15-
r"""Tool that prepares input for graph-based Neural Structured Learning.
16-
17-
In particular, this tool merges into each labeled training example the features
18-
from its out-edge neighbor examples according to a supplied *similarity graph*.
19-
20-
USAGE:
21-
22-
`python pack_nbrs.py` [*flags*] *labeled.tfr unlabeled.tfr graph.tsv output.tfr*
23-
24-
The *labeled.tfr* command-line argument is expected to name a TFRecord file
25-
containing labeled `tf.train.Examples`, while the *unlabeled.tfr* command-line
26-
argument is expected to name a TFRecord file containing unlabeled examples.
27-
The *unlabeled.tfr* argument can be an empty string ('' or "" as the shell
28-
command-line argument) if there are no unlabeled examples. Each example read
29-
from either of those files is expected to have a feature that contains its ID
30-
(represented as a singleton `bytes_list` value); the name of this feature is
31-
specified by the value of the `--id_feature_name` flag (default: 'id').
32-
33-
The *graph.tsv* command-line argument is expected to name a TSV file that
34-
specifies a graph as a set of edges representing similarity relationships
35-
between the labeled and unlabeled `Example`s. Each graph edge is identified by a
36-
source instance ID, a target instance ID, and an optional edge weight. These
37-
edges are specified by TSV lines of the following form:
38-
39-
```
40-
source_id<TAB>target_id[<TAB>edge_weight]
41-
```
42-
43-
If no `edge_weight` is specified, it defaults to 1.0. If your input graph is
44-
not symmetric and you'd like all edges in it to be treated as bi-directional,
45-
you can use the `--add_undirected_edges` flag to accomplish that. To build a
46-
graph based on the similarity of your instances' dense embeddings, you can use
47-
the `build_graph.py` tool included in the Neural Structured Learning
48-
package.
49-
50-
This program merges into each labeled example the features of that example's
51-
out-edge neighbors according to that instance's in-edges in the graph. If a
52-
value is specified for the `--max_nbrs` flag, then at most that many neighbors'
53-
features are merged into each labeled instance (based on which neighbors have
54-
the largest edge weights, with ties broken using instance IDs).
55-
56-
Here's how the merging process works. For each labeled example, the features of
57-
its `i`'th out-edge neighbor will be prefixed by `NL_nbr_<i>_`, with indexes `i`
58-
in the half-open interval `[0, K)`, where K is the minimum of `--max_nbrs` and
59-
the number of the labeled example's out-edges in the graph. A feature named
60-
`NL_nbr_<i>_weight` will also be merged into the labeled example whose value
61-
will be the neighbor's corresponding edge weight. The top neighbors to use in
62-
this process are selected by consulting the input graph and selecting the
63-
labeled example's out-edge neighbors with the largest edge weight; ties are
64-
broken by preferring neighbor IDs with larger lexicographic order. Finally, a
65-
feature named `NL_num_nbrs` is set on the result (a singleton `int64_list`)
66-
denoting the number of neighbors `K` merged into the labeled example.
67-
68-
Finally, the merged examples are written to a TFRecord file named by the
69-
*output.tfr* command-line argument.
70-
71-
For details about this program's flags, run `python pack_nbrs.py --help`.
15+
r"""Library to prepare input for graph-based Neural Structured Learning.
16+
17+
A Python-based program for preparing graph input also exists on
18+
[GitHub](https://github.com/tensorflow/neural-structured-learning/tree/master/neural_structured_learning/tools/input_maker.py).
7219
"""
7320

7421
from __future__ import absolute_import
@@ -78,8 +25,6 @@
7825
import collections
7926
import time
8027

81-
from absl import app
82-
from absl import flags
8328
from absl import logging
8429
from neural_structured_learning.tools import graph_utils
8530
import six
@@ -90,11 +35,11 @@ def _read_tfrecord_examples(filename, id_feature_name):
9035
"""Returns a dict containing the Examples read from a TFRecord file.
9136
9237
Args:
93-
filename: Name of the TFRecord file to read. Each `tensorflow.Example` in
94-
the input is expected to have a feature named `id` that maps to a
95-
singleton `bytes_list` value.
38+
filename: Name of the TFRecord file to read. Each `tf.train.Example` in the
39+
input is expected to have a feature named by `id_feature_name` that maps to a singleton
40+
`bytes_list` value.
9641
id_feature_name: Name of the singleton `bytes_list` feature in each input
97-
`Example` whose value is the Example's ID.
42+
`tf.train.Example` whose value is the Example's ID.
9843
9944
Returns:
10045
A dictionary that maps the ID of each Example to that Example.
@@ -230,56 +175,98 @@ def merge_examples(seed_ex, nbr_wt_ex_list):
230175
logging.info('Out-degree histogram: %s', sorted(out_degree_count.items()))
231176

232177

233-
def _main(argv):
234-
"""Main function for running the pack_nbrs program."""
235-
flag = flags.FLAGS
236-
flag.showprefixforinfo = False
178+
def pack_nbrs(labeled_examples_path,
179+
unlabeled_examples_path,
180+
graph_path,
181+
output_training_data_path,
182+
add_undirected_edges=False,
183+
max_nbrs=None,
184+
id_feature_name='id'):
185+
"""Prepares input for graph-based Neural Structured Learning and persists it.
186+
187+
In particular, this function merges into each labeled training example the
188+
features from its out-edge neighbor examples according to a supplied
189+
similarity graph, and persists the resulting (augmented) training data.
190+
191+
Each `tf.train.Example` read from the files identified by
192+
`labeled_examples_path` and `unlabeled_examples_path` is expected to have a
193+
feature that contains its ID (represented as a singleton `bytes_list` value);
194+
the name of this feature is specified by the value of `id_feature_name`.
195+
196+
Each edge in the graph specified by `graph_path` is identified by a source
197+
instance ID, a target instance ID, and an optional edge weight. These edges
198+
are specified by TSV lines of the following form:
199+
200+
```
201+
source_id<TAB>target_id[<TAB>edge_weight]
202+
```
203+
204+
If no `edge_weight` is specified, it defaults to 1.0. If the input graph is
205+
not symmetric and if `add_undirected_edges` is `True`, then all edges will be
206+
treated as bi-directional. To build a graph based on the similarity of
207+
instances' dense embeddings, see `nsl.tools.build_graph`.
208+
209+
This function merges into each labeled example the features of that example's
210+
out-edge neighbors according to that instance's out-edges in the graph. If a
211+
value is specified for `max_nbrs`, then at most that many neighbors' features
212+
are merged into each labeled instance (based on which neighbors have the
213+
largest edge weights, with ties broken using instance IDs).
214+
215+
Here's how the merging process works. For each labeled example, the features
216+
of its `i`'th out-edge neighbor will be prefixed by `NL_nbr_<i>_`, with
217+
indexes `i` in the half-open interval `[0, K)`, where K is the minimum of
218+
`max_nbrs` and the number of the labeled example's out-edges in the graph. A
219+
feature named `NL_nbr_<i>_weight` will also be merged into the labeled example
220+
whose value will be the neighbor's corresponding edge weight. The top
221+
neighbors to use in this process are selected by consulting the input graph
222+
and selecting the labeled example's out-edge neighbors with the largest edge
223+
weight; ties are broken by preferring neighbor IDs with larger lexicographic
224+
order. Finally, a feature named `NL_num_nbrs` is set on the result (a
225+
singleton `int64_list`) denoting the number of neighbors `K` merged into the
226+
labeled example.
227+
228+
Finally, the merged examples are written to a TFRecord file named by
229+
`output_training_data_path`.
230+
231+
Args:
232+
labeled_examples_path: Names a TFRecord file containing labeled
233+
`tf.train.Example` instances.
234+
unlabeled_examples_path: Names a TFRecord file containing unlabeled
235+
`tf.train.Example` instances. This can be an empty string if there are no
236+
unlabeled examples.
237+
graph_path: Names a TSV file that specifies a graph as a set of edges
238+
representing similarity relationships.
239+
output_training_data_path: Path to a file where the resulting augmented
240+
training data in the form of `tf.train.Example` instances will be
241+
persisted in the TFRecord format.
242+
add_undirected_edges: `Boolean` indicating whether or not to treat edges as
243+
bi-directional.
244+
max_nbrs: The maximum number of neighbors to use to generate the augmented
245+
training data for downstream training.
246+
id_feature_name: The name of the feature in the input labeled and unlabeled
247+
`tf.train.Example` objects representing the ID of examples.
248+
"""
237249
start_time = time.time()
238-
# Check that the correct number of arguments have been provided.
239-
if len(argv) != 5:
240-
raise app.UsageError(
241-
'Invalid number of arguments; expected 4, got %d' % (len(argv) - 1))
242250

243251
# Read seed and neighbor TFRecord input files.
244-
seed_exs = _read_tfrecord_examples(argv[1], flag.id_feature_name)
252+
seed_exs = _read_tfrecord_examples(labeled_examples_path, id_feature_name)
245253
# Unlabeled neighbor input instances are optional. If not provided, all
246254
# neighbors used will be labeled instances.
247-
nbr_exs = _read_tfrecord_examples(argv[2],
248-
flag.id_feature_name) if argv[2] else {}
255+
nbr_exs = _read_tfrecord_examples(
256+
unlabeled_examples_path,
257+
id_feature_name) if unlabeled_examples_path else {}
249258

250259
# Read the input graph in TSV format, and conditionally make it symmetric.
251-
graph = graph_utils.read_tsv_graph(argv[3])
252-
if flag.add_undirected_edges: graph_utils.add_undirected_edges(graph)
260+
graph = graph_utils.read_tsv_graph(graph_path)
261+
if add_undirected_edges:
262+
graph_utils.add_undirected_edges(graph)
253263

254264
# Join the edges with the seed and neighbor Examples, and write out the
255265
# results to the output TFRecord file.
256-
output_tfr = argv[4]
257-
with tf.io.TFRecordWriter(output_tfr) as writer:
258-
for merged_ex in _join_examples(seed_exs, nbr_exs, graph, flag.max_nbrs):
266+
with tf.io.TFRecordWriter(output_training_data_path) as writer:
267+
for merged_ex in _join_examples(seed_exs, nbr_exs, graph, max_nbrs):
259268
writer.write(merged_ex.SerializeToString())
260-
logging.info('Output written to TFRecord file: %s.', output_tfr)
269+
logging.info('Output written to TFRecord file: %s.',
270+
output_training_data_path)
261271
logging.info('Total running time: %.2f minutes.',
262272
(time.time() - start_time) / 60.0)
263-
264-
265-
if __name__ == '__main__':
266-
flags.DEFINE_integer(
267-
'max_nbrs', None,
268-
'The maximum number of neighbors to merge into each labeled Example.')
269-
flags.DEFINE_string(
270-
'id_feature_name', 'id',
271-
"""Name of the singleton bytes_list feature in each input Example
272-
whose value is the Example's ID."""
273-
)
274-
flags.DEFINE_bool(
275-
'add_undirected_edges', False,
276-
"""By default, the set of neighbors of a node S are
277-
only those nodes T such that there is an edge S-->T in the input graph. If
278-
this flag is True, all edges of the graph will be made symmetric before
279-
determining each node's neighbors (and in the case where edges S-->T and
280-
T-->S exist in the input graph with weights w1 and w2, respectively, the
281-
weight of the symmetric edge will be max(w1, w2)).""")
282-
283-
# Ensure TF 2.0 behavior even if TF 1.X is installed.
284-
tf.compat.v1.enable_v2_behavior()
285-
app.run(_main)

0 commit comments

Comments
 (0)