1212# See the License for the specific language governing permissions and
1313# limitations under the License.
1414
15- r"""Tool that prepares input for graph-based Neural Structured Learning.
16-
17- In particular, this tool merges into each labeled training example the features
18- from its out-edge neighbor examples according to a supplied *similarity graph*.
19-
20- USAGE:
21-
22- `python pack_nbrs.py` [*flags*] *labeled.tfr unlabeled.tfr graph.tsv output.tfr*
23-
24- The *labeled.tfr* command-line argument is expected to name a TFRecord file
25- containing labeled `tf.train.Examples`, while the *unlabeled.tfr* command-line
26- argument is expected to name a TFRecord file containing unlabeled examples.
27- The *unlabeled.tfr* argument can be an empty string ('' or "" as the shell
28- command-line argument) if there are no unlabeled examples. Each example read
29- from either of those files is expected to have a feature that contains its ID
30- (represented as a singleton `bytes_list` value); the name of this feature is
31- specified by the value of the `--id_feature_name` flag (default: 'id').
32-
33- The *graph.tsv* command-line argument is expected to name a TSV file that
34- specifies a graph as a set of edges representing similarity relationships
35- between the labeled and unlabeled `Example`s. Each graph edge is identified by a
36- source instance ID, a target instance ID, and an optional edge weight. These
37- edges are specified by TSV lines of the following form:
38-
39- ```
40- source_id<TAB>target_id[<TAB>edge_weight]
41- ```
42-
43- If no `edge_weight` is specified, it defaults to 1.0. If your input graph is
44- not symmetric and you'd like all edges in it to be treated as bi-directional,
45- you can use the `--add_undirected_edges` flag to accomplish that. To build a
46- graph based on the similarity of your instances' dense embeddings, you can use
47- the `build_graph.py` tool included in the Neural Structured Learning
48- package.
49-
50- This program merges into each labeled example the features of that example's
51- out-edge neighbors according to that instance's in-edges in the graph. If a
52- value is specified for the `--max_nbrs` flag, then at most that many neighbors'
53- features are merged into each labeled instance (based on which neighbors have
54- the largest edge weights, with ties broken using instance IDs).
55-
56- Here's how the merging process works. For each labeled example, the features of
57- its `i`'th out-edge neighbor will be prefixed by `NL_nbr_<i>_`, with indexes `i`
58- in the half-open interval `[0, K)`, where K is the minimum of `--max_nbrs` and
59- the number of the labeled example's out-edges in the graph. A feature named
60- `NL_nbr_<i>_weight` will also be merged into the labeled example whose value
61- will be the neighbor's corresponding edge weight. The top neighbors to use in
62- this process are selected by consulting the input graph and selecting the
63- labeled example's out-edge neighbors with the largest edge weight; ties are
64- broken by preferring neighbor IDs with larger lexicographic order. Finally, a
65- feature named `NL_num_nbrs` is set on the result (a singleton `int64_list`)
66- denoting the number of neighbors `K` merged into the labeled example.
67-
68- Finally, the merged examples are written to a TFRecord file named by the
69- *output.tfr* command-line argument.
70-
71- For details about this program's flags, run `python pack_nbrs.py --help`.
15+ r"""Library to prepare input for graph-based Neural Structured Learning.
16+
17+ A Python-based program for preparing graph input also exists on
18+ [GitHub](https://github.com/tensorflow/neural-structured-learning/tree/master/neural_structured_learning/tools/input_maker.py).
7219"""
7320
7421from __future__ import absolute_import
7825import collections
7926import time
8027
81- from absl import app
82- from absl import flags
8328from absl import logging
8429from neural_structured_learning .tools import graph_utils
8530import six
@@ -90,11 +35,11 @@ def _read_tfrecord_examples(filename, id_feature_name):
9035 """Returns a dict containing the Examples read from a TFRecord file.
9136
9237 Args:
93- filename: Name of the TFRecord file to read. Each `tensorflow. Example` in
94- the input is expected to have a feature named `id` that maps to a
95- singleton `bytes_list` value.
38+ filename: Name of the TFRecord file to read. Each `tf.train.Example` in the
39+ input is expected to have a feature, named by `id_feature_name`, that maps
40+ to a singleton `bytes_list` value.
9641 id_feature_name: Name of the singleton `bytes_list` feature in each input
97- `Example` whose value is the Example's ID.
42+ `tf.train.Example` whose value is the Example's ID.
9843
9944 Returns:
10045 A dictionary that maps the ID of each Example to that Example.
@@ -230,56 +175,98 @@ def merge_examples(seed_ex, nbr_wt_ex_list):
230175 logging .info ('Out-degree histogram: %s' , sorted (out_degree_count .items ()))
231176
232177
233- def _main (argv ):
234- """Main function for running the pack_nbrs program."""
235- flag = flags .FLAGS
236- flag .showprefixforinfo = False
178+ def pack_nbrs (labeled_examples_path ,
179+ unlabeled_examples_path ,
180+ graph_path ,
181+ output_training_data_path ,
182+ add_undirected_edges = False ,
183+ max_nbrs = None ,
184+ id_feature_name = 'id' ):
185+ """Prepares input for graph-based Neural Structured Learning and persists it.
186+
187+ In particular, this function merges into each labeled training example the
188+ features from its out-edge neighbor examples according to a supplied
189+ similarity graph, and persists the resulting (augmented) training data.
190+
191+ Each `tf.train.Example` read from the files identified by
192+ `labeled_examples_path` and `unlabeled_examples_path` is expected to have a
193+ feature that contains its ID (represented as a singleton `bytes_list` value);
194+ the name of this feature is specified by the value of `id_feature_name`.
195+
196+ Each edge in the graph specified by `graph_path` is identified by a source
197+ instance ID, a target instance ID, and an optional edge weight. These edges
198+ are specified by TSV lines of the following form:
199+
200+ ```
201+ source_id<TAB>target_id[<TAB>edge_weight]
202+ ```
203+
204+ If no `edge_weight` is specified, it defaults to 1.0. If the input graph is
205+ not symmetric and if `add_undirected_edges` is `True`, then all edges will be
206+ treated as bi-directional. To build a graph based on the similarity of
207+ instances' dense embeddings, see `nsl.tools.build_graph`.
208+
209+ This function merges into each labeled example the features of that example's
210+ out-edge neighbors according to that instance's in-edges in the graph. If a
211+ value is specified for `max_nbrs`, then at most that many neighbors' features
212+ are merged into each labeled instance (based on which neighbors have the
213+ largest edge weights, with ties broken using instance IDs).
214+
215+ Here's how the merging process works. For each labeled example, the features
216+ of its `i`'th out-edge neighbor will be prefixed by `NL_nbr_<i>_`, with
217+ indexes `i` in the half-open interval `[0, K)`, where K is the minimum of
218+ `max_nbrs` and the number of the labeled example's out-edges in the graph. A
219+ feature named `NL_nbr_<i>_weight` will also be merged into the labeled example
220+ whose value will be the neighbor's corresponding edge weight. The top
221+ neighbors to use in this process are selected by consulting the input graph
222+ and selecting the labeled example's out-edge neighbors with the largest edge
223+ weight; ties are broken by preferring neighbor IDs with larger lexicographic
224+ order. Finally, a feature named `NL_num_nbrs` is set on the result (a
225+ singleton `int64_list`) denoting the number of neighbors `K` merged into the
226+ labeled example.
227+
228+ Finally, the merged examples are written to a TFRecord file named by
229+ `output_training_data_path`.
230+
231+ Args:
232+ labeled_examples_path: Names a TFRecord file containing labeled
233+ `tf.train.Example` instances.
234+ unlabeled_examples_path: Names a TFRecord file containing unlabeled
235+ `tf.train.Example` instances. This can be an empty string if there are no
236+ unlabeled examples.
237+ graph_path: Names a TSV file that specifies a graph as a set of edges
238+ representing similarity relationships.
239+ output_training_data_path: Path to a file where the resulting augmented
240+ training data in the form of `tf.train.Example` instances will be
241+ persisted in the TFRecord format.
242+ add_undirected_edges: `Boolean` indicating whether or not to treat edges as
243+ bi-directional.
244+ max_nbrs: The maximum number of neighbors to use to generate the augmented
245+ training data for downstream training.
246+ id_feature_name: The name of the feature in the input labeled and unlabeled
247+ `tf.train.Example` objects representing the ID of examples.
248+ """
237249 start_time = time .time ()
238- # Check that the correct number of arguments have been provided.
239- if len (argv ) != 5 :
240- raise app .UsageError (
241- 'Invalid number of arguments; expected 4, got %d' % (len (argv ) - 1 ))
242250
243251 # Read seed and neighbor TFRecord input files.
244- seed_exs = _read_tfrecord_examples (argv [ 1 ], flag . id_feature_name )
252+ seed_exs = _read_tfrecord_examples (labeled_examples_path , id_feature_name )
245253 # Unlabeled neighbor input instances are optional. If not provided, all
246254 # neighbors used will be labeled instances.
247- nbr_exs = _read_tfrecord_examples (argv [2 ],
248- flag .id_feature_name ) if argv [2 ] else {}
255+ nbr_exs = _read_tfrecord_examples (
256+ unlabeled_examples_path ,
257+ id_feature_name ) if unlabeled_examples_path else {}
249258
250259 # Read the input graph in TSV format, and conditionally symmetrize its edges.
251- graph = graph_utils .read_tsv_graph (argv [3 ])
252- if flag .add_undirected_edges : graph_utils .add_undirected_edges (graph )
260+ graph = graph_utils .read_tsv_graph (graph_path )
261+ if add_undirected_edges :
262+ graph_utils .add_undirected_edges (graph )
253263
254264 # Join the edges with the seed and neighbor Examples, and write out the
255265 # results to the output TFRecord file.
256- output_tfr = argv [4 ]
257- with tf .io .TFRecordWriter (output_tfr ) as writer :
258- for merged_ex in _join_examples (seed_exs , nbr_exs , graph , flag .max_nbrs ):
266+ with tf .io .TFRecordWriter (output_training_data_path ) as writer :
267+ for merged_ex in _join_examples (seed_exs , nbr_exs , graph , max_nbrs ):
259268 writer .write (merged_ex .SerializeToString ())
260- logging .info ('Output written to TFRecord file: %s.' , output_tfr )
269+ logging .info ('Output written to TFRecord file: %s.' ,
270+ output_training_data_path )
261271 logging .info ('Total running time: %.2f minutes.' ,
262272 (time .time () - start_time ) / 60.0 )
263-
264-
265- if __name__ == '__main__' :
266- flags .DEFINE_integer (
267- 'max_nbrs' , None ,
268- 'The maximum number of neighbors to merge into each labeled Example.' )
269- flags .DEFINE_string (
270- 'id_feature_name' , 'id' ,
271- """Name of the singleton bytes_list feature in each input Example
272- whose value is the Example's ID."""
273- )
274- flags .DEFINE_bool (
275- 'add_undirected_edges' , False ,
276- """By default, the set of neighbors of a node S are
277- only those nodes T such that there is an edge S-->T in the input graph. If
278- this flag is True, all edges of the graph will be made symmetric before
279- determining each node's neighbors (and in the case where edges S-->T and
280- T-->S exist in the input graph with weights w1 and w2, respectively, the
281- weight of the symmetric edge will be max(w1, w2)).""" )
282-
283- # Ensure TF 2.0 behavior even if TF 1.X is installed.
284- tf .compat .v1 .enable_v2_behavior ()
285- app .run (_main )
0 commit comments