Add a library for preparing graph input with unit tests.

arjung · tensorflow-copybara · commit f34ea03a7ec6 · 2019-10-03T15:46:38.000-07:00
This is in line with the rest of the modules in NSL (and in TF), and also allows us to write unit tests.

PiperOrigin-RevId: 272758407
diff --git a/neural_structured_learning/tools/BUILD b/neural_structured_learning/tools/BUILD
@@ -31,7 +31,7 @@ py_library(
     deps = [
         ":graph_builder",
         ":graph_utils",
-        ":pack_nbrs_lib",
+        ":input_maker_lib",
     ],
 )
 
@@ -94,24 +94,40 @@ py_binary(
 )
 
 py_library(
-    name = "pack_nbrs_lib",
-    srcs = ["pack_nbrs.py"],
+    name = "input_maker_lib",
+    srcs = ["input_maker_lib.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":graph_utils",
-        # package absl:app
-        # package absl/flags
         # package absl/logging
         # package six
         # package tensorflow
     ],
 )
 
+py_test(
+    name = "input_maker_lib_test",
+    srcs = ["input_maker_lib_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":graph_utils",
+        ":input_maker_lib",
+        # package protobuf,
+        # package absl/testing:absltest
+        # package tensorflow
+    ],
+)
+
 py_binary(
-    name = "pack_nbrs",
-    srcs = ["pack_nbrs.py"],
+    name = "input_maker",
+    srcs = ["input_maker.py"],
     python_version = "PY3",
-    deps = [":pack_nbrs_lib"],
+    deps = [
+        ":input_maker_lib",
+        # package absl:app
+        # package absl/flags
+        # package tensorflow
+    ],
 )
 
 py_binary(
diff --git a/neural_structured_learning/tools/__init__.py b/neural_structured_learning/tools/__init__.py
@@ -5,4 +5,4 @@
 from neural_structured_learning.tools.graph_utils import add_undirected_edges
 from neural_structured_learning.tools.graph_utils import read_tsv_graph
 from neural_structured_learning.tools.graph_utils import write_tsv_graph
-import neural_structured_learning.tools.pack_nbrs
+from neural_structured_learning.tools.input_maker_lib import pack_nbrs
diff --git a/neural_structured_learning/tools/input_maker.py b/neural_structured_learning/tools/input_maker.py
@@ -0,0 +1,70 @@
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+r"""Tool that prepares input for graph-based Neural Structured Learning.
+
+This is a wrapper around the `nsl.tools.pack_nbrs` API. See its documentation
+for more details.
+
+USAGE:
+
+`python input_maker.py` [*flags*] *labeled.tfr unlabeled.tfr graph.tsv
+output.tfr*
+
+For details about this program's flags, run `python input_maker.py --help`.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl import app
+from absl import flags
+from neural_structured_learning.tools import input_maker_lib
+import tensorflow as tf
+
+
+def _main(argv):
+  """Main function for running the input_maker program."""
+  flag = flags.FLAGS
+  flag.showprefixforinfo = False
+  # Check that the correct number of arguments have been provided.
+  if len(argv) != 5:
+    raise app.UsageError('Invalid number of arguments; expected 4, got %d' %
+                         (len(argv) - 1))
+
+  input_maker_lib.pack_nbrs(argv[1], argv[2], argv[3], argv[4],
+                            flag.add_undirected_edges, flag.max_nbrs,
+                            flag.id_feature_name)
+
+
+if __name__ == '__main__':
+  flags.DEFINE_integer(
+      'max_nbrs', None,
+      'The maximum number of neighbors to merge into each labeled Example.')
+  flags.DEFINE_string(
+      'id_feature_name', 'id',
+      """Name of the singleton bytes_list feature in each input Example
+      whose value is the Example's ID.""")
+  flags.DEFINE_bool(
+      'add_undirected_edges', False,
+      """By default, the set of neighbors of a node S are
+      only those nodes T such that there is an edge S-->T in the input graph. If
+      this flag is True, all edges of the graph will be made symmetric before
+      determining each node's neighbors (and in the case where edges S-->T and
+      T-->S exist in the input graph with weights w1 and w2, respectively, the
+      weight of the symmetric edge will be max(w1, w2)).""")
+
+  # Ensure TF 2.0 behavior even if TF 1.X is installed.
+  tf.compat.v1.enable_v2_behavior()
+  app.run(_main)
diff --git a/neural_structured_learning/tools/input_maker_lib.py b/neural_structured_learning/tools/input_maker_lib.py
@@ -12,63 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-r"""Tool that prepares input for graph-based Neural Structured Learning.
-
-In particular, this tool merges into each labeled training example the features
-from its out-edge neighbor examples according to a supplied *similarity graph*.
-
-USAGE:
-
-`python pack_nbrs.py` [*flags*] *labeled.tfr unlabeled.tfr graph.tsv output.tfr*
-
-The *labeled.tfr* command-line argument is expected to name a TFRecord file
-containing labeled `tf.train.Examples`, while the *unlabeled.tfr* command-line
-argument is expected to name a TFRecord file containing unlabeled examples.
-The *unlabeled.tfr* argument can be an empty string ('' or "" as the shell
-command-line argument) if there are no unlabeled examples. Each example read
-from either of those files is expected to have a feature that contains its ID
-(represented as a singleton `bytes_list` value); the name of this feature is
-specified by the value of the `--id_feature_name` flag (default: 'id').
-
-The *graph.tsv* command-line argument is expected to name a TSV file that
-specifies a graph as a set of edges representing similarity relationships
-between the labeled and unlabeled `Example`s. Each graph edge is identified by a
-source instance ID, a target instance ID, and an optional edge weight. These
-edges are specified by TSV lines of the following form:
-
-```
-source_id<TAB>target_id[<TAB>edge_weight]
-```
-
-If no `edge_weight` is specified, it defaults to 1.0. If your input graph is
-not symmetric and you'd like all edges in it to be treated as bi-directional,
-you can use the `--add_undirected_edges` flag to accomplish that. To build a
-graph based on the similarity of your instances' dense embeddings, you can use
-the `build_graph.py` tool included in the Neural Structured Learning
-package.
-
-This program merges into each labeled example the features of that example's
-out-edge neighbors according to that instance's in-edges in the graph. If a
-value is specified for the `--max_nbrs` flag, then at most that many neighbors'
-features are merged into each labeled instance (based on which neighbors have
-the largest edge weights, with ties broken using instance IDs).
-
-Here's how the merging process works. For each labeled example, the features of
-its `i`'th out-edge neighbor will be prefixed by `NL_nbr_<i>_`, with indexes `i`
-in the half-open interval `[0, K)`, where K is the minimum of `--max_nbrs` and
-the number of the labeled example's out-edges in the graph. A feature named
-`NL_nbr_<i>_weight` will also be merged into the labeled example whose value
-will be the neighbor's corresponding edge weight. The top neighbors to use in
-this process are selected by consulting the input graph and selecting the
-labeled example's out-edge neighbors with the largest edge weight; ties are
-broken by preferring neighbor IDs with larger lexicographic order. Finally, a
-feature named `NL_num_nbrs` is set on the result (a singleton `int64_list`)
-denoting the number of neighbors `K` merged into the labeled example.
-
-Finally, the merged examples are written to a TFRecord file named by the
-*output.tfr* command-line argument.
-
-For details about this program's flags, run `python pack_nbrs.py --help`.
+r"""Library to prepare input for graph-based Neural Structured Learning.
+
+A python-based program for preparing graph input also exists on
+[GitHub](https://github.com/tensorflow/neural-structured-learning/tree/master/neural_structured_learning/tools/input_maker.py).
 """
 
 from __future__ import absolute_import
@@ -78,8 +25,6 @@
 import collections
 import time
 
-from absl import app
-from absl import flags
 from absl import logging
 from neural_structured_learning.tools import graph_utils
 import six
@@ -90,11 +35,11 @@ def _read_tfrecord_examples(filename, id_feature_name):
   """Returns a dict containing the Examples read from a TFRecord file.
 
   Args:
-    filename: Name of the TFRecord file to read. Each `tensorflow.Example` in
-      the input is expected to have a feature named `id` that maps to a
-      singleton `bytes_list` value.
+    filename: Name of the TFRecord file to read. Each `tf.train.Example` in the
+      input is expected to have a feature named `id` that maps to a singleton
+      `bytes_list` value.
     id_feature_name: Name of the singleton `bytes_list` feature in each input
-      `Example` whose value is the Example's ID.
+      `tf.train.Example` whose value is the Example's ID.
 
   Returns:
     A dictionary that maps the ID of each Example to that Example.
@@ -230,56 +175,98 @@ def merge_examples(seed_ex, nbr_wt_ex_list):
   logging.info('Out-degree histogram: %s', sorted(out_degree_count.items()))
 
 
-def _main(argv):
-  """Main function for running the pack_nbrs program."""
-  flag = flags.FLAGS
-  flag.showprefixforinfo = False
+def pack_nbrs(labeled_examples_path,
+              unlabeled_examples_path,
+              graph_path,
+              output_training_data_path,
+              add_undirected_edges=False,
+              max_nbrs=None,
+              id_feature_name='id'):
+  """Prepares input for graph-based Neural Structured Learning and persists it.
+
+  In particular, this function merges into each labeled training example the
+  features from its out-edge neighbor examples according to a supplied
+  similarity graph, and persists the resulting (augmented) training data.
+
+  Each `tf.train.Example` read from the files identified by
+  `labeled_examples_path` and `unlabeled_examples_path` is expected to have a
+  feature that contains its ID (represented as a singleton `bytes_list` value);
+  the name of this feature is specified by the value of `id_feature_name`.
+
+  Each edge in the graph specified by `graph_path` is identified by a source
+  instance ID, a target instance ID, and an optional edge weight. These edges
+  are specified by TSV lines of the following form:
+
+  ```
+  source_id<TAB>target_id[<TAB>edge_weight]
+  ```
+
+  If no `edge_weight` is specified, it defaults to 1.0. If the input graph is
+  not symmetric and if `add_undirected_edges` is `True`, then all edges will be
+  treated as bi-directional. To build a graph based on the similarity of
+  instances' dense embeddings, see `nsl.tools.build_graph`.
+
+  This function merges into each labeled example the features of that example's
+  out-edge neighbors according to that instance's in-edges in the graph. If a
+  value is specified for `max_nbrs`, then at most that many neighbors' features
+  are merged into each labeled instance (based on which neighbors have the
+  largest edge weights, with ties broken using instance IDs).
+
+  Here's how the merging process works. For each labeled example, the features
+  of its `i`'th out-edge neighbor will be prefixed by `NL_nbr_<i>_`, with
+  indexes `i` in the half-open interval `[0, K)`, where K is the minimum of
+  `max_nbrs` and the number of the labeled example's out-edges in the graph. A
+  feature named `NL_nbr_<i>_weight` will also be merged into the labeled example
+  whose value will be the neighbor's corresponding edge weight. The top
+  neighbors to use in this process are selected by consulting the input graph
+  and selecting the labeled example's out-edge neighbors with the largest edge
+  weight; ties are broken by preferring neighbor IDs with larger lexicographic
+  order. Finally, a feature named `NL_num_nbrs` is set on the result (a
+  singleton `int64_list`) denoting the number of neighbors `K` merged into the
+  labeled example.
+
+  Finally, the merged examples are written to a TFRecord file named by
+  `output_training_data_path`.
+
+  Args:
+    labeled_examples_path: Names a TFRecord file containing labeled
+      `tf.train.Example` instances.
+    unlabeled_examples_path: Names a TFRecord file containing unlabeled
+      `tf.train.Example` instances. This can be an empty string if there are no
+      unlabeled examples.
+    graph_path: Names a TSV file that specifies a graph as a set of edges
+      representing similarity relationships.
+    output_training_data_path: Path to a file where the resulting augmented
+      training data in the form of `tf.train.Example` instances will be
+      persisted in the TFRecord format.
+    add_undirected_edges: `Boolean` indicating whether or not to treat adges as
+      bi-directional.
+    max_nbrs: The maximum number of neighbors to use to generate the augmented
+      training data for downstream training.
+    id_feature_name: The name of the feature in the input labeled and unlabeled
+      `tf.train.Example` objects representing the ID of examples.
+  """
   start_time = time.time()
-  # Check that the correct number of arguments have been provided.
-  if len(argv) != 5:
-    raise app.UsageError(
-        'Invalid number of arguments; expected 4, got %d' % (len(argv) -  1))
 
   # Read seed and neighbor TFRecord input files.
-  seed_exs = _read_tfrecord_examples(argv[1], flag.id_feature_name)
+  seed_exs = _read_tfrecord_examples(labeled_examples_path, id_feature_name)
   # Unlabeled neighbor input instances are optional. If not provided, all
   # neighbors used will be labeled instances.
-  nbr_exs = _read_tfrecord_examples(argv[2],
-                                    flag.id_feature_name) if argv[2] else {}
+  nbr_exs = _read_tfrecord_examples(
+      unlabeled_examples_path,
+      id_feature_name) if unlabeled_examples_path else {}
 
   # Read the input graph in TSV format, and conditionally reverse all its edges.
-  graph = graph_utils.read_tsv_graph(argv[3])
-  if flag.add_undirected_edges: graph_utils.add_undirected_edges(graph)
+  graph = graph_utils.read_tsv_graph(graph_path)
+  if add_undirected_edges:
+    graph_utils.add_undirected_edges(graph)
 
   # Join the edges with the seed and neighbor Examples, and write out the
   # results to the output TFRecord file.
-  output_tfr = argv[4]
-  with tf.io.TFRecordWriter(output_tfr) as writer:
-    for merged_ex in _join_examples(seed_exs, nbr_exs, graph, flag.max_nbrs):
+  with tf.io.TFRecordWriter(output_training_data_path) as writer:
+    for merged_ex in _join_examples(seed_exs, nbr_exs, graph, max_nbrs):
       writer.write(merged_ex.SerializeToString())
-  logging.info('Output written to TFRecord file: %s.', output_tfr)
+  logging.info('Output written to TFRecord file: %s.',
+               output_training_data_path)
   logging.info('Total running time: %.2f minutes.',
                (time.time() - start_time) / 60.0)
-
-
-if __name__ == '__main__':
-  flags.DEFINE_integer(
-      'max_nbrs', None,
-      'The maximum number of neighbors to merge into each labeled Example.')
-  flags.DEFINE_string(
-      'id_feature_name', 'id',
-      """Name of the singleton bytes_list feature in each input Example
-      whose value is the Example's ID."""
-  )
-  flags.DEFINE_bool(
-      'add_undirected_edges', False,
-      """By default, the set of neighbors of a node S are
-      only those nodes T such that there is an edge S-->T in the input graph. If
-      this flag is True, all edges of the graph will be made symmetric before
-      determining each node's neighbors (and in the case where edges S-->T and
-      T-->S exist in the input graph with weights w1 and w2, respectively, the
-      weight of the symmetric edge will be max(w1, w2)).""")
-
-  # Ensure TF 2.0 behavior even if TF 1.X is installed.
-  tf.compat.v1.enable_v2_behavior()
-  app.run(_main)
diff --git a/neural_structured_learning/tools/input_maker_lib_test.py b/neural_structured_learning/tools/input_maker_lib_test.py