@@ -122,26 +122,39 @@ def load_data_realistic_ssl(dataset_name, data_path, label_map_path):
122122 return data
123123
124124
125- def load_from_planetoid_files (dataset_str , path ):
126- """Loads input data from gcn/data directory.
127-
128- This function is copied and adapted from https://github.com/tkipf/gcn/blob/master/gcn/utils.py.
129-
130- ind.dataset_str.x => the feature vectors of the training instances as scipy.sparse.csr.csr_matrix object;
131- ind.dataset_str.tx => the feature vectors of the test instances as scipy.sparse.csr.csr_matrix object;
132- ind.dataset_str.allx => the feature vectors of both labeled and unlabeled training instances
133- (a superset of ind.dataset_str.x) as scipy.sparse.csr.csr_matrix object;
134- ind.dataset_str.y => the one-hot labels of the labeled training instances as numpy.ndarray object;
135- ind.dataset_str.ty => the one-hot labels of the test instances as numpy.ndarray object;
136- ind.dataset_str.ally => the labels for instances in ind.dataset_str.allx as numpy.ndarray object;
137- ind.dataset_str.graph => a dict in the format {index: [index_of_neighbor_nodes]} as collections.defaultdict
138- object;
139- ind.dataset_str.test.index => the indices of test instances in graph, for the inductive setting as list object.
140-
141- All objects above must be saved using python pickle module.
142-
143- :param dataset_str: Dataset name
144- :return: All data input files loaded (as well the training/test data).
125+ def load_from_planetoid_files (dataset_name , path ):
126+ """Loads Planetoid data in GCN format, as released with the GCN code.
127+
128+ This function is adapted from https://github.com/tkipf/gcn.
129+
130+ This function assumes that the following files can be found at the location
131+ specified by `path`:
132+ ind.dataset_str.x => the feature vectors of the training instances
133+ as scipy.sparse.csr.csr_matrix object.
134+ ind.dataset_str.tx => the feature vectors of the test instances as
135+ scipy.sparse.csr.csr_matrix object.
136+ ind.dataset_str.allx => the feature vectors of both labeled and
137+ unlabeled training instances (a superset of
138+ ind.dataset_str.x) as
139+ scipy.sparse.csr.csr_matrix object.
140+ ind.dataset_str.y => the one-hot labels of the labeled training
141+ instances as numpy.ndarray object.
142+ ind.dataset_str.ty => the one-hot labels of the test instances as
143+ numpy.ndarray object.
144+ ind.dataset_str.ally => the labels for instances in
145+ ind.dataset_str.allx as numpy.ndarray object.
146+ ind.dataset_str.graph => a dict in the format
147+ {index: [index_of_neighbor_nodes]} as
148+ collections.defaultdict object.
149+ ind.dataset_str.test.index => the indices of test instances in graph, for
150+ the inductive setting as list object.
151+
152+ Arguments:
153+ dataset_name: A string representing the dataset name (e.g., `cora`).
154+ path: Path to the directory containing the files.
155+
156+ Returns:
157+ All data input files loaded (as well as the training/test data).
145158 """
146159 def _sample_mask (idx , l ):
147160 """Create mask."""
@@ -159,7 +172,7 @@ def _parse_index_file(filename):
159172 names = ['x' , 'y' , 'tx' , 'ty' , 'allx' , 'ally' , 'graph' ]
160173 objects = []
161174 for i in range (len (names )):
162- filename = "ind.{}.{}" .format (dataset_str , names [i ])
175+ filename = "ind.{}.{}" .format (dataset_name , names [i ])
163176 filename = os .path .join (path , filename )
164177 with open (filename , 'rb' ) as f :
165178 if sys .version_info > (3 , 0 ):
@@ -168,15 +181,16 @@ def _parse_index_file(filename):
168181 objects .append (pkl .load (f ))
169182
170183 x , y , tx , ty , allx , ally , graph = tuple (objects )
171- filename = "ind.{}.test.index" .format (dataset_str )
184+ filename = "ind.{}.test.index" .format (dataset_name )
172185 filename = os .path .join (path , filename )
173186 test_idx_reorder = _parse_index_file (filename )
174187 test_idx_range = np .sort (test_idx_reorder )
175188
176- if dataset_str == 'citeseer' :
189+ if dataset_name == 'citeseer' :
177190 # Fix citeseer dataset (there are some isolated nodes in the graph).
178191 # Find isolated nodes, add them as zero-vecs into the right position.
179- test_idx_range_full = range (min (test_idx_reorder ), max (test_idx_reorder )+ 1 )
192+ test_idx_range_full = range (min (test_idx_reorder ),
193+ max (test_idx_reorder )+ 1 )
180194 tx_extended = sp .lil_matrix ((len (test_idx_range_full ), x .shape [1 ]))
181195 tx_extended [test_idx_range - min (test_idx_range ), :] = tx
182196 tx = tx_extended
@@ -187,7 +201,6 @@ def _parse_index_file(filename):
187201 features = sp .vstack ((allx , tx )).tolil ()
188202 features [test_idx_reorder , :] = features [test_idx_range , :]
189203 adj = nx .adjacency_matrix (nx .from_dict_of_lists (graph ))
190- #adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph, create_using=nx.DiGraph))
191204
192205 labels = np .vstack ((ally , ty ))
193206 labels [test_idx_reorder , :] = labels [test_idx_range , :]
@@ -207,19 +220,20 @@ def _parse_index_file(filename):
207220 y_val [val_mask , :] = labels [val_mask , :]
208221 y_test [test_mask , :] = labels [test_mask , :]
209222
210- return adj , features , y_train , y_val , y_test , train_mask , val_mask , test_mask , labels
223+ return (adj , features , y_train , y_val , y_test , train_mask , val_mask ,
224+ test_mask , labels )
211225
212226
213227def load_data_planetoid (name , path , splits_path = None , row_normalize = False ):
214- # Load from file.
215228 if splits_path is None :
216- adj , features , y_train , y_val , y_test , train_mask , val_mask , test_mask , \
217- labels = load_from_planetoid_files (name , path )
229+ # Load from file in Planetoid format.
230+ adj , features , y_train , y_val , y_test , train_mask , val_mask , test_mask ,\
231+ labels = load_from_planetoid_files (name , path )
218232 else :
219- # Otherwise load from splits path.
233+ # Otherwise load from a path where we saved a pickle with random splits.
220234 logging .info ('Loading from splits path: %s' , splits_path )
221- adj , features , y_train , y_val , y_test , train_mask , val_mask , test_mask , \
222- labels = pickle .load (open (splits_path , "rb" ))
235+ adj , features , y_train , y_val , y_test , train_mask , val_mask , test_mask ,\
236+ labels = pickle .load (open (splits_path , "rb" ))
223237
224238 return PlanetoidDataset (name , adj , features , train_mask , val_mask , test_mask ,
225239 labels , row_normalize = row_normalize )
0 commit comments