Fixing documentation.

otiliastr · otiliastr · commit 404c29e169db · 2019-09-19T15:03:13.000-04:00
diff --git a/neural_structured_learning/research/gam/models/cnn.py b/neural_structured_learning/research/gam/models/cnn.py
@@ -42,15 +42,11 @@ class ImageCNNAgreement(Model):
     output_dim: Integer representing the number of classes.
     channels: Integer representing the number of channels in the input images
       (e.g., 1 for black and white, 3 for RGB).
-    aggregation: String representing an aggregation operation that could be
-      applied to the inputs. See superclass attributes for details.
-    hidden_prediction: A tuple or list of integers representing the number of
-      units in each layer of output multilayer percepron. After the inputs are
-      passed through the convolution layers (and potentially aggregated), they
-      are passed through a fully connected network with these numbers of hidden
-      units in each layer.
+    aggregation: String representing an aggregation operation that is applied
+      to the two inputs of the agreement model, after they are encoded through
+      the convolution layers. See superclass attributes for details.
     activation: An activation function to be applied to the outputs of each
-      fully connected layer.
+      fully connected layer of the aggregation network.
     is_binary_classification: Boolean specifying if this is model for
       binary classification. If so, it uses a different loss function and
       returns predictions with a single dimension, batch size.
diff --git a/neural_structured_learning/research/gam/models/wide_resnet.py b/neural_structured_learning/research/gam/models/wide_resnet.py
@@ -121,7 +121,7 @@ def get_encoding_and_params(self,
                               inputs,
                               is_train,
                               update_batch_stats=True,
-                              **kwargs):
+                              **unused_kwargs):
     """Creates the model hidden representations and prediction ops.
 
     For this model, the hidden representation is the last layer
@@ -134,7 +134,7 @@ def get_encoding_and_params(self,
         this model will be used for training or for test.
       update_batch_stats: Boolean specifying whether to update the batch norm
         statistics.
-      **kwargs: Other keyword arguments.
+      **unused_kwargs: Other unused keyword arguments.
 
     Returns:
       encoding: A tensor containing an encoded batch of samples. The first
diff --git a/neural_structured_learning/research/gam/trainer/trainer_agreement.py b/neural_structured_learning/research/gam/trainer/trainer_agreement.py
@@ -614,16 +614,12 @@ def train(self, data, session=None, **kwargs):
     # Compute ratio of positives to negative samples.
     labeled_samples_labels = data.get_labels(labeled_samples)
     ratio_pos_to_neg = self._compute_ratio_pos_neg(labeled_samples_labels)
-    # Select a validation set out of all pairs of labeled samples.
-    # TODO: remove this.
-    # neighbors_val, agreement_labels_val = self._select_val_set(
-    #     labeled_samples, num_samples_val, data, ratio_pos_to_neg)
-    # Create a train iterator that potentially excludes the validation samples.
-    # data_iterator_train = self._train_iterator(
-    #     labeled_samples, neighbors_val, data, ratio_pos_to_neg=ratio_pos_to_neg)
 
+    # Split data into train and validation.
     labeled_samples_train, labeled_nodes_val = self._select_val_samples(
       labeled_samples, self.ratio_val)
+
+    # Create an iterator over training data pairs.
     data_iterator_train = self._pair_iterator(labeled_samples_train, data,
                                               ratio_pos_neg=ratio_pos_to_neg)
 
@@ -659,14 +655,6 @@ def train(self, data, session=None, **kwargs):
         if num_samples_val == 0:
           logging.info('Skipping validation. No validation samples available.')
           break
-        # TODO: remove this.
-        # data_iterator_val = batch_iterator(
-        #     neighbors_val,
-        #     agreement_labels_val,
-        #     self.batch_size,
-        #     shuffle=False,
-        #     allow_smaller_batch=True,
-        #     repeat=False)
         data_iterator_val = self._pair_iterator(labeled_nodes_val, data)
         feed_dict_val = self._construct_feed_dict(
             data_iterator_val, is_train=False)
@@ -880,7 +868,28 @@ def predict_label_by_agreement(self, session, indices, num_neighbors=100):
     return acc
 
   def _pair_iterator(self, labeled_nodes, data, ratio_pos_neg=None):
-    # TODO: add documentation and rename neighbors to samples.
+    """An iterator over pairs of samples for training the agreement model.
+
+    Provides batches of node pairs, including their features and the agreement
+    label (i.e. whether their labels agree).
+
+    Arguments:
+      labeled_nodes: An array of integers representing the indices of the
+        labeled samples.
+      data: A Dataset object used to provide the labels of the labeled samples.
+      ratio_pos_neg: A float representing the ratio of positive to negative
+        samples in the training set. If this is provided, the train iterator
+        will do rejection sampling based on this ratio to keep the training
+        data balanced. If None, we sample uniformly.
+
+    Yields:
+      neighbors_batch: An array of shape (batch_size, 2), where each row
+        represents a pair of sample indices; presumably both are drawn from
+        the provided labeled_nodes.
+      agreement_batch: An array of shape (batch_size,) with binary values,
+        where each row represents whether the labels of the corresponding
+        neighbor pair agree (1.0) or not (0.0).
+    """
     neighbors_batch = np.empty(shape=(self.batch_size, 2), dtype=np.int32)
     agreement_batch = np.empty(shape=(self.batch_size,), dtype=np.float32)
     while True:
@@ -905,7 +914,23 @@ def _pair_iterator(self, labeled_nodes, data, ratio_pos_neg=None):
       yield neighbors_batch, agreement_batch
 
   def _select_val_samples(self, labeled_samples, ratio_val):
-    # TODO: add documentation.
+    """Split the labeled samples into a train and a validation set.
+
+    The agreement model is trained using pairs of labeled samples from the train
+    set, and is evaluated on pairs of labeled samples from the validation set.
+
+    Arguments:
+      labeled_samples: An array of indices of the labeled samples.
+      ratio_val: A number between (0, 1) representing the ratio of all labeled
+        samples to be set aside for validation.
+
+    Returns:
+      labeled_samples_train: An array containing a subset of the provided
+        labeled_samples which will be used for training.
+      labeled_samples_val: An array containing a subset of the provided
+        labeled_samples which will be used for validation. The train and
+        validation indices are non-overlapping.
+    """
     num_labeled_samples = labeled_samples.shape[0]
     num_labeled_samples_val = int(num_labeled_samples * ratio_val)
     self.rng.shuffle(labeled_samples)