Commit 9758519

Neural-Link Team authored and tensorflow-copybara committed
Copybara import of the project:
-- 2b68d08 by Otilia Stretcu <otiliastr@gmail.com>: Small fixes.
-- 3265019 by Otilia Stretcu <otiliastr@gmail.com>: Changing model name.
-- dca1457 by Otilia Stretcu <otiliastr@gmail.com>: Refactoring.
-- 8a90308 by Otilia Stretcu <otiliastr@gmail.com>: Refactoring.
-- eef1760 by Otilia Stretcu <otiliastr@gmail.com>: Removed unused function.
-- 7954700 by Otilia Stretcu <otiliastr@gmail.com>: Rename run script.
-- 25a71f2 by Otilia Stretcu <otiliastr@gmail.com>: Added support for Planetoid datasets, and training using the graph edges.
-- 69aa91c by Otilia Stretcu <otiliastr@gmail.com>: Fix documentation and indentation.
-- b43e5dc by Otilia Stretcu <otiliastr@gmail.com>: Changed model name.
-- daf5d6b by Otilia Stretcu <otiliastr@gmail.com>: Small refactoring.
-- dbfdcf5 by Otilia Stretcu <otiliastr@gmail.com>: Fix add_negative_edges_agr.
-- 1fa5376 by Otilia Stretcu <otiliastr@gmail.com>: Added NGM agreement.
-- 48ff042 by Otilia Stretcu <otiliastr@gmail.com>: Removed indices val from unlabeled in transductive setting.
-- 1c46ee1 by Otilia Stretcu <otiliastr@gmail.com>: Removed indices val from self-labeling step.
-- ddab3ee by Otilia Stretcu <otiliastr@gmail.com>: Fixing issue with no graph edges available between two labeled nodes.
-- 2e8eff8 by Otilia Stretcu <otiliastr@gmail.com>: Fix edge iterator when none are available.

PiperOrigin-RevId: 273611867
1 parent 8650926 commit 9758519

File tree

9 files changed: +1492 additions, -419 deletions


neural_structured_learning/research/gam/data/dataset.py

Lines changed: 269 additions & 111 deletions
Large diffs are not rendered by default.
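Although the dataset.py diff is collapsed here, the new construction entry point it introduces, Dataset.build_from_splits, is visible from the call sites in the loaders.py diff below. A minimal sketch of that path, using toy placeholder arrays (the keyword names are taken from those call sites; everything else is illustrative):

import numpy as np

from gam.data.dataset import Dataset

# Toy placeholder splits; real callers pass the arrays produced by the loaders.
data = Dataset.build_from_splits(
    name='toy',
    inputs_train=np.zeros((10, 4)),
    labels_train=np.zeros(10, dtype=np.int64),
    inputs_val=np.zeros((5, 4)),
    labels_val=np.zeros(5, dtype=np.int64),
    inputs_test=np.zeros((5, 4)),
    labels_test=np.zeros(5, dtype=np.int64),
    inputs_unlabeled=np.zeros((20, 4)),
    labels_unlabeled=np.zeros(20, dtype=np.int64),
    feature_preproc_fn=lambda x: x)  # identity preprocessing, as used for cifar10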

neural_structured_learning/research/gam/data/loaders.py

Lines changed: 169 additions & 15 deletions
@@ -19,18 +19,23 @@
 
 import json
 import logging
+import os
 import pickle
+import sys
 
-from gam.data.dataset import FixedDataset
+from gam.data.dataset import Dataset
+from gam.data.dataset import PlanetoidDataset
 from gam.data.preprocessing import convert_image
 from gam.data.preprocessing import split_train_val_unlabeled
 
+import networkx as nx
 import numpy as np
+from scipy import sparse as sp
 import tensorflow_datasets as tfds
 
 
-def load_data_tf_datasets(
-    dataset_name, target_num_train_per_class, target_num_val, seed):
+def load_data_tf_datasets(dataset_name, target_num_train_per_class,
+                          target_num_val, seed):
   """Load and preprocess data from tensorflow_datasets."""
   logging.info('Loading and preprocessing data from tensorflow datasets...')
   # Load train data.
@@ -58,17 +63,24 @@ def load_data_tf_datasets(
   unlabeled_labels = data[5]
 
   logging.info('Converting data to Dataset format...')
-  data = FixedDataset(train_inputs, train_labels, val_inputs, val_labels,
-                      test_inputs, test_labels, unlabeled_inputs,
-                      unlabeled_labels, feature_preproc_fn=convert_image)
+  data = Dataset.build_from_splits(
+      name=dataset_name,
+      inputs_train=train_inputs,
+      labels_train=train_labels,
+      inputs_val=val_inputs,
+      labels_val=val_labels,
+      inputs_test=test_inputs,
+      labels_test=test_labels,
+      inputs_unlabeled=unlabeled_inputs,
+      labels_unlabeled=unlabeled_labels,
+      feature_preproc_fn=convert_image)
   return data
 
 
 def load_data_realistic_ssl(dataset_name, data_path, label_map_path):
   """Loads data from the `ealistic Evaluation of Deep SSL Algorithms`."""
   logging.info('Loading data from pickle at %s.', data_path)
-  train_set, validation_set, test_set = pickle.load(
-      open(data_path, 'rb'))
+  train_set, validation_set, test_set = pickle.load(open(data_path, 'rb'))
   train_inputs = train_set['images']
   train_labels = train_set['labels']
   val_inputs = validation_set['images']
@@ -77,8 +89,9 @@ def load_data_realistic_ssl(dataset_name, data_path, label_map_path):
   test_labels = test_set['labels']
   # Load label map that specifies which trainining labeles are available.
   train_indices = json.load(open(label_map_path, 'r'))
-  train_indices = [int(key.encode('ascii', 'ignore'))
-                   for key in train_indices['values']]
+  train_indices = [
+      int(key.encode('ascii', 'ignore')) for key in train_indices['values']
+  ]
   train_indices = np.asarray(train_indices)
 
   # Select the loaded train indices, and make the rest unlabeled.
@@ -90,11 +103,152 @@ def load_data_realistic_ssl(dataset_name, data_path, label_map_path):
   train_labels = train_labels[train_indices]
 
   # Select a feature preprocessing function, depending on the dataset.
-  feature_preproc_fn = ((lambda image: image) if dataset_name == 'cifar10' else
-                        convert_image)
+  feature_preproc_fn = ((lambda image: image)
+                        if dataset_name == 'cifar10' else convert_image)
 
-  data = FixedDataset(
-      train_inputs, train_labels, val_inputs, val_labels, test_inputs,
-      test_labels, unlabeled_inputs, unlabeled_labels,
+  data = Dataset.build_from_splits(
+      name=dataset_name,
+      inputs_train=train_inputs,
+      labels_train=train_labels,
+      inputs_val=val_inputs,
+      labels_val=val_labels,
+      inputs_test=test_inputs,
+      labels_test=test_labels,
+      inputs_unlabeled=unlabeled_inputs,
+      labels_unlabeled=unlabeled_labels,
       feature_preproc_fn=feature_preproc_fn)
   return data
+
+
+def load_from_planetoid_files(dataset_name, path):
+  """Loads Planetoid data in GCN format, as released with the GCN code.
+
+  This function is adapted from https://github.com/tkipf/gcn.
+
+  This function assumes that the following files can be found at the location
+  specified by `path`:
+
+    ind.dataset_str.x          => the feature vectors of the training instances
+                                  as scipy.sparse.csr.csr_matrix object.
+    ind.dataset_str.tx         => the feature vectors of the test instances as
+                                  scipy.sparse.csr.csr_matrix object.
+    ind.dataset_str.allx       => the feature vectors of both labeled and
+                                  unlabeled training instances (a superset of
+                                  ind.dataset_str.x) as
+                                  scipy.sparse.csr.csr_matrix object.
+    ind.dataset_str.y          => the one-hot labels of the labeled training
+                                  instances as numpy.ndarray object.
+    ind.dataset_str.ty         => the one-hot labels of the test instances as
+                                  numpy.ndarray object.
+    ind.dataset_str.ally       => the labels for instances in
+                                  ind.dataset_str.allx as numpy.ndarray object.
+    ind.dataset_str.graph      => a dict in the format
+                                  {index: [index_of_neighbor_nodes]} as
+                                  collections.defaultdict object.
+    ind.dataset_str.test.index => the indices of test instances in graph, for
+                                  the inductive setting as list object.
+
+  Args:
+    dataset_name: A string representing the dataset name (e.g., `cora`).
+    path: Path to the directory containing the files.
+
+  Returns:
+    All data input files loaded (as well the training/test data).
+  """
+
+  def _sample_mask(idx, l):
+    """Create mask."""
+    mask = np.zeros(l)
+    mask[idx] = 1
+    return np.array(mask, dtype=np.bool)
+
+  def _parse_index_file(filename):
+    """Parse index file."""
+    index = []
+    for line in open(filename):
+      index.append(int(line.strip()))
+    return index
+
+  def _load_file(name):
+    """Load from data file."""
+    filename = 'ind.{}.{}'.format(dataset_name, name)
+    filename = os.path.join(path, filename)
+    with open(filename, 'rb') as f:
+      if sys.version_info > (3, 0):
+        return pickle.load(f, encoding='latin1')  # pylint: disable=unexpected-keyword-arg
+      else:
+        return pickle.load(f)
+
+  x = _load_file('x')
+  y = _load_file('y')
+  tx = _load_file('tx')
+  ty = _load_file('ty')
+  allx = _load_file('allx')
+  ally = _load_file('ally')
+  graph = _load_file('graph')
+
+  filename = 'ind.{}.test.index'.format(dataset_name)
+  filename = os.path.join(path, filename)
+  test_idx_reorder = _parse_index_file(filename)
+  test_idx_range = np.sort(test_idx_reorder)
+
+  if dataset_name == 'citeseer':
+    # Fix citeseer dataset (there are some isolated nodes in the graph).
+    # Find isolated nodes, add them as zero-vecs into the right position.
+    test_idx_range_full = range(
+        min(test_idx_reorder),
+        max(test_idx_reorder) + 1)
+    tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
+    tx_extended[test_idx_range - min(test_idx_range), :] = tx
+    tx = tx_extended
+    ty_extended = np.zeros((len(test_idx_range_full), y.shape[1]))
+    ty_extended[test_idx_range - min(test_idx_range), :] = ty
+    ty = ty_extended
+
+  features = sp.vstack((allx, tx)).tolil()
+  features[test_idx_reorder, :] = features[test_idx_range, :]
+  adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))
+
+  labels = np.vstack((ally, ty))
+  labels[test_idx_reorder, :] = labels[test_idx_range, :]
+
+  idx_test = test_idx_range.tolist()
+  idx_train = range(len(y))
+  idx_val = range(len(y), len(y) + 500)
+
+  train_mask = _sample_mask(idx_train, labels.shape[0])
+  val_mask = _sample_mask(idx_val, labels.shape[0])
+  test_mask = _sample_mask(idx_test, labels.shape[0])
+
+  y_train = np.zeros(labels.shape)
+  y_val = np.zeros(labels.shape)
+  y_test = np.zeros(labels.shape)
+  y_train[train_mask, :] = labels[train_mask, :]
+  y_val[val_mask, :] = labels[val_mask, :]
+  y_test[test_mask, :] = labels[test_mask, :]
+
+  return (adj, features, y_train, y_val, y_test, train_mask, val_mask,
+          test_mask, labels)
+
+
+def load_data_planetoid(name, path, splits_path=None, row_normalize=False):
+  """Load Planetoid data."""
+  if splits_path is None:
+    # Load from file in Planetoid format.
+    (adj, features, _, _, _, train_mask, val_mask, test_mask,
+     labels) = load_from_planetoid_files(name, path)
+  else:
+    # Otherwise load from a path where we saved a pickle with random splits.
+    logging.info('Loading from splits path: %s', splits_path)
+    (adj, features, _, _, _, train_mask, val_mask, test_mask,
+     labels) = pickle.load(open(splits_path, 'rb'))
+
+  return PlanetoidDataset(
+      name,
+      adj,
+      features,
+      train_mask,
+      val_mask,
+      test_mask,
+      labels,
+      row_normalize=row_normalize)
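
As a usage note, a minimal sketch of driving the new Planetoid loading path added above; it assumes the GCN-format files (ind.cora.x through ind.cora.test.index) already sit in a local directory, and the directory name below is hypothetical:

from gam.data.loaders import load_data_planetoid

# Hypothetical directory holding the ind.cora.* files released with the GCN code.
planetoid_dir = '/tmp/planetoid_data'

# Loads the adjacency matrix, features, labels and train/val/test masks and
# wraps them in a PlanetoidDataset; row_normalize presumably row-normalizes
# the feature matrix.
data = load_data_planetoid(name='cora', path=planetoid_dir, row_normalize=True)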

neural_structured_learning/research/gam/data/preprocessing.py

Lines changed: 32 additions & 4 deletions
@@ -29,8 +29,36 @@ def convert_image(image):
   return image
 
 
-def split_train_val_unlabeled(train_inputs, train_labels,
-                              target_num_train_per_class, target_num_val,
+def split_train_val(indices, ratio_val, rng, max_num_val=None):
+  """Split the train sample indices into train and validation.
+
+  Args:
+    indices: A numpy array containing the indices of the training samples.
+    ratio_val: A float number between (0, 1) representing the ratio of samples
+      to use for validation.
+    rng: A random number generator.
+    max_num_val: An integer representing the maximum number of samples to
+      include in the validation set.
+
+  Returns:
+    Two numpy arrays containing the subset of indices used for training, and
+    validation, respectively.
+  """
+  num_samples = indices.shape[0]
+  num_val = int(ratio_val * num_samples)
+  if max_num_val and num_val > max_num_val:
+    num_val = max_num_val
+  ind = np.arange(0, num_samples)
+  rng.shuffle(ind)
+  ind_val = ind[:num_val]
+  ind_train = ind[num_val:]
+  return ind_train, ind_val
+
+
+def split_train_val_unlabeled(train_inputs,
+                              train_labels,
+                              target_num_train_per_class,
+                              target_num_val,
                               seed=None):
   """Splits the training data into train, validation and unlabeled samples.
 
@@ -102,5 +130,5 @@ def split_train_val_unlabeled(train_inputs, train_labels,
   train_inputs = train_inputs[ind_train]
   train_labels = train_labels[ind_train]
 
-  return (train_inputs, train_labels, val_inputs, val_labels,
-          unlabeled_inputs, unlabeled_labels)
+  return (train_inputs, train_labels, val_inputs, val_labels, unlabeled_inputs,
+          unlabeled_labels)
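
A small sketch of the new split_train_val helper in isolation (toy indices; the rng argument is any NumPy-style random state exposing shuffle):

import numpy as np

from gam.data.preprocessing import split_train_val

rng = np.random.RandomState(seed=42)
indices = np.arange(1000)  # positions of the labeled training samples

# Hold out 10% of the samples for validation, capped at 50 samples.
ind_train, ind_val = split_train_val(
    indices, ratio_val=0.1, rng=rng, max_num_val=50)
assert len(ind_val) == 50 and len(ind_train) == 950
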
Lines changed: 116 additions & 0 deletions
@@ -0,0 +1,116 @@
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Helper functions for GAMs."""
+from gam.models.cnn import ImageCNNAgreement
+from gam.models.mlp import MLP
+from gam.models.wide_resnet import WideResnet
+
+import tensorflow as tf
+
+
+def parse_layers_string(layers_string):
+  """Convert a layer size string (e.g., `128_64_32`) to a list of integers."""
+  if not layers_string:
+    return ()
+  num_hidden = layers_string.split('_')
+  num_hidden = [int(num) for num in num_hidden]
+  return num_hidden
+
+
+def get_model_cls(model_name, data, dataset_name, hidden=None, **unused_kwargs):
+  """Picks the models depending on the provided configuration flags."""
+  # Create model classification.
+  if model_name == 'mlp':
+    hidden = parse_layers_string(hidden) if hidden is not None else ()
+    return MLP(
+        output_dim=data.num_classes,
+        hidden_sizes=hidden,
+        activation=tf.nn.leaky_relu,
+        name='mlp_cls')
+  elif model_name == 'cnn':
+    if dataset_name in ('mnist', 'fashion_mnist'):
+      channels = 1
+    elif dataset_name in ('cifar10', 'cifar100', 'svhn_cropped', 'svhn'):
+      channels = 3
+    else:
+      raise ValueError('Dataset name `%s` unsupported.' % dataset_name)
+    return ImageCNNAgreement(
+        output_dim=data.num_classes,
+        channels=channels,
+        activation=tf.nn.leaky_relu,
+        name='cnn_cls')
+  elif model_name == 'wide_resnet':
+    return WideResnet(
+        num_classes=data.num_classes,
+        lrelu_leakiness=0.1,
+        horizontal_flip=dataset_name in ('cifar10',),
+        random_translation=True,
+        gaussian_noise=dataset_name not in ('svhn', 'svhn_cropped'),
+        width=2,
+        num_residual_units=4,
+        name='wide_resnet_cls')
+  else:
+    raise NotImplementedError()
+
+
+def get_model_agr(model_name,
+                  dataset_name,
+                  hidden_aggreg=None,
+                  aggregation_agr_inputs='dist',
+                  hidden=None,
+                  **unused_kwargs):
+  """Create agreement model."""
+  hidden = parse_layers_string(hidden) if hidden is not None else ()
+  hidden_aggreg = (
+      parse_layers_string(hidden_aggreg) if hidden_aggreg is not None else ())
+  if model_name == 'mlp':
+    return MLP(
+        output_dim=1,
+        hidden_sizes=hidden,
+        activation=tf.nn.leaky_relu,
+        aggregation=aggregation_agr_inputs,
+        hidden_aggregation=hidden_aggreg,
+        is_binary_classification=True,
+        name='mlp_agr')
+  elif model_name == 'cnn':
+    if dataset_name in ('mnist', 'fashion_mnist'):
+      channels = 1
+    elif dataset_name in ('cifar10', 'cifar100', 'svhn_cropped', 'svhn'):
+      channels = 3
+    else:
+      raise ValueError('Dataset name `%s` unsupported.' % dataset_name)
+    return ImageCNNAgreement(
+        output_dim=1,
+        channels=channels,
+        activation=tf.nn.leaky_relu,
+        aggregation=aggregation_agr_inputs,
+        hidden_aggregation=hidden_aggreg,
+        is_binary_classification=True,
+        name='cnn_agr')
+  elif model_name == 'wide_resnet':
+    return WideResnet(
+        num_classes=1,
+        lrelu_leakiness=0.1,
+        horizontal_flip=dataset_name in ('cifar10',),
+        random_translation=True,
+        gaussian_noise=dataset_name not in ('svhn', 'svhn_cropped'),
+        width=2,
+        num_residual_units=4,
+        name='wide_resnet_cls',
+        is_binary_classification=True,
+        aggregation=aggregation_agr_inputs,
+        activation=tf.nn.leaky_relu,
+        hidden_aggregation=hidden_aggreg)
+  else:
+    raise NotImplementedError()
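
To illustrate how these helpers fit together, a sketch of building a classification and an agreement model pair; the module path of this new file is not shown in the rendered diff, so the import is left implicit, and `data` stands for any loaded Dataset exposing num_classes (see loaders.py above):

# Assume get_model_cls / get_model_agr from the new helper module are in scope,
# e.g. imported from wherever this file lives in the gam package.

# Classification model: a CNN sized for 3-channel CIFAR-10 images.
model_cls = get_model_cls(model_name='cnn', data=data, dataset_name='cifar10')

# Agreement model: an MLP that predicts (as a binary output) whether two
# samples share a label. Hidden sizes are underscore-separated strings parsed
# by parse_layers_string, e.g. '256_128' -> [256, 128].
model_agr = get_model_agr(
    model_name='mlp',
    dataset_name='cifar10',
    hidden_aggreg='128_64',
    aggregation_agr_inputs='dist',
    hidden='256_128')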
