@@ -77,12 +77,55 @@ def forward(self, inputs, targets=None):
       training: boolean, whether in training mode or not.

     Returns:
-      If targets is defined, then return logits for each word in the target
-      sequence. float tensor with shape [batch_size, target_length, vocab_size]
-      If target is none, then generate output sequence one token at a time.
-        returns a dictionary {
-          outputs: [batch_size, decoded length]
-          scores: [batch_size, float]}
+      If targets is defined:
+        Logits for each word in the target sequence:
+          float tensor with shape [batch_size, target_length, vocab_size]
+        Self-attention weights for the encoder:
+          a dictionary of float tensors {
+            "layer_0": [batch_size, number_of_heads, source_length, source_length],
+            "layer_1": [batch_size, number_of_heads, source_length, source_length],
+            ...
+          }
+        Attention weights for the decoder:
+          a dictionary of dictionaries of float tensors {
+            "self": {
+              "layer_0": [batch_size, number_of_heads, target_length, target_length],
+              "layer_1": [batch_size, number_of_heads, target_length, target_length],
+              ...
+            },
+            "enc_dec": {
+              "layer_0": [batch_size, number_of_heads, source_length, target_length],
+              "layer_1": [batch_size, number_of_heads, source_length, target_length],
+              ...
+            }
+          }
+
+      If targets is None:
+        Auto-regressive beam-search decoding generates the output sequence one
+        token at a time and returns:
+          a dictionary {
+            outputs: [batch_size, decoded length],
+            scores: [batch_size, float]
+          }
+        Attention weights for the decoder:
+          a dictionary of dictionaries of float tensors {
+            "self": {
+              "layer_0": [batch_size, number_of_heads, target_length, target_length],
+              "layer_1": [batch_size, number_of_heads, target_length, target_length],
+              ...
+            },
+            "enc_dec": {
+              "layer_0": [batch_size, number_of_heads, source_length, target_length],
+              "layer_1": [batch_size, number_of_heads, source_length, target_length],
+              ...
+            }
+          }
+        Self-attention weights for the encoder:
+          a dictionary of float tensors {
+            "layer_0": [batch_size, number_of_heads, source_length, source_length],
+            "layer_1": [batch_size, number_of_heads, source_length, source_length],
+            ...
+          }
+
     """
     # # Variance scaling is used here because it seems to work in many problems.
     # # Other reasonable initializers may also work just as well.
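For downstream consumers of these attention dictionaries, here is a small self-contained sketch of how the per-layer weights might be stacked for inspection; the tensors and dimension sizes below are dummies built to the documented shapes, not output from the model:

```python
import tensorflow as tf

# Dummy attention weights with the documented shapes (sizes are illustrative).
batch_size, number_of_heads, source_length, target_length = 2, 8, 5, 7
enc_weights = {
    "layer_%d" % i: tf.ones([batch_size, number_of_heads, source_length, source_length])
    for i in range(6)
}
dec_weights = {
    "self": {
        "layer_%d" % i: tf.ones([batch_size, number_of_heads, target_length, target_length])
        for i in range(6)
    },
    "enc_dec": {
        "layer_%d" % i: tf.ones([batch_size, number_of_heads, source_length, target_length])
        for i in range(6)
    },
}

# Stack the per-layer encoder maps into one tensor:
# [num_layers, batch_size, number_of_heads, source_length, source_length]
enc_stack = tf.stack([enc_weights["layer_%d" % i] for i in range(len(enc_weights))])

# Average over heads to get one [source_length, source_length] map per layer
# for the first example in the batch.
per_layer_maps = tf.reduce_mean(enc_stack[:, 0], axis=1)
print(per_layer_maps.shape)  # (6, 5, 5)
```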
@@ -118,6 +161,7 @@ def encode(self, inputs, attention_bias):

     Returns:
       float tensor with shape [batch_size, input_length, hidden_size]
+
     """

     # Prepare inputs to the layer stack by adding positional encodings and
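The truncated comment above concerns adding positional encodings before the encoder stack. For reference, a minimal sketch of the standard sinusoidal encoding from the Transformer paper; the helper name and signature here are illustrative, not necessarily the ones used in this file:

```python
import numpy as np
import tensorflow as tf

def sinusoidal_position_encoding(length, hidden_size,
                                 min_timescale=1.0, max_timescale=1.0e4):
    # Standard "Attention Is All You Need" encoding: sin/cos at geometrically
    # spaced timescales, concatenated into a [length, hidden_size] table.
    num_timescales = hidden_size // 2
    position = np.arange(length, dtype=np.float32)
    log_increment = np.log(max_timescale / min_timescale) / max(num_timescales - 1, 1)
    inv_timescales = min_timescale * np.exp(
        -log_increment * np.arange(num_timescales, dtype=np.float32))
    scaled_time = position[:, None] * inv_timescales[None, :]
    signal = np.concatenate([np.sin(scaled_time), np.cos(scaled_time)], axis=1)
    return tf.constant(signal, dtype=tf.float32)

# Added to the (scaled) embeddings before the first encoder layer.
pos_encoding = sinusoidal_position_encoding(length=50, hidden_size=512)
print(pos_encoding.shape)  # (50, 512)
```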
@@ -223,7 +267,12 @@ def symbols_to_logits_fn(ids, i, cache):
     return symbols_to_logits_fn, weights

   def predict(self, encoder_outputs, encoder_decoder_attention_bias):
-    """Return predicted sequence."""
+    """Return the predicted sequence and the decoder attention weights."""
     batch_size = tf.shape(encoder_outputs)[0]
     input_length = tf.shape(encoder_outputs)[1]
     max_decode_length = input_length + self.params.extra_decode_length
@@ -263,7 +312,15 @@ def predict(self, encoder_outputs, encoder_decoder_attention_bias):
     top_decoded_ids = decoded_ids[:, 0, 1:]
     top_scores = scores[:, 0]

-    return {"outputs": top_decoded_ids, "scores": top_scores}, weights
+    # Merge the per-step attention weights collected while decoding into a
+    # single tensor per layer.
+    for i, weight in enumerate(weights):
+      if i == 0:
+        w = weight
+      else:
+        for k in range(len(w['self'])):
+          w['self']['layer_%d' % k] = tf.concat(
+              [w['self']['layer_%d' % k], weight['self']['layer_%d' % k]], 3)
+          w['enc_dec']['layer_%d' % k] = tf.concat(
+              [w['enc_dec']['layer_%d' % k], weight['enc_dec']['layer_%d' % k]], 2)
+    return {"outputs": top_decoded_ids, "scores": top_scores}, w


 class LayerNormalization(tl.layers.Layer):
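The weight-merging loop added to `predict` concatenates the attention maps collected at each beam-search step into one tensor per layer. A self-contained sketch of that accumulation pattern follows; the per-step shape and the concatenation axis are assumptions chosen for illustration, not the decoder's actual shapes:

```python
import tensorflow as tf

# Assume each decode step yields one attention map with a singleton time axis:
# [batch_size, number_of_heads, 1, source_length] (illustrative shape).
batch_size, number_of_heads, source_length, num_steps = 2, 4, 7, 3
per_step = [tf.ones([batch_size, number_of_heads, 1, source_length])
            for _ in range(num_steps)]

merged = per_step[0]
for step_weights in per_step[1:]:
    # Grow the accumulated tensor by one slice per generated token.
    merged = tf.concat([merged, step_weights], axis=2)

print(merged.shape)  # (2, 4, 3, 7)
```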