
Commit 21161cb

Lingjun Liu authored and committed
adjust files
1 parent 1df422c commit 21161cb

File tree

2 files changed: +266 -0 lines changed


tensorlayer/optimizers/lazyAdam.py

Lines changed: 147 additions & 0 deletions
@@ -0,0 +1,147 @@
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Optimizer from addons and learning rate scheduler."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import tensorflow as tf

K = tf.keras.backend

class LazyAdam(tf.keras.optimizers.Adam):
    """Variant of the Adam optimizer that handles sparse updates more efficiently.

    The original Adam algorithm maintains two moving-average accumulators for
    each trainable variable; the accumulators are updated at every step.
    This class provides lazier handling of gradient updates for sparse
    variables. It only updates moving-average accumulators for sparse variable
    indices that appear in the current batch, rather than updating the
    accumulators for all indices. Compared with the original Adam optimizer,
    it can provide large improvements in model training throughput for some
    applications. However, it provides slightly different semantics than the
    original Adam algorithm, and may lead to different empirical results.
    Note: amsgrad is currently not supported and the argument can only be
    False.

    This class is borrowed from:
    https://github.com/tensorflow/addons/blob/master/tensorflow_addons/optimizers/lazy_adam.py
    """

    def _resource_apply_sparse(self, grad, var, indices):
        """Applies grad for one step."""
        var_dtype = var.dtype.base_dtype
        lr_t = self._decayed_lr(var_dtype)
        beta_1_t = self._get_hyper('beta_1', var_dtype)
        beta_2_t = self._get_hyper('beta_2', var_dtype)
        local_step = tf.cast(self.iterations + 1, var_dtype)
        beta_1_power = tf.math.pow(beta_1_t, local_step)
        beta_2_power = tf.math.pow(beta_2_t, local_step)
        epsilon_t = tf.convert_to_tensor(self.epsilon, var_dtype)
        lr = (lr_t * tf.math.sqrt(1 - beta_2_power) / (1 - beta_1_power))

        # \\(m := beta1 * m + (1 - beta1) * g_t\\)
        m = self.get_slot(var, 'm')
        m_t_slice = beta_1_t * tf.gather(m, indices) + (1 - beta_1_t) * grad

        m_update_kwargs = {'resource': m.handle, 'indices': indices, 'updates': m_t_slice}
        m_update_op = tf.raw_ops.ResourceScatterUpdate(**m_update_kwargs)

        # \\(v := beta2 * v + (1 - beta2) * (g_t * g_t)\\)
        v = self.get_slot(var, 'v')
        v_t_slice = (beta_2_t * tf.gather(v, indices) + (1 - beta_2_t) * tf.math.square(grad))

        v_update_kwargs = {'resource': v.handle, 'indices': indices, 'updates': v_t_slice}
        v_update_op = tf.raw_ops.ResourceScatterUpdate(**v_update_kwargs)

        # \\(variable -= learning_rate * m_t / (epsilon_t + sqrt(v_t))\\)
        var_slice = lr * m_t_slice / (tf.math.sqrt(v_t_slice) + epsilon_t)

        var_update_kwargs = {'resource': var.handle, 'indices': indices, 'updates': var_slice}
        var_update_op = tf.raw_ops.ResourceScatterSub(**var_update_kwargs)

        return tf.group(*[var_update_op, m_update_op, v_update_op])

class LearningRateFn(object):
    """Creates learning rate function."""

    def __init__(self, learning_rate, hidden_size, warmup_steps):
        self.learning_rate = learning_rate
        self.hidden_size = hidden_size
        self.warmup_steps = float(warmup_steps)

    def __call__(self, global_step):
        """Calculate learning rate with linear warmup and rsqrt decay."""
        step = float(global_step)
        learning_rate = self.learning_rate
        learning_rate *= (self.hidden_size**-0.5)
        # Apply linear warmup
        learning_rate *= np.minimum(1.0, step / self.warmup_steps)
        # Apply rsqrt decay
        learning_rate /= np.sqrt(np.maximum(step, self.warmup_steps))
        return learning_rate

class LearningRateScheduler(tf.keras.callbacks.Callback):
    """Keras callback to schedule learning rate.

    TODO(tianlin): Refactor this scheduler and LearningRateBatchScheduler in
    official/resnet/keras/keras_common.py.
    """

    def __init__(self, schedule, init_steps=None, verbose=False):
        super(LearningRateScheduler, self).__init__()
        self.schedule = schedule
        self.verbose = verbose
        if init_steps is None:
            init_steps = 0.0
        self.steps = float(init_steps)  # Total steps during training.

    def on_epoch_begin(self, epoch, logs=None):
        if not hasattr(self.model.optimizer, 'lr'):
            raise ValueError('Optimizer must have a "lr" attribute.')
        if not hasattr(self.model.optimizer, 'iterations'):
            raise ValueError('Optimizer must have an "iterations" attribute.')

    def on_train_batch_begin(self, batch, logs=None):
        """Adjusts learning rate for each train batch."""
        if self.verbose > 0:
            iterations = K.get_value(self.model.optimizer.iterations)
            print('Original iteration %d' % iterations)

        self.steps += 1.0
        try:  # new API
            lr = float(K.get_value(self.model.optimizer.lr))
            lr = self.schedule(self.steps, lr)
        except TypeError:  # Support for old API for backward compatibility
            lr = self.schedule(self.steps)
        if not isinstance(lr, (float, np.float32, np.float64)):
            raise ValueError('The output of the "schedule" function should be float.')
        K.set_value(self.model.optimizer.lr, lr)
        K.set_value(self.model.optimizer.iterations, self.steps)

        if self.verbose > 0:
            print(
                'Batch %05d Step %05d: LearningRateScheduler setting learning '
                'rate to %s.' % (batch + 1, self.steps, lr)
            )

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        logs['lr'] = K.get_value(self.model.optimizer.lr)
        logs['steps'] = self.steps
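
Taken together, the three classes above compose into one training setup: LearningRateFn produces the Transformer-style warmup/rsqrt schedule (for example, with learning_rate=2.0, hidden_size=64 and warmup_steps=100 the rate starts near 2.0 * 64**-0.5 * (1/100) / sqrt(100) ≈ 2.5e-4, peaks at 0.025 at step 100, then decays as 1/sqrt(step)), LearningRateScheduler writes that value into optimizer.lr before every batch, and LazyAdam updates only the accumulator rows touched by the batch. A minimal usage sketch against the TF 2.x optimizer API this module targets; the toy embedding model, vocabulary size, and every hyperparameter value below are illustrative assumptions, not part of this commit:

    import numpy as np
    import tensorflow as tf
    from tensorlayer.optimizers.lazyAdam import LazyAdam, LearningRateFn, LearningRateScheduler

    # Toy data and model: the embedding layer yields the sparse gradients LazyAdam is built for.
    vocab_size, hidden_size = 1000, 64          # illustrative values, not from this commit
    x = np.random.randint(0, vocab_size, size=(512, 10))
    y = np.random.randint(0, 2, size=(512, 1))

    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, hidden_size),
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dense(1, activation='sigmoid'),
    ])

    # lr(step) = 2.0 * hidden_size**-0.5 * min(1, step / 100) / sqrt(max(step, 100))
    schedule = LearningRateFn(learning_rate=2.0, hidden_size=hidden_size, warmup_steps=100)
    scheduler = LearningRateScheduler(schedule)  # writes the scheduled rate before every batch

    # A tf.Variable learning rate so the callback's K.set_value call can assign it from batch 0.
    model.compile(optimizer=LazyAdam(learning_rate=tf.Variable(float(schedule(1)))),
                  loss='binary_crossentropy')
    model.fit(x, y, batch_size=32, epochs=2, callbacks=[scheduler], verbose=0)

Passing the learning rate as a tf.Variable is a deliberate choice here: it gives the callback something assignable before the optimizer has created its own hyperparameter variables on the first apply_gradients call.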

tests/models/test_transformer.py

Lines changed: 119 additions & 0 deletions
@@ -0,0 +1,119 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os
import unittest

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import numpy as np
import tensorflow as tf
import tensorlayer as tl
from tqdm import tqdm
from sklearn.utils import shuffle
from tensorlayer.models.transformer import Transformer
from tests.utils import CustomTestCase
from tensorlayer.models.transformer.utils import metrics
from tensorlayer.cost import cross_entropy_seq
from tensorlayer.optimizers import lazyAdam as optimizer
import time


class TINY_PARAMS(object):
    vocab_size = 50
    encoder_num_layers = 2
    decoder_num_layers = 2
    filter_number = 256
    R1 = 4
    R2 = 8
    n_channels = 2
    n_units = 128
    H = 32
    light_filter_size = (1, 3)
    filter_size = light_filter_size[-1]
    hidden_size = 64
    ff_size = 16
    num_heads = 4
    keep_prob = 0.9

    # Default prediction params
    extra_decode_length = 5
    beam_size = 2
    alpha = 0.6  # used to calculate length normalization in beam search

class Model_SEQ2SEQ_Test(CustomTestCase):

    @classmethod
    def setUpClass(cls):
        cls.batch_size = 16

        cls.embedding_size = 32
        cls.dec_seq_length = 5
        cls.trainX = np.random.randint(low=2, high=50, size=(50, 11))
        cls.trainY = np.random.randint(low=2, high=50, size=(50, 10))

        cls.trainX[:, -1] = 1
        cls.trainY[:, -1] = 1
        # Parameters
        cls.src_len = len(cls.trainX)
        cls.tgt_len = len(cls.trainY)

        assert cls.src_len == cls.tgt_len

        cls.num_epochs = 1000
        cls.n_step = cls.src_len // cls.batch_size

    @classmethod
    def tearDownClass(cls):
        pass

    def test_basic_simpleSeq2Seq(self):

        model_ = Transformer(TINY_PARAMS)

        # print(", ".join(x for x in [t.name for t in model_.trainable_weights]))

        self.vocab_size = TINY_PARAMS.vocab_size
        optimizer = tf.optimizers.Adam(learning_rate=0.01)
        for epoch in range(self.num_epochs):
            model_.train()
            t = time.time()
            trainX, trainY = shuffle(self.trainX, self.trainY)
            total_loss, n_iter = 0, 0
            for X, Y in tqdm(tl.iterate.minibatches(inputs=trainX, targets=trainY, batch_size=self.batch_size,
                                                    shuffle=False), total=self.n_step,
                             desc='Epoch[{}/{}]'.format(epoch + 1, self.num_epochs), leave=False):

                with tf.GradientTape() as tape:
                    targets = Y
                    logits = model_(inputs=X, targets=Y)
                    logits = metrics.MetricLayer(self.vocab_size)([logits, targets])
                    logits, loss = metrics.LossLayer(self.vocab_size, 0.1)([logits, targets])

                grad = tape.gradient(loss, model_.all_weights)
                optimizer.apply_gradients(zip(grad, model_.all_weights))

                total_loss += loss
                n_iter += 1
            print(time.time() - t)
            tl.files.save_npz(model_.all_weights, name='./model_v4.npz')
            model_.eval()
            test_sample = trainX[0:2, :]
            prediction = model_(inputs=test_sample)

            print("Prediction: >>>>> ", prediction["outputs"], "\n Target: >>>>> ", trainY[0:2, :], "\n\n")

            print('Epoch [{}/{}]: loss {:.4f}'.format(epoch + 1, self.num_epochs, total_loss / n_iter))


if __name__ == '__main__':
    unittest.main()
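
The test drives the Transformer with a plain tf.optimizers.Adam, so the new lazyAdam module is imported here but not exercised. A minimal sketch of how it could be swapped into the same loop, assuming TINY_PARAMS and the imports at the top of this test file are in scope; warmup_steps=100, learning_rate=2.0, beta_2=0.997 and epsilon=1e-9 are illustrative values, not part of this commit:

    from tensorlayer.optimizers.lazyAdam import LazyAdam, LearningRateFn

    model_ = Transformer(TINY_PARAMS)
    model_.train()
    trainX = np.random.randint(low=2, high=50, size=(50, 11)); trainX[:, -1] = 1
    trainY = np.random.randint(low=2, high=50, size=(50, 10)); trainY[:, -1] = 1

    # Warmup/rsqrt schedule driven by hand, since this loop does not go through model.fit().
    lr_fn = LearningRateFn(learning_rate=2.0, hidden_size=TINY_PARAMS.hidden_size, warmup_steps=100)
    opt = LazyAdam(learning_rate=lr_fn(1), beta_1=0.9, beta_2=0.997, epsilon=1e-9)

    for step, (X, Y) in enumerate(tl.iterate.minibatches(trainX, trainY, batch_size=16, shuffle=False), start=1):
        opt.learning_rate = lr_fn(step)  # assign the scheduled rate before each update
        with tf.GradientTape() as tape:
            logits = model_(inputs=X, targets=Y)
            logits, loss = metrics.LossLayer(TINY_PARAMS.vocab_size, 0.1)([logits, Y])
        grads = tape.gradient(loss, model_.all_weights)
        opt.apply_gradients(zip(grads, model_.all_weights))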
