In [None]:
from __future__ import division, print_function, unicode_literals

import numpy as np
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data

# to make this notebook's output stable across runs
def reset_graph(seed=42):
    """Reset the TensorFlow default graph and re-seed both random generators.

    Args:
        seed: Integer used for NumPy's global seed and for TensorFlow's
            graph-level seed. Defaults to 42.
    """
    # NumPy's generator is independent of the TensorFlow graph.
    np.random.seed(seed)
    # The TensorFlow seed is set only after the reset, so it is applied to the
    # new default graph rather than to the one being discarded.
    tf.reset_default_graph()
    tf.set_random_seed(seed)

The following code trains a Deep Neural Network (DNN) for classification with two hidden layers (one with 300
neurons, and the other with 100 neurons) and a softmax output layer with 10 neurons. We run this code on the MNIST dataset. Moreover, we apply some changes (listed below) to the network to tackle the following challenges:
* Overfitting: L1 regularization
* Vanishing gradients: He initialization, the ELU activation function in the hidden layers, and Batch Normalization at each layer
* Training speed: Adam optimization

In this code, we also use the **learning rate scheduling** technique to update the learning rate as training progresses. Finding a good learning rate can be tricky. If you set it way too high, training may actually diverge, and if you set it too low, training will eventually converge to the optimum, but it will take a very long time. In this example we use exponential scheduling to update the learning rate. To do so, after setting the hyperparameter values, we create a nontrainable variable `global_step` (initialized to 0) to keep track of the current training iteration number. Then we define an exponentially decaying learning rate using TensorFlow's `exponential_decay()` function. Next, we create an optimizer using this decaying learning rate. Finally, we create the training operation by calling the optimizer's `minimize()` method; since we pass it the `global_step` variable, it will kindly take care of incrementing it.

In [None]:
# Build the network layer by layer (L1 regularization + batch normalization)
# and train it with Adam on an exponentially decaying learning rate.
reset_graph()

########################################
# hyperparameters
########################################
n_features = 28 * 28  # MNIST
n_hidden1 = 300
n_hidden2 = 100
n_outputs = 10
batch_norm_momentum = 0.9
n_epochs = 20
batch_size = 50
scale = 0.001  # L1 regularization strength used for every dense kernel

# Learning rate schedule: these three values are handed to
# tf.train.exponential_decay() below. There is deliberately no separate
# constant `learning_rate` hyperparameter: the name `learning_rate` is bound
# to the decayed-rate tensor in the "train" scope.
initial_learning_rate = 0.1
decay_steps = 10000
decay_rate = 1 / 10

########################################
# load the mnist dataset
########################################
mnist = input_data.read_data_sets("/tmp/data/")
X_train = mnist.train.images
X_test = mnist.test.images
y_train = mnist.train.labels.astype("int")
y_test = mnist.test.labels.astype("int")

########################################
# define the placeholders
########################################
X = tf.placeholder(tf.float32, shape=(None, n_features), name="X")
y = tf.placeholder(tf.int64, shape=(None), name="y")
# Defaults to False, so evaluation runs do not need to feed it; the training
# loop feeds True explicitly.
training = tf.placeholder_with_default(False, shape=(), name="training")

########################################
# build the network (using L1 regularization and batch normalization)
########################################
with tf.name_scope("dnn"):
    # He initialization
    he_init = tf.contrib.layers.variance_scaling_initializer()

    # one regularizer object, shared by all three dense layers
    l1_reg = tf.contrib.layers.l1_regularizer(scale)

    # hidden layer 1: dense -> batch norm -> ELU
    hidden1 = tf.layers.dense(X, n_hidden1, kernel_initializer=he_init,
                              kernel_regularizer=l1_reg, name="hidden1")
    bn1 = tf.layers.batch_normalization(hidden1, training=training, momentum=batch_norm_momentum)
    bn1_act = tf.nn.elu(bn1)

    # hidden layer 2: dense -> batch norm -> ELU
    hidden2 = tf.layers.dense(bn1_act, n_hidden2, kernel_initializer=he_init,
                              kernel_regularizer=l1_reg, name="hidden2")
    bn2 = tf.layers.batch_normalization(hidden2, training=training, momentum=batch_norm_momentum)
    bn2_act = tf.nn.elu(bn2)

    # output layer: dense -> batch norm; no activation here because the
    # softmax is applied inside the cross-entropy op of the loss
    logits_before_bn = tf.layers.dense(bn2_act, n_outputs, kernel_initializer=he_init,
                                       kernel_regularizer=l1_reg, name="outputs")
    logits = tf.layers.batch_normalization(logits_before_bn, training=training, momentum=batch_norm_momentum)

########################################
# define the cost (loss) function
########################################
with tf.name_scope("loss"):
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    base_cost = tf.reduce_mean(cross_entropy, name="avg_xentropy")
    # total of the L1 penalties registered by the dense layers above
    reg_cost = tf.losses.get_regularization_loss()
    cost = base_cost + reg_cost

########################################
# train the model (use learning rate scheduling)
########################################
with tf.name_scope("train"):
    # non-trainable step counter; minimize() increments it on every training step
    global_step = tf.Variable(0, trainable=False, name="global_step")
    learning_rate = tf.train.exponential_decay(initial_learning_rate, global_step, decay_steps, decay_rate)
    optimizer = tf.train.AdamOptimizer(learning_rate)
    training_op = optimizer.minimize(cost, global_step=global_step)

########################################
# define the evaluation metrics
########################################
with tf.name_scope("eval"):
    # a prediction is correct when the true label has the highest logit
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

########################################
# execute the model
########################################
init = tf.global_variables_initializer()

# since we are using tf.layers.batch_normalization(), we need to explicitly run the extra update operations
# needed by batch normalization
extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

# number of mini-batches per epoch (loop invariant)
n_batches = mnist.train.num_examples // batch_size

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        for iteration in range(n_batches):
            X_batch, y_batch = mnist.train.next_batch(batch_size)
            sess.run([training_op, extra_update_ops], feed_dict={training: True, X: X_batch, y: y_batch})

        # NOTE: the train accuracy is measured on the last mini-batch of the
        # epoch only, so it is a noisy estimate; `training` keeps its default
        # (False) for both evaluations.
        acc_train = accuracy.eval(feed_dict={X: X_batch, y: y_batch})
        acc_test = accuracy.eval(feed_dict={X: X_test, y: y_test})
        print(epoch, "Train accuracy:", acc_train, "Test accuracy:", acc_test)

Here, we use the dropout technique instead of l1-regularization to solve the overfitting problem.

In [None]:
# Build the network layer by layer (dropout + batch normalization) and train
# it with Adam on an exponentially decaying learning rate.
reset_graph()

########################################
# hyperparameters
########################################
n_features = 28 * 28  # MNIST
n_hidden1 = 300
n_hidden2 = 100
n_outputs = 10
batch_norm_momentum = 0.9
n_epochs = 20
batch_size = 50
dropout_rate = 0.5  # rate passed to every tf.layers.dropout() call below

# Learning rate schedule: these three values are handed to
# tf.train.exponential_decay() below. There is deliberately no separate
# constant `learning_rate` hyperparameter: the name `learning_rate` is bound
# to the decayed-rate tensor in the "train" scope.
initial_learning_rate = 0.1
decay_steps = 10000
decay_rate = 1 / 10

########################################
# load the mnist dataset
########################################
mnist = input_data.read_data_sets("/tmp/data/")
X_train = mnist.train.images
X_test = mnist.test.images
y_train = mnist.train.labels.astype("int")
y_test = mnist.test.labels.astype("int")

########################################
# define the placeholders
########################################
X = tf.placeholder(tf.float32, shape=(None, n_features), name="X")
y = tf.placeholder(tf.int64, shape=(None), name="y")
# Defaults to False, so evaluation runs do not need to feed it; the training
# loop feeds True explicitly. The same flag drives both dropout and batch
# normalization.
training = tf.placeholder_with_default(False, shape=(), name="training")

########################################
# build the network (using dropout and batch normalization)
########################################
with tf.name_scope("dnn"):
    # dropout is applied to the inputs as well as to each hidden layer's output
    X_drop = tf.layers.dropout(X, dropout_rate, training=training)

    # He initialization
    he_init = tf.contrib.layers.variance_scaling_initializer()

    # hidden layer 1: dense -> batch norm -> ELU -> dropout
    hidden1 = tf.layers.dense(X_drop, n_hidden1, kernel_initializer=he_init, name="hidden1")
    bn1 = tf.layers.batch_normalization(hidden1, training=training, momentum=batch_norm_momentum)
    bn1_act = tf.nn.elu(bn1)
    hidden1_drop = tf.layers.dropout(bn1_act, dropout_rate, training=training)

    # hidden layer 2: dense -> batch norm -> ELU -> dropout
    hidden2 = tf.layers.dense(hidden1_drop, n_hidden2, kernel_initializer=he_init, name="hidden2")
    bn2 = tf.layers.batch_normalization(hidden2, training=training, momentum=batch_norm_momentum)
    bn2_act = tf.nn.elu(bn2)
    hidden2_drop = tf.layers.dropout(bn2_act, dropout_rate, training=training)

    # output layer: dense -> batch norm; no activation here because the
    # softmax is applied inside the cross-entropy op of the loss
    logits_before_bn = tf.layers.dense(hidden2_drop, n_outputs, kernel_initializer=he_init, name="outputs")
    logits = tf.layers.batch_normalization(logits_before_bn, training=training, momentum=batch_norm_momentum)

########################################
# define the cost (loss) function
########################################
with tf.name_scope("loss"):
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    cost = tf.reduce_mean(cross_entropy, name="cost")

########################################
# train the model (use learning rate scheduling)
########################################
with tf.name_scope("train"):
    # non-trainable step counter; minimize() increments it on every training step
    global_step = tf.Variable(0, trainable=False, name="global_step")
    learning_rate = tf.train.exponential_decay(initial_learning_rate, global_step, decay_steps, decay_rate)
    optimizer = tf.train.AdamOptimizer(learning_rate)
    training_op = optimizer.minimize(cost, global_step=global_step)

########################################
# define the evaluation metrics
########################################
with tf.name_scope("eval"):
    # a prediction is correct when the true label has the highest logit
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

########################################
# execute the model
########################################
init = tf.global_variables_initializer()

# since we are using tf.layers.batch_normalization(), we need to explicitly run the extra update operations
# needed by batch normalization
extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

# number of mini-batches per epoch (loop invariant)
n_batches = mnist.train.num_examples // batch_size

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        for iteration in range(n_batches):
            X_batch, y_batch = mnist.train.next_batch(batch_size)
            sess.run([training_op, extra_update_ops], feed_dict={training: True, X: X_batch, y: y_batch})

        # NOTE: the train accuracy is measured on the last mini-batch of the
        # epoch only, so it is a noisy estimate; `training` keeps its default
        # (False) for both evaluations.
        acc_train = accuracy.eval(feed_dict={X: X_batch, y: y_batch})
        acc_test = accuracy.eval(feed_dict={X: X_test, y: y_test})
        print(epoch, "Train accuracy:", acc_train, "Test accuracy:", acc_test)