This course is a quick introduction to deep learning and two of its major network families, convolutional neural networks (CNNs) and recurrent neural networks (RNNs). The purpose is to give an intuitive sense of how to implement deep learning approaches for various tasks. If you are not running this as an iPython notebook, run the Python code of each cell in a separate file; the content shown below each cell is the output obtained by running it.
Simple perceptron¶
import numpy as np
# sigmoid function
def sigmoid(x,deriv=False):
    if(deriv==True):
        return x*(1-x)
    return 1/(1+np.exp(-x))
# input dataset
X = np.array([[0,0,1],
              [0,1,1],
              [1,0,1],
              [1,1,1]])
# output dataset
y = np.array([[0,0,1,1]]).T
# seed random numbers to make calculation
# deterministic (just a good practice)
np.random.seed(1)
# initialize weights randomly with mean 0
syn0 = 2*np.random.random((3,1)) - 1
for j in range(100000):
    # forward propagation
    l0 = X
    l1 = sigmoid(np.dot(l0,syn0))

    # how much did we miss?
    l1_error = y - l1

    if (j % 10000) == 0:
        print("Error:" + str(np.mean(np.abs(l1_error))))

    # multiply how much we missed by the
    # slope of the sigmoid at the values in l1
    l1_delta = l1_error * sigmoid(l1,True)

    # update weights
    syn0 += np.dot(l0.T,l1_delta)
print()
print("Prediction after Training:")
print(l1)
What is the loss function here? How is it calculated?
Any idea how it would perform on non-linearly separable data? How could we test it?
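One way to answer the second question is to keep the exact same training loop but swap in labels that are not linearly separable, for instance the XOR of the first two input columns. The sketch below reuses X and sigmoid from the cell above; y_xor is an illustrative name, not part of the original code.
# Hypothetical test: XOR of the first two input columns is not linearly separable
y_xor = np.array([[0,1,1,0]]).T

np.random.seed(1)
syn0 = 2*np.random.random((3,1)) - 1
for j in range(100000):
    l1 = sigmoid(np.dot(X, syn0))
    l1_delta = (y_xor - l1) * sigmoid(l1, True)
    syn0 += np.dot(X.T, l1_delta)

# A single layer cannot separate XOR, so the predictions stay stuck near 0.5
print(l1)
Since a single layer cannot capture XOR, this motivates adding a hidden layer, which is exactly what the next section does.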
Multilayer perceptron¶
Let’s use the fact that the sigmoid is differentiable (unlike the step function we saw in the slides). This allows us to add more layers (and hence more modelling power). Note that the derivative of the sigmoid at z is sigmoid(z)*(1-sigmoid(z)), so it can be computed directly from the activations themselves: this is why sigmoid(x, deriv=True) simply returns x*(1-x).
import numpy as np
def sigmoid(x,deriv=False):
    if(deriv==True):
        return x*(1-x)
    return 1/(1+np.exp(-x))
X = np.array([[0,0,1],
              [0,1,1],
              [1,0,1],
              [1,1,1]])
y = np.array([[0],
              [1],
              [1],
              [0]])
np.random.seed(1)
# randomly initialize our weights with mean 0
syn0 = 2*np.random.random((3,4)) - 1
syn1 = 2*np.random.random((4,1)) - 1
for j in range(100000):
    # Feed forward through layers 0, 1, and 2
    l0 = X
    l1 = sigmoid(np.dot(l0,syn0))
    l2 = sigmoid(np.dot(l1,syn1))

    # how much did we miss the target value?
    l2_error = y - l2

    if (j % 10000) == 0:
        print("Error:" + str(np.mean(np.abs(l2_error))))

    # in what direction is the target value?
    # were we really sure? if so, don't change too much.
    l2_delta = l2_error*sigmoid(l2,deriv=True)

    # how much did each l1 value contribute to the l2 error (according to the weights)?
    l1_error = l2_delta.dot(syn1.T)

    # in what direction is the target l1?
    # were we really sure? if so, don't change too much.
    l1_delta = l1_error * sigmoid(l1,deriv=True)

    syn1 += l1.T.dot(l2_delta)
    syn0 += l0.T.dot(l1_delta)
print()
print(l2)
Setting up the environment¶
We have worked through toy examples of feedforward networks. Things quickly become complicated, so let’s go deeper by relying on high-level frameworks: TensorFlow and Keras. Most technicalities are thus hidden, so that you can directly play with networks.
!conda install -y tensorflow keras
import tensorflow as tf
import keras
hello = tf.constant('Hello, TensorFlow!')
sess = tf.Session()
print(sess.run(hello))
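The cell above uses the TensorFlow 1.x API. If your environment ships TensorFlow 2.x, tf.Session no longer exists and tensors are evaluated eagerly; a minimal equivalent check would be the sketch below (the commented lines show the compatibility layer if you prefer to keep the session style).
import tensorflow as tf

hello = tf.constant('Hello, TensorFlow!')
# TF 2.x evaluates eagerly, so the tensor can be inspected directly
print(hello.numpy())
# Or keep the 1.x session style through the compatibility module:
# tf.compat.v1.disable_eager_execution()
# sess = tf.compat.v1.Session()
# print(sess.run(hello))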
CNNs¶
We are going to use the MNIST dataset for our first task. The code below loads the dataset and shows one training example and its label.
from __future__ import print_function
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras import backend as K
import numpy as np
from pylab import figure, imshow
# the data, split between train and test sets
(x_train, y_train), (x_test, y_test) = mnist.load_data()
print("The first training instance is labeled as: "+str(y_train[0]))
figure(1)
imshow(x_train[0], interpolation='nearest')
Now study the following code. Which network are we using? How many layers does it have? What are the hyperparameters?
# Setup some hyper parameters
batch_size = 128
num_classes = 10
epochs = 15
# input image dimensions
img_rows, img_cols = 28, 28
# This is some technicality regarding Keras' dataset
if K.image_data_format() == 'channels_first':
    x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols)
    x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols)
    input_shape = (1, img_rows, img_cols)
else:
    x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1)
    x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1)
    input_shape = (img_rows, img_cols, 1)
# We convert the matrices to floats as we will use real numbers,
# and keep only the first 1000 training / 200 test images to speed things up
x_train = x_train.astype('float32')[:1000]
x_test = x_test.astype('float32')[:200]
x_train /= 255
x_test /= 255
print('x_train shape:', x_train.shape)
print(x_train.shape[0], 'train samples')
print(x_test.shape[0], 'test samples')
# convert class vectors to binary class matrices
y_train = keras.utils.to_categorical(y_train, num_classes)[:1000]
y_test = keras.utils.to_categorical(y_test, num_classes)[:200]
# Build network
model = Sequential()
model.add(Conv2D(32, kernel_size=(3, 3),
                 activation='relu',
                 input_shape=input_shape))
model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
# model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
# model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=keras.optimizers.Adam(),
              metrics=['accuracy'])
# Train
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_data=(x_test, y_test))
# Evaluate on test data
score = model.evaluate(x_test, y_test, verbose=0)
print()
print('Test loss:', score[0])
print('Test accuracy:', score[1])
# Evaluate on training data
score = model.evaluate(x_train, y_train, verbose=0)
print()
print('Train loss:', score[0])
print('Train accuracy:', score[1])
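To answer the questions about the architecture, Keras can print a per-layer description of the model defined above:
# Prints one line per layer with its output shape and number of parameters
model.summary()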
Is there anything wrong here?
How do you think a linear classifier performs?
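If you want to check the linear-classifier question empirically, here is a minimal baseline sketch: a single softmax layer over the flattened pixels (multinomial logistic regression), reusing the data prepared above. The name linear_model is purely illustrative.
# Hypothetical linear baseline: softmax regression on the raw pixels
linear_model = Sequential()
linear_model.add(Flatten(input_shape=input_shape))
linear_model.add(Dense(num_classes, activation='softmax'))
linear_model.compile(loss=keras.losses.categorical_crossentropy,
                     optimizer=keras.optimizers.Adam(),
                     metrics=['accuracy'])
linear_model.fit(x_train, y_train,
                 batch_size=batch_size,
                 epochs=epochs,
                 verbose=0,
                 validation_data=(x_test, y_test))
print(linear_model.evaluate(x_test, y_test, verbose=0))
The next cell re-runs the same convolutional network, this time with the Dropout layers enabled.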
# Setup some hyper parameters
batch_size = 128
num_classes = 10
epochs = 15
# input image dimensions
img_rows, img_cols = 28, 28
# Reload the raw data: the previous cell already reshaped and rescaled
# x_train/x_test, so we start again from the original images
(x_train, y_train), (x_test, y_test) = mnist.load_data()
# This is some technicality regarding Keras' dataset
if K.image_data_format() == 'channels_first':
    x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols)
    x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols)
    input_shape = (1, img_rows, img_cols)
else:
    x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1)
    x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1)
    input_shape = (img_rows, img_cols, 1)
# We convert the matrices to floats as we will use real numbers
x_train = x_train.astype('float32')[:1000]
x_test = x_test.astype('float32')[:200]
x_train /= 255
x_test /= 255
print('x_train shape:', x_train.shape)
print(x_train.shape[0], 'train samples')
print(x_test.shape[0], 'test samples')
# convert class vectors to binary class matrices
y_train = keras.utils.to_categorical(y_train, num_classes)[:1000]
y_test = keras.utils.to_categorical(y_test, num_classes)[:200]
# Build network
model = Sequential()
model.add(Conv2D(32, kernel_size=(3, 3),
                 activation='relu',
                 input_shape=input_shape))
model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=keras.optimizers.Adam(),
              metrics=['accuracy'])
# Train
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_data=(x_test, y_test))
# Evaluate on test data
score = model.evaluate(x_test, y_test, verbose=0)
print()
print('Test loss:', score[0])
print('Test accuracy:', score[1])
# Evaluate on training data
score = model.evaluate(x_train, y_train, verbose=0)
print()
print('Train loss:', score[0])
print('Train accuracy:', score[1])
Let’s use this model to predict a value for the first training instance we visualized.
print(model.predict(np.expand_dims(x_train[0], axis=0)))
Is the model correct here? What is the output of the network?
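The output is a vector of ten softmax probabilities, one per digit class. A quick way to turn it into a class decision and compare it with the true label is sketched below, using NumPy's argmax on the model defined above.
pred = model.predict(np.expand_dims(x_train[0], axis=0))
print("Predicted class:", np.argmax(pred))       # most probable digit
print("True class:", np.argmax(y_train[0]))      # y_train is one-hot encoded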
RNNs¶
We will now switch to RNNs. These require more resources, so we can’t do the fanciest applications during the workshop. We will do some sentiment classification of movie reviews.
from __future__ import print_function
import numpy as np
import keras
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional
from keras.datasets import imdb
# Number of considered words, based on frequencies
max_features = 20000
# cut texts after this number of words
maxlen = 100
batch_size = 32
print('Loading data...')
(x_train, y_train), (x_test, y_test) = keras.datasets.imdb.load_data(num_words=max_features, index_from=3)
# This is just for pretty printing the sentences...
word_to_id = keras.datasets.imdb.get_word_index()
word_to_id = {k:(v+3) for k,v in word_to_id.items()}
word_to_id["<PAD>"] = 0
word_to_id["<START>"] = 1
word_to_id["<UNK>"] = 2
id_to_word = {value:key for key,value in word_to_id.items()}
print("Here's the input for the first training instance:")
print(' '.join(id_to_word[id] for id in x_train[0] ))
What do you think about this text? Is it a positive or negative review?
print("Here are the dataset shapes")
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')
print("And the input for the first instance is represented as:")
print(x_train[0])
What do these numbers represent? Is there any limitation you can imagine coming from this?
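As a hint for the first question: each integer is the index of a word in a frequency-ordered vocabulary, and words outside the max_features most frequent ones are replaced by the <UNK> token. A small sketch using the mappings built above:
# Frequent words get small indices; a review is just a sequence of such indices
print(word_to_id["the"])
print([id_to_word[i] for i in x_train[0][:10]])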
print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)[:5000]
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)[:5000]
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)
y_train = np.array(y_train)[:5000]
y_test = np.array(y_test)[:5000]
model = Sequential()
model.add(Embedding(max_features, 128, input_length=maxlen))
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))
model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])
print('Train...')
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=4,
          validation_data=[x_test, y_test])
print("The neural net predicts that the first instance sentiment is:")
print(model.predict(np.expand_dims(x_train[0], axis=0)))
Remarks? Comments?
How do the training scores compare to the test scores? How can we improve this? What are the current limitations?
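One way to answer the first question is to evaluate the trained model on both splits; a minimal sketch reusing the model and data defined above (with metrics=['accuracy'], model.evaluate returns the loss followed by the accuracy):
train_loss, train_acc = model.evaluate(x_train, y_train, batch_size=batch_size, verbose=0)
test_loss, test_acc = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=0)
print("Train accuracy:", train_acc)
print("Test accuracy:", test_acc)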
This RNN use case takes more time to train, but it is definitely more impressive. We will model language by training on a book: for each sequence of characters in the text, the objective is to predict the following character. This can be done on any text, and we don’t need annotated data – the text itself is enough.
Have a look at the following piece of code and try to understand what it does. Then run it and watch the network generate text! At first the output is not meaningful, but it becomes so over time. This is the magic I was referring to.
Beware: this will take much longer to run on a CPU; a GPU is recommended, but you can still run it for a while to see the predictions evolve. On my laptop an epoch takes about 6 minutes, so the full 60-epoch training takes around 6 hours. About 20 epochs are required before the generated text becomes somewhat meaningful.
Note, however, that although this seems long, training actual deep learning models for concrete tasks takes days, even on multiple GPUs. This is mostly because of the data size and the much deeper networks.
from __future__ import print_function
from keras.callbacks import LambdaCallback
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import numpy as np
import random
import sys
import io
# We load a text from Nietzsche
path = get_file('nietzsche.txt', origin='https://s3.amazonaws.com/text-datasets/nietzsche.txt')
with io.open(path, encoding='utf-8') as f:
    text = f.read().lower()
print('corpus length:', len(text))
# We create dictionaries of character > index and the other way around
chars = sorted(list(set(text)))
print('total chars:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))
# cut the text in semi-redundant sequences of maxlen characters
maxlen = 40
step = 3
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
print('nb sequences:', len(sentences))
print('Vectorization...')
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1
# build the model: a single LSTM
print('Build model...')
model = Sequential()
model.add(LSTM(128, input_shape=(maxlen, len(chars))))
model.add(Dense(len(chars)))
model.add(Activation('softmax'))
optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)
def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)
def on_epoch_end(epoch, logs):
    # Function invoked at end of each epoch. Prints generated text.
    print()
    print('----- Generating text after Epoch: %d' % epoch)

    start_index = random.randint(0, len(text) - maxlen - 1)
    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print('----- diversity:', diversity)

        generated = ''
        sentence = text[start_index: start_index + maxlen]
        generated += sentence
        print('----- Generating with seed: "' + sentence + '"')
        sys.stdout.write(generated)

        for i in range(400):
            x_pred = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(sentence):
                x_pred[0, t, char_indices[char]] = 1.

            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_char = indices_char[next_index]

            generated += next_char
            sentence = sentence[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)
model.fit(x, y,
          batch_size=128,
          epochs=60,
          callbacks=[print_callback])
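Once training has run for a while (you can lower epochs if you only want a taste), the weights can be kept around and the generation routine triggered by hand. A small sketch; the filename is just an example, not part of the original code.
# Save the trained character-level model for later reuse (illustrative filename)
model.save('char_lstm_nietzsche.h5')
# Generate one more batch of text without waiting for another epoch
on_epoch_end(epoch=60, logs={})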