## Stat Learning and Data Mining
## Convolutional Neural Networks in R
## by Mengli Xiao

# Uncomment to install the package "keras"
# install.packages("keras")

set.seed(7475)
library(keras)

# Uncomment this if this is your first installation of keras. It provides
# default CPU-based installations of Keras and TensorFlow
# install_keras()
use_backend("tensorflow")

batch_size <- 128
num_classes <- 10
epochs <- 100

# Load the data ----------------------------------------------------------
# Shuffle and split between train and test sets
mnist <- dataset_mnist()
x_train <- mnist$train$x
y_train <- mnist$train$y
x_test <- mnist$test$x
y_test <- mnist$test$y

img_rows <- 28
img_cols <- 28

str(x_train)
# Add a trailing channel dimension: 28 x 28 images with one grayscale channel
x_train <- array_reshape(x_train, c(nrow(x_train), img_rows, img_cols, 1))
str(x_train)
x_test <- array_reshape(x_test, c(nrow(x_test), img_rows, img_cols, 1))
input_shape <- c(img_rows, img_cols, 1)

# Scale the grayscale pixel values from [0, 255] to [0, 1] (optional)
x_train <- x_train / 255
x_test <- x_test / 255

str(y_train)
# Convert class vectors to binary class matrices (one-hot encoding)
y_train <- to_categorical(y_train, num_classes)
y_test <- to_categorical(y_test, num_classes)

# Define the model structure ----------------------------------------------
model <- keras_model_sequential() %>%
  # The first convolution layer with 32 filters/kernels; each kernel is a
  # 3 x 3 matrix.
  # The default padding is no padding ("valid"); set padding = "same" to keep
  # the output the same size as the input.
  # The default stride is 1; change it by setting strides = an integer.
  layer_conv_2d(filters = 32, kernel_size = c(3, 3), activation = 'relu',
                input_shape = input_shape) %>%
  # The second convolution layer, again with 32 filters of size 3 x 3
  layer_conv_2d(filters = 32, kernel_size = c(3, 3), activation = 'relu') %>%
  # Max pooling downsamples each feature map, keeping only the strongest
  # activation in each 2 x 2 window
  layer_max_pooling_2d(pool_size = c(2, 2)) %>%
  layer_flatten() %>%
  # Fully-connected layer
  layer_dense(units = 128, activation = 'relu') %>%
  # Regularization layer. During training, a randomly selected 50% of the
  # neurons are dropped in each batch, yielding sampled subnetworks; the
  # gradients of the dropped neurons are set to 0, so each neuron's update
  # averages over the sampled subnetworks.
  # The dropout rate (0.5 here) and the dropout layer's location are also
  # tuning parameters (e.g., expose the rate as a tfruns flag; a sketch
  # appears at the end of the script).
  layer_dropout(rate = 0.5) %>%
  # Softmax is the final non-linearity; it outputs a probability for each of
  # the ten classes (digits 0-9)
  layer_dense(units = num_classes, activation = 'softmax')

# Compile the model --------------------------------------------------------
# Keras ships the most common optimizers, and the choice of optimizer is tunable.
# The loss function is also a tuning parameter.
# The metrics argument accepts any customized R function that takes tensors
# as input; a short sketch follows.
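# A minimal sketch (an aside, not used in the model run below): a custom
# metric is an R function of (y_true, y_pred) tensors wrapped with
# custom_metric(). The name "top2_accuracy" is hypothetical, and the body
# simply reuses a built-in top-k helper.
top2_acc <- custom_metric("top2_accuracy", function(y_true, y_pred) {
  metric_top_k_categorical_accuracy(y_true, y_pred, k = 2)
})
# To record it, pass metrics = c('accuracy', top2_acc) in compile() below.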
# The default is to record both accuracy and loss
model %>% compile(
  loss = loss_categorical_crossentropy,
  optimizer = 'sgd',
  metrics = c('accuracy')
)

# Stop training when the validation accuracy stops improving for 2 epochs
early_stopping <- callback_early_stopping(monitor = 'val_acc', patience = 2)

m1.hist <- model %>% fit(
  x_train, y_train,
  batch_size = batch_size,
  epochs = epochs,
  # Hold out 20% of the training data as validation data; the split is the
  # same for every epoch
  validation_split = 0.2,
  callbacks = c(early_stopping)  # delete this to run all 100 epochs
)

# Plot the accuracy and loss of the training and validation sets
plot(m1.hist)

# Store the result
m1result <- as.data.frame(m1.hist)

# Get the metrics of the test performance.
# verbose controls how progress is displayed (0 = silent, no progress bar)
test_metric <- model %>% evaluate(
  x_test, y_test, verbose = 0
)

# Loss
test_metric[1]
# $loss
# [1] 0.0947

# Accuracy
test_metric[2]
# $acc
# [1] 0.9716

# Find which epoch the training stopped at
max(m1result$epoch[!is.na(m1result$value)])
# [1] 20
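# Inspect test-set predictions (sketch) ------------------------------------
# A minimal sketch, not part of the original analysis: predict() returns an
# n x 10 matrix of class probabilities, from which a confusion matrix can be
# tabulated against the raw (non-one-hot) test labels.
pred_probs <- model %>% predict(x_test)
pred_class <- apply(pred_probs, 1, which.max) - 1  # map back to digit labels 0-9
table(predicted = pred_class, actual = mnist$test$y)

# Tuning the dropout rate with tfruns (sketch) ------------------------------
# A minimal sketch, assuming the tfruns package is installed; the file name
# "mnist_cnn.R" and the flag name "dropout1" are hypothetical. Declare the
# flag at the top of the script:
#   FLAGS <- flags(flag_numeric("dropout1", 0.5))
# replace layer_dropout(rate = 0.5) with layer_dropout(rate = FLAGS$dropout1),
# then launch a small grid of runs:
#   library(tfruns)
#   tuning_run("mnist_cnn.R", flags = list(dropout1 = c(0.25, 0.5)))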