At Cmotions we love to learn from each other, which is why we regularly collaborate on internal projects in which we can express our creativity, curiosity and eagerness to learn. In this article we want to share what we did in the ‘Snake’ project, where we taught the computer to play Snake using Reinforcement Learning. Never heard of Reinforcement Learning before? Not to worry, we wrote an article explaining this and more! When reading this article, we assume you've either read that previous article or already have experience with Reinforcement Learning, so we won't explain the basics here. Here we focus on the code we have written to teach our computer to play Snake (remember those good old days with your Nokia mobile phone?).
Small reminder: how to play Snake¶
The goal is simple: as a snake, you navigate through a square playground looking for food. As you eat more food, the difficulty increases, because your snake grows longer and you cannot crash into yourself. Snake is a good choice for a first introduction to reinforcement learning, because it lets us define the environment, agent, states, actions and rewards relatively simply.
Our game environment can be defined as a grid of points, with a piece of food at a random coordinate in that grid. In our version of Snake there are no walls, which means that the only way the snake can die is by colliding with its own body. Our agent - the snake - is encoded by a list of coordinates that are covered by the snake, and the state is the current representation of the environment. For this article, we have experimented with different representations of the state, incorporating different sources of information such as the direction of the snake and the distance to the food. Additionally, we have experimented with adding spatial features extracted by convolutional filters. Convolutional filters extract spatial information from an image by multiplying regions of pixels in an input with weights that are learned by a neural network. This results in a feature map that encodes spatial information about the playground.
As the snake navigates through the environment, it can move up, down, left or right; these are the actions the agent can take. However, according to the rules of the game, the snake cannot move in the direction opposite to its current direction. Therefore, given the state, we can only take three out of four actions. This subset of actions is what we call the action space. Our goal is, given a certain state, to choose the action that maximizes the future reward, which can consist of multiple components. First, our snake is rewarded when it eats food and the score increases. Conversely, the snake is negatively rewarded when it collides with its own body. Additionally, we included a small penalty when the snake moves further away from the food, and a small positive reward when it moves closer to the food. This encourages the snake to actually pursue eating food instead of only avoiding a collision.
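To make this concrete, here is a minimal sketch of such a reward function; the helper manhattan and the function reward_for_step are purely illustrative and not part of our final implementation, which follows later in this article.
%%script false --no-raise-error
def manhattan(a, b):
    # illustrative helper: Manhattan distance between two grid points
    return abs(a[0] - b[0]) + abs(a[1] - b[1])

def reward_for_step(old_head, new_head, food, collided):
    # sketch of the reward scheme described above, not our final implementation
    if collided:
        return -1      # colliding with the body ends the game
    if new_head == food:
        return 1       # eating food is the main objective
    if manhattan(new_head, food) < manhattan(old_head, food):
        return 0.001   # small nudge for moving closer to the food
    return -0.001      # small penalty for moving away from the food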
Now that we know what the actions, rewards and states in our situation are, it’s time to have a look at how we are going to incorporate this in a model that can be trained.
The models we implemented¶
Since we were interested in different setups for Reinforcement Learning, and how those would influence both performance and efficiency, we decided to train two different versions of agents to play Snake: the first model a bit simpler, the second a more advanced version.
We will describe both models very briefly here, but please read our previous article for more in-depth information on the models.
Model V1
The features used in the first model:
- X-coordinate of the food minus the x-coordinate of the snake's head.
- Y-coordinate of the food minus the y-coordinate of the snake's head.
- Dummy variable: whether there is a snake cell below the head
- Dummy variable: whether there is a snake cell above the head
- Dummy variable: whether there is a snake cell left of the head
- Dummy variable: whether there is a snake cell right of the head
For this first version of the model we use fairly simple input features: which direction the food is in, and whether the snake's body is near its head. We do not want to overcomplicate our model, so we choose to build a neural network consisting of two layers with 16 neurons each, ending with a dense layer that forecasts the reward for each action. This neural network will then be used as our Q function. For this first version of the model we find that after around 500.000 training iterations, the rewards no longer improve. This takes around 30 to 60 minutes of training, depending on the processor speed.
Model V2
The second model builds on the first and mainly improves on it by using a convolutional neural network to understand the full picture of the game, so that it can make strategic choices and the snake will not trap itself. We do this by creating a combined network: the input features from the first model, which give very precise information about dangers and directions, are combined with a snapshot image of the game that gives the snake more strategic insight into the state of the game. This snapshot image is the current state of the game represented as an array; each point in the array contains a tuple of length three that indicates whether there is a snake body, snake head or food at that point.
Once we have our array containing the snake's game snapshot, we pass it through two consecutive convolutional layers, each containing 8 filters with a kernel of size=(4, 4) and strides=2, where the padding is 'same' to make sure that we also account for the border pixels in our model. After the input has gone through both convolutional layers, we will have extracted signals about the game. Flattening this array of signals allows us to use them in a normal dense layer. So, after flattening the array, we use two dense layers of 256 and 64 neurons to further extract signals about the state of the game. For the normal features we use one dense layer of 16 neurons to extract signals about immediate threats and the position of the food. After that, we concatenate the normal signals and our convolutional signals so we can use them simultaneously in a dense layer, which is finally used to predict the Q-values for each action.
For this second version of the model, we find that it converges after approximately 5 million training iterations, which takes around 4-7 hours of training, depending on the processor speed.
Snake Game code¶
Now it's time to get our hands dirty and define our Snake game in Python. We start with the basics: defining the environment and the variables we need for that definition. A bit further in the script we explain the two different models we define and train. But first things first: before we start, we load all the necessary libraries and define a dictionary that can be used to translate the directions of the snake into action numbers. Furthermore, we create Point, a named tuple that we will later use to identify the food and the head of the snake. Finally, we define the block size as 20, which is the size of a pixel in our Snake game. These variables will then be used to build our Snake game.
%%script false --no-raise-error
import numpy as np
import tensorflow as tf
import keras
from keras.layers import Conv2D, Dense, Flatten, ReLU, MaxPool2D, concatenate
from keras.optimizers import Adam
import time
import random
import pygame
from collections import namedtuple
direction_dict = {'Right': 0, 'Left': 1, 'Up': 2, 'Down': 3}
Point = namedtuple('Point', 'x, y')
BLOCK_SIZE = 20
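A quick illustrative check of these helpers (the values shown assume the definitions above):
%%script false --no-raise-error
head = Point(320, 240)         # a grid coordinate in pixels
print(head.x, head.y)          # 320 240
print(direction_dict['Up'])    # 2: the action number for moving up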
Game Class¶
Below, we create a class SnakeGameAI in which we encode the features of our game. First, we initialize the class by defining the width and height of the Snake game, given that each block (pixel) of the game has a size of 20 (the block size). Next we set our variable game_over to False, which can later be used in other functions. Finally, we call our reset function, which is given in the next code chunk.
%%script false --no-raise-error
class SnakeGameAI:
def __init__(self, w=640, h=480):
self.w = w
self.h = h
self.game_over = False
self.reset()
The reset() method will be used to reset the game (no surprise here), which means we set the direction and location of the snake, which starts exactly in the middle of our map, and we place the food using the _place_food() method. Moreover, the frame_iteration attribute, which counts how many iterations the current game has had, is set to zero. The body of the snake is defined by a list of points containing the head and the rest of the body of the snake.
%%script false --no-raise-error
def reset(self):
# init game state
self.direction = 0 # we start moving to the right
self.head = Point(self.w / 2, self.h / 2) # our snake starts in the middle of the field (width/2 and height/2)
        # the snake's body begins left of the head
self.snake = [self.head,
Point(self.head.x - BLOCK_SIZE, self.head.y),
Point(self.head.x - (2 * BLOCK_SIZE), self.head.y)]
self.score = 0
self.food = None
self._place_food()
self.frame_iteration = 0
self.game_over = False
In the _place_food() method we choose a random value for x and y based on the total width and height of the map, and define the food as this point on the map. If the food lands within the snake, we place new food, so the food is never placed inside the snake's body.
%%script false --no-raise-error
def _place_food(self):
x = random.randint(0, (self.w - BLOCK_SIZE) // BLOCK_SIZE) * BLOCK_SIZE
y = random.randint(0, (self.h - BLOCK_SIZE) // BLOCK_SIZE) * BLOCK_SIZE
self.food = Point(x, y)
if self.food in self.snake:
self._place_food()
Now we can define the play_step() method, in which we let the snake play one iteration given an action. We start by increasing frame_iteration by one, so that we keep track of the number of iterations in the game. Next we call the _move() method with the action for the snake that determines the direction. Then we use the is_collision() function to determine whether the snake died or not. Finally, we return three values: the reward, a boolean done to indicate whether the snake died, and self.score, which is the current game's score.
%%script false --no-raise-error
def play_step(self, action):
self.frame_iteration += 1
# Move
reward = self._move(action) # update the head
# Check if game over
if self.is_collision():
done = True
reward = -1
self.reset()
else:
done = False
# Return game over and score
return reward, done, self.score
The is_collision() function checks whether the head of the snake is within the body of the snake, or in other words, whether the head is in the list of all the snake's points, where we skip the first index as this is the snake's head itself.
%%script false --no-raise-error
def is_collision(self):
    # there are no walls, so the snake can only die by hitting its own body
    if self.head in self.snake[1:]:
        self.game_over = True
        return True
    else:
        return False
Now we describe the _move() function. First we set our reward variable to zero and set the direction to the chosen action. Next, we calculate the current absolute (Manhattan) distance to the food based on the position of the snake's head. Based on the given direction (the chosen action) we calculate the new x and y coordinates of the snake's head, making sure that if the snake moves out of the bounds of the game it re-enters on the other side of the map. As we have just obtained the new location of the snake, we insert the new position at the front of the snake's body. To help our model converge more smoothly, we check whether the new location of the snake is closer to the food than the previously calculated distance_to_food_. If the snake moved closer to or further away from the food, we change the reward to 0.001 or -0.001 respectively. Next, we check if the snake has eaten the food, which changes the reward to 1 and places new food. If the snake did not eat the food, we remove the last part of the snake's body, as it did not grow but only moved. Finally, we return the reward obtained by the given action.
%%script false --no-raise-error
def _move(self, action):
reward = 0
self.direction = action
distance_to_food_x = abs(self.food.x - self.head.x)
distance_to_food_y = abs(self.food.y - self.head.y)
distance_to_food_ = distance_to_food_x + distance_to_food_y
x = self.head.x
y = self.head.y
if self.direction == 0:
x += BLOCK_SIZE
elif self.direction == 1:
x -= BLOCK_SIZE
elif self.direction == 2:
y += BLOCK_SIZE
elif self.direction == 3:
y -= BLOCK_SIZE
# cross the border of the map and enter on the other side
    # cross the border of the map and enter on the other side
    # (>= and w - BLOCK_SIZE keep the head on the grid; with > and w the head
    # could sit one block outside the map for a single step)
    if x >= self.w:
        x = 0
    if x < 0:
        x = self.w - BLOCK_SIZE
    if y >= self.h:
        y = 0
    if y < 0:
        y = self.h - BLOCK_SIZE
# moving head of snake
self.head = Point(x, y)
self.snake.insert(0, self.head)
# reward for going in the right direction
distance_to_food_x = abs(self.food.x - self.head.x)
distance_to_food_y = abs(self.food.y - self.head.y)
distance_to_food = distance_to_food_x + distance_to_food_y
if (distance_to_food_ - distance_to_food) > 0:
reward = 0.001
else:
reward = -0.001
if self.head == self.food:
self.score += 1
reward = 1
self._place_food()
else:
self.snake.pop()
return reward
Next, we define the get_action_space() function, which returns all possible actions. As the snake is not able to make a 180-degree turn, we have to return the possible actions given the current direction of the snake. Using the direction dictionary we define the possible choices as a list of all directions minus the opposite of the current direction. Then, using the direction dictionary again, we transform the choices into the action space, where the full action space [0, 1, 2, 3] corresponds to the choices [Right, Left, Up, Down].
%%script false --no-raise-error
def get_action_space(self):
choices = ['Right', 'Left', 'Up', 'Down']
# if direction right, can't go left
if self.direction == direction_dict['Right']:
choices = [x for x in choices if x != 'Left']
# if direction left, can't go right
if self.direction == direction_dict['Left']:
choices = [x for x in choices if x != 'Right']
#if direction up, can't go down
if self.direction == direction_dict['Up']:
choices = [x for x in choices if x != 'Down']
#if direction down, can't go up
if self.direction == direction_dict['Down']:
choices = [x for x in choices if x != 'Up']
action_space = [direction_dict[x] for x in choices]
return action_space
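As a quick illustration (assuming the methods of SnakeGameAI shown in this article have been assembled into one class): right after a reset the snake moves to the right, so 'Left' is excluded from the action space.
%%script false --no-raise-error
game = SnakeGameAI()
print(game.get_action_space())  # [0, 2, 3]: Right, Up and Down, but not Left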
In order to use the current state of the game we define the function get_conv_state(), which takes a snapshot of the game's current state by creating an array representing all blocks of the map, indicating for each block whether it contains a snake cell, the snake's head or food. This is the snapshot image we described for the V2 model above.
%%script false --no-raise-error
def get_conv_state(self):
    # one channel per entity: 0 = snake body, 1 = snake head, 2 = food
    state = np.zeros((int(self.w / BLOCK_SIZE), int(self.h / BLOCK_SIZE), 3))
    # the -1 offset shifts every index by one cell; thanks to Python's negative
    # indexing this is a consistent relabeling on a map that wraps around anyway
    for snake_cell in self.snake:
        state[int(snake_cell.x / BLOCK_SIZE) - 1, int(snake_cell.y / BLOCK_SIZE) - 1, 0] = 1
    state[int(self.head.x / BLOCK_SIZE) - 1, int(self.head.y / BLOCK_SIZE) - 1, 1] = 1
    state[int(self.food.x / BLOCK_SIZE) - 1, int(self.food.y / BLOCK_SIZE) - 1, 2] = 1
    return state
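A small sanity check of this snapshot, again assuming the fully assembled class; the default game of 640 by 480 pixels gives a 32 by 24 grid.
%%script false --no-raise-error
game = SnakeGameAI()
state = game.get_conv_state()
print(state.shape)               # (32, 24, 3)
print(int(state[..., 0].sum()))  # 3: the snake starts with three cells (head included)
print(int(state[..., 2].sum()))  # 1: there is exactly one piece of food on the grid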
For the first version of our model we define get_state() as the method that creates the input features (state) that will be used as the input for our model. The input features are stored in an array containing the difference between the location of the food and the snake's head for both the x and y axis, and dummy variables for whether there is a snake's body below, above, left or right of its head.
def get_state(self):
# state = [difference in location food and head x axis,
# difference in location food and head y axis,
# dummy if there is snake cell below head
# dummy if there is snake cell above head
# dummy if there is snake cell left of head
# dummy if there is snake cell right of head]
state = [int(self.food.x / BLOCK_SIZE) - int(self.head.x / BLOCK_SIZE),
int(self.food.y / BLOCK_SIZE) - int(self.head.y / BLOCK_SIZE),
int(any([(snake_cell.y == self.head.y - BLOCK_SIZE) for snake_cell in self.snake if
snake_cell.x == self.head.x])),
int(any([(snake_cell.y == self.head.y + BLOCK_SIZE) for snake_cell in self.snake if
snake_cell.x == self.head.x])),
int(any([(snake_cell.x == self.head.x - BLOCK_SIZE) for snake_cell in self.snake if
snake_cell.y == self.head.y])),
int(any([(snake_cell.x == self.head.x + BLOCK_SIZE) for snake_cell in self.snake if
snake_cell.y == self.head.y]))]
return np.array(state)
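For illustration, at the start of a game the first two entries depend on where the food happened to be placed, but the 'left of head' dummy is always 1, because the snake starts with its body directly left of its head.
%%script false --no-raise-error
game = SnakeGameAI()
s = game.get_state()
print(s.shape)  # (6,)
print(s[4])     # 1: a snake cell sits directly left of the head at the start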
We start training our model by exploring actions, i.e. choosing random actions, so the model gathers lots of data about suboptimal choices. To help the model converge quicker, we also want to show our model what good choices might look like, which is why we created the get_example_action() function. To do so, we check in which direction the food is placed, and whether a snake's body cell is blocking that direction. If there is an action that brings the snake closer to the food without dying, we choose that action; otherwise we take a random action from the current action space.
%%script false --no-raise-error
def get_example_action(self):
actions = []
state = self.get_state()
# if food is on the right and no snake cell on right
if state[0] > 0 and state[5] == 0:
actions.append(0)
# if food is on the left and no snake cell on left
elif state[0] < 0 and state[4] == 0:
actions.append(1)
# if food is up and no snake cell up
if state[1] > 0 and state[3] == 0:
actions.append(2)
# if food is down and no snake cell down
elif state[1] < 0 and state[2] == 0:
actions.append(3)
if len(actions) == 0:
actions = self.get_action_space()
return random.choice(actions)
Replay Buffer¶
In order to store and sample the observations from the states and rewards gained by playing the Snake game, we create a ReplayBuffer class. Note that we already incorporate the convolutional state memory, which we will later use for the v2 version. First we create zero arrays of the appropriate size given the memory size and the dimensions of the observations. As the training of the model needs the current state, new state, action, reward and the boolean that indicates whether the game is done, we create such an array for each of these.
To store observations we create the store_transition function, which stores all the needed observations about an action in our arrays. Furthermore, we create the sample_buffer function, which draws a random batch of observations from our memory that can be used to train our model.
class ReplayBuffer(object):
    # input_dims_conv gets a default value, so the v1 agent can create a
    # buffer without specifying a convolutional state
    def __init__(self, max_size, input_dims, input_dims_conv=(1,)):
        self.mem_size = max_size
        self.mem_cntr = 0
        self.conv_state_memory = np.zeros((self.mem_size, *input_dims_conv),
                                          dtype=np.float32)
        self.state_memory = np.zeros((self.mem_size, input_dims),
                                     dtype=np.float32)
        self.new_conv_state_memory = np.zeros((self.mem_size, *input_dims_conv),
                                              dtype=np.float32)
        self.new_state_memory = np.zeros((self.mem_size, input_dims),
                                         dtype=np.float32)
        self.action_memory = np.zeros(self.mem_size, dtype=np.int32)
        self.reward_memory = np.zeros(self.mem_size, dtype=np.float32)
        self.terminal_memory = np.zeros(self.mem_size, dtype=bool)
    def store_transition(self, state, action, reward, state_, done,
                         conv_state=None, conv_state_=None):
        index = self.mem_cntr % self.mem_size  # if the maximum memory size is exceeded, the snake starts to forget the oldest memories
        self.state_memory[index] = state
        self.new_state_memory[index] = state_
        if conv_state is not None:  # only the v2 model stores a convolutional state
            self.conv_state_memory[index] = conv_state
            self.new_conv_state_memory[index] = conv_state_
        self.action_memory[index] = action
        self.reward_memory[index] = reward
        self.terminal_memory[index] = done
        self.mem_cntr += 1
def sample_buffer(self, batch_size):
max_mem = min(self.mem_cntr, self.mem_size)
batch = np.random.choice(max_mem, batch_size, replace=False)
states = self.state_memory[batch]
conv_states = self.conv_state_memory[batch]
actions = self.action_memory[batch]
rewards = self.reward_memory[batch]
states_ = self.new_state_memory[batch]
conv_states_ = self.new_conv_state_memory[batch]
terminal = self.terminal_memory[batch]
return conv_states, states, actions, rewards, conv_states_, states_, terminal
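A short illustrative round trip through the buffer, using the v1 settings (six input features, no convolutional state) and the adjusted signature above:
%%script false --no-raise-error
buffer = ReplayBuffer(max_size=1000, input_dims=6)
for _ in range(64):
    s = np.random.randn(6).astype(np.float32)  # a dummy six-feature state
    buffer.store_transition(state=s, action=0, reward=0.0, state_=s, done=False)
conv_states, states, actions, rewards, conv_states_, states_, dones = buffer.sample_buffer(32)
print(states.shape)   # (32, 6)
print(actions.shape)  # (32,)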
The Agent¶
Here we define the agent by creating the class Agent, which describes our model parameters, contains a replay buffer and holds the models q_eval and q_next based on the TensorFlow model class that we define later. Note that we have a different replay buffer and different q_eval and q_next for each model version.
class Agent:
    # input_dims_conv is an optional keyword argument, as only the v2 model
    # uses a convolutional state
    def __init__(self, gamma, epsilon, lr, n_actions, input_dims,
                 mem_size, batch_size, eps_min=0.01, eps_dec=5e-7,
                 replace=1000, model_version='v1', input_dims_conv=None):
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.input_dims_conv = input_dims_conv
        self.batch_size = batch_size
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.replace_target_cnt = replace
        self.action_space = [i for i in range(n_actions)]
        self.learn_step_counter = 0
        self.model_version = model_version
        if self.model_version == 'v1':
            self.memory = ReplayBuffer(mem_size, input_dims)
            self.q_eval = DeepQNetwork(input_dims, n_actions)
            self.q_eval.compile(optimizer=Adam(learning_rate=lr))
            self.q_next = DeepQNetwork(input_dims, n_actions)
            self.q_next.compile(optimizer=Adam(learning_rate=lr))
        else:
            self.memory = ReplayBuffer(mem_size, input_dims, input_dims_conv)
            self.q_eval = DeepQNetwork(input_dims_conv, input_dims, n_actions)
            self.q_eval.compile(optimizer=Adam(learning_rate=lr))
            self.q_next = DeepQNetwork(input_dims_conv, input_dims, n_actions)
            self.q_next.compile(optimizer=Adam(learning_rate=lr))
In order to save and load our models, we create the save_models() and load_models() functions, which save and load the weights of our network.
def save_models(self):
self.q_eval.save_weights('model'+self.model_version+'.h5')
print('... models saved successfully ...')
def load_models(self):
self.q_eval.load_weights('model'+self.model_version+'.h5')
self.q_next.load_weights('model'+self.model_version+'.h5')
print('... models loaded successfully ...')
To use the replay buffer from our Agent class, we create the store_transition and sample_memory functions, which use the functions already defined in our replay buffer. Note that when sampling data we convert our observations to tensors, so we can use them to train our model.
def store_transition(self, state, action, reward, state_, done, conv_state=None, conv_state_=None):
    self.memory.store_transition(state=state, action=action, reward=reward,
                                 state_=state_, done=done,
                                 conv_state=conv_state, conv_state_=conv_state_)
def sample_memory(self):
conv_state, state, action, reward, new_conv_state, new_state, done = self.memory.sample_buffer(self.batch_size)
conv_states = tf.convert_to_tensor(conv_state)
states = tf.convert_to_tensor(state)
rewards = tf.convert_to_tensor(reward)
dones = tf.convert_to_tensor(done)
actions = tf.convert_to_tensor(action, dtype=tf.int32)
conv_states_ = tf.convert_to_tensor(new_conv_state)
states_ = tf.convert_to_tensor(new_state)
return conv_states, states, actions, rewards, conv_states_, states_, dones
As we want our Agent to choose actions for us, we create the choose_action function. Based on epsilon, the agent decides whether it will explore a random action, to learn whether doing something different from what the current model prescribes leads to higher rewards, or use the model to choose the best action based on its information.
def choose_action(self, state, action_space):
if np.random.random() > self.epsilon:
if self.model_version=='v1':
state = tf.convert_to_tensor([state])
else:
state = (tf.convert_to_tensor([state[0]]), tf.convert_to_tensor([state[1]]))
# evaluate all actions and pick the one with the highest estimated Q value
actions = self.q_eval(state)
sorted_actions = tf.argsort(actions, axis=1).numpy()[0]
action = sorted_actions[-1]
if action in action_space:
return action
else:
action = sorted_actions[-2]
return action
else:
action = np.random.choice(action_space)
return action
As we use a Double Deep Q-Network to keep the training of our model stable, we have to replace our q_next model with our q_eval model after replace_target_cnt training iterations. Therefore we create the replace_target_network function, which does so whenever learn_step_counter modulo replace_target_cnt is zero.
def replace_target_network(self):
if self.learn_step_counter % self.replace_target_cnt == 0:
self.q_next.set_weights(self.q_eval.get_weights())
At the beginning of training we are exploring, which means we try out random actions to see what the reward is for each action. After each training iteration, however, we want to decrease the epsilon that represents the probability of exploring. We define the function decrement_epsilon() to decrease epsilon as long as it is larger than the minimum value eps_min.
def decrement_epsilon(self):
self.epsilon = self.epsilon - self.eps_dec \
if self.epsilon > self.eps_min else self.eps_min
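To get a feel for this schedule: with the values we use later (epsilon starting at 1, eps_dec of 1e-5 and eps_min of 0.1), a quick back-of-the-envelope calculation shows how long the agent keeps exploring.
%%script false --no-raise-error
epsilon, eps_min, eps_dec = 1.0, 0.1, 1e-5
steps_to_min = (epsilon - eps_min) / eps_dec
print(round(steps_to_min))  # 90000: learn() calls before epsilon bottoms out at eps_min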
The last function of our Agent class is the learn() function, which is used to train the model. This function only does something once mem_cntr is larger than the batch size, as we need at least one batch to train the model. We sample data with our sample_memory() function. Based on the model's version we define the input for the models. Then, using the Bellman equation described in our previous article [LINK], we calculate the Q-values and use an MSE loss to update the weights of the model that predicts the Q-values for each action given our state.
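Concretely, for a sampled transition $(s_i, a_i, r_i, s'_i, d_i)$ the target and loss used in the code below read:

$$ y_i = r_i + \gamma \, (1 - d_i) \max_{a'} Q_{\text{next}}(s'_i, a') $$

$$ L = \frac{1}{B} \sum_{i=1}^{B} \left( Q_{\text{eval}}(s_i, a_i) - y_i \right)^2 $$

where $d_i$ is 1 when the episode ended with this transition and $B$ is the batch size.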
def learn(self):
if self.memory.mem_cntr < self.batch_size:
return
self.replace_target_network()
conv_states, states, actions, rewards, conv_states_, states_, dones = self.sample_memory()
indices = tf.range(self.batch_size, dtype=tf.int32)
action_indices = tf.stack([indices, actions], axis=1)
if self.model_version == 'v1':
eval_input = states
next_input = states_
else:
eval_input = (conv_states, states)
next_input = (conv_states_, states_)
with tf.GradientTape() as tape:
q_pred = tf.gather_nd(self.q_eval(eval_input), indices=action_indices)
q_next = self.q_next(next_input)
max_actions = tf.math.argmax(q_next, axis=1, output_type=tf.int32)
max_action_idx = tf.stack([indices, max_actions], axis=1)
        q_target = rewards + \
            self.gamma * tf.gather_nd(q_next, indices=max_action_idx) * \
            (1 - tf.cast(dones, tf.float32))  # zero out the future term for terminal transitions
loss = keras.losses.MSE(q_pred, q_target)
params = self.q_eval.trainable_variables
grads = tape.gradient(loss, params)
self.q_eval.optimizer.apply_gradients(zip(grads, params))
self.learn_step_counter += 1
self.decrement_epsilon()
Learning and Discount Rate¶
Now we are ready to decide on the learning rate and discount rate for our first model. As we do not want the model to converge too quickly to a suboptimal solution, we choose a fairly small learning rate of $ \alpha=0.001 $. Furthermore, we want our snake to have a long-term view, so we set gamma close to one. However, it should not be too close to one, as the present steps are still slightly more important than those in the far future (and the snake cannot die of overeating anyway). Additionally, the model trains better with smaller Q-values, so discounting future reward helps the training process. Therefore, we choose a discount rate of $ \gamma=0.9 $ as a trade-off between the two. Note that these values were chosen somewhat arbitrarily and are not necessarily optimal.
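To make the effect of this discount rate tangible: with $ \gamma=0.9 $, a reward that lies $ k $ steps in the future is weighted by $ \gamma^k $.
%%script false --no-raise-error
gamma = 0.9
for k in (1, 5, 10, 20, 50):
    print(k, round(gamma ** k, 4))  # 0.9, 0.5905, 0.3487, 0.1216, 0.0052
# a reward 20 steps ahead still counts for roughly 12% of its value
With these rates in place, we define the network for our first model below: two dense layers of 16 neurons, followed by an output layer that produces one Q-value per action.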
%%script false --no-raise-error
class DeepQNetwork(keras.Model):
def __init__(self, input_dims, n_actions):
super(DeepQNetwork, self).__init__()
self.fc1 = Dense(16, activation='relu', input_shape=(None, input_dims))
self.fc2 = Dense(16, activation='relu')
self.action_layer = Dense(n_actions, activation=None)
def call(self, state):
x = self.fc1(state)
x = self.fc2(x)
x = self.action_layer(x)
return x
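As a quick illustrative check, a dummy forward pass through this network (assuming the imports from the start of this notebook):
%%script false --no-raise-error
net = DeepQNetwork(input_dims=6, n_actions=4)
q_values = net(tf.zeros((1, 6)))  # one dummy state of six features
print(q_values.shape)             # (1, 4): one estimated Q-value per action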
Parameter settings¶
Now that we have defined all necessary classes and functions, we can initiate our game environment with our SnakeGameAI class. First, we set all our parameters. We set our gamma to 0.9, which refers to the discount rate from the Bellman equation. As we have not trained the model yet, we start with exploring only, hence an epsilon of 1. The learning rate is set to 0.001, and furthermore we set the memory size to 100.000 observations and the batch size to 32.
Now we create our DQN agent by initiating our Agent class with these parameters. Note that this is for our first model version, so we do not have to specify the model's version.
game_env = SnakeGameAI()
gamma = 0.9
epsilon = 1
lr = 0.001
n_actions = 4
mem_size = 100000
block_size = 20
input_dims = 6
batch_size = 32
dqn_agent = Agent(gamma, epsilon, lr, n_actions, input_dims,
mem_size, batch_size, eps_min=0.1, eps_dec=1e-5,
replace=200)
Training the first model¶
Once the model is set up, we want to train it until it converges to an optimum, meaning that we no longer see the average reward improve after a certain number of training iterations. To do so, we train per 100.000 iterations and print the average rewards, the number of deaths and the max score per 10.000 iterations. Furthermore, we help the model converge quicker by playing example actions for 20% of the iterations (when the loop number modulo 1000 is higher than 800). This way, the model has enough data about how to gain rewards, so that these actions gain higher Q-values.
For our first version of the model we find that after running this loop five times, we don't see the rewards improve anymore. This takes around 30-60 minutes of training, depending on the processor speed.
%%script false --no-raise-error
loop_nr = 0
reward_list = []
max_score = 0
while loop_nr < 100000:
    loop_nr += 1
    state = game_env.get_state()
    if loop_nr % 1000 > 800:
        # for a proportion of the time, help the algorithm by playing example actions;
        # we pick the example action before the step, so the stored transition
        # matches the action that was actually taken
        action = game_env.get_example_action()
    else:
        action = dqn_agent.choose_action(state, game_env.get_action_space())
    reward, done, score = game_env.play_step(action)
    max_score = max(max_score, score)
    new_state = game_env.get_state()
    dqn_agent.store_transition(state=state, action=action, reward=reward, state_=new_state, done=done)
    dqn_agent.learn()
    reward_list.append(reward)
if loop_nr % 10000 == 0:
# per 10000 training iterations check the average rewards and scores of the game
print('avg rewards: {}'.format(np.mean(reward_list)))
        print('food eaten: {}'.format(sum([1 for x in reward_list if x == 1])))
        print('died: {}'.format(sum([1 for x in reward_list if x == -1])))
print('max score: {}'.format(max_score))
print('current eps: {}'.format(dqn_agent.epsilon))
reward_list = []
dqn_agent.save_models()
Initializing the second model¶
%%script false --no-raise-error
class DeepQNetwork(keras.Model):
def __init__(self, input_conv_dims, input_dims, n_actions):
super(DeepQNetwork, self).__init__()
input_conv_shape = (None, input_conv_dims[0], input_conv_dims[1], input_conv_dims[2])
self.conv1 = Conv2D(8, (4, 4), strides=2, activation='relu', padding='same', input_shape=input_conv_shape)
self.conv2 = Conv2D(8, (4, 4), strides=2, activation='relu', padding='same')
self.flat = Flatten()
self.fc1 = Dense(256, activation='relu')
self.fc2 = Dense(64, activation='relu')
self.fc3 = Dense(16, activation='relu', input_shape=(None, input_dims))
self.fc4 = Dense(16, activation='relu')
self.action_layer = Dense(n_actions, activation=None)
def call(self, states):
conv_state, state = states
#conv block
x1 = self.conv1(conv_state)
x1 = self.conv2(x1)
x1 = self.flat(x1)
x1 = self.fc1(x1)
x1 = self.fc2(x1)
#features block
x2 = self.fc3(state)
#final block
x = concatenate([x1, x2])
x = self.fc4(x)
x = self.action_layer(x)
return x
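Again a quick illustrative check: a dummy forward pass through the multi-input network, using the 32 by 24 grid of the default game.
%%script false --no-raise-error
net = DeepQNetwork(input_conv_dims=(32, 24, 3), input_dims=6, n_actions=4)
q_values = net((tf.zeros((1, 32, 24, 3)), tf.zeros((1, 6))))  # (snapshot, features)
print(q_values.shape)  # (1, 4): one Q-value per action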
Now that we use our multi-input (v2) model, we have to specify the input dimensions of the convolutional part, which are given by the shape (game width / block size, game height / block size, 3). Next, we initiate our Agent class, where we also specify the model_version to be 'v2' and pass the convolutional input dimensions. Finally, we build our model using the correct input shapes, given that we have a multi-input model.
%%script false --no-raise-error
game_env = SnakeGameAI()
gamma = 0.9
epsilon = 1
lr = 0.001
n_actions = 4
mem_size = 100000
block_size = 20
input_dims = 6
input_dims_conv = (int(game_env.w / block_size), int(game_env.h / block_size), 3)
batch_size = 32
dqn_agent = Agent(gamma, epsilon, lr, n_actions, input_dims,
                  mem_size, batch_size, eps_min=0.1, eps_dec=1e-5,
                  replace=200, model_version='v2', input_dims_conv=input_dims_conv)
dqn_agent.q_next.build(input_shape=[(None, ) + input_dims_conv, (None, input_dims)])
dqn_agent.q_eval.build(input_shape=[(None, ) + input_dims_conv, (None, input_dims)])
Training the second model¶
Now we train our multi-input model in the same way, until it converges to an optimum where the average reward no longer improves. As we have a much larger network containing convolutional filters, the algorithm takes a lot longer to train. Therefore, we set the training loop to 500.000 iterations and print the average rewards, the number of deaths and the max score per 50.000 iterations. Again we help the model by playing example actions for 20% of the iterations. For the second version of the model we find that it converges after approximately 5.000.000 training iterations, which takes around 4-5 hours of training, depending on the processor speed.
%%script false --no-raise-error
loop_nr = 0
reward_list = []
max_score = 0
while loop_nr < 500000:
    loop_nr += 1
    state = (game_env.get_conv_state(), game_env.get_state())
    if loop_nr % 1000 > 800:
        # again, pick the example action before playing the step
        action = game_env.get_example_action()
    else:
        action = dqn_agent.choose_action(state, game_env.get_action_space())
    reward, done, score = game_env.play_step(action)
    max_score = max(max_score, score)
    new_state = (game_env.get_conv_state(), game_env.get_state())
    dqn_agent.store_transition(conv_state=state[0], state=state[1], action=action,
                               reward=reward, conv_state_=new_state[0], state_=new_state[1], done=done)
    dqn_agent.learn()
    reward_list.append(reward)
if loop_nr % 50000 == 0:
print('avg rewards: {}'.format(np.mean(reward_list)))
        print('food eaten: {}'.format(sum([1 for x in reward_list if x == 1])))
        print('died: {}'.format(sum([1 for x in reward_list if x == -1])))
print('max score: {}'.format(max_score))
print('current eps: {}'.format(dqn_agent.epsilon))
reward_list = []
dqn_agent.save_models()
Comparing the models¶
Finally, we come to the most important part of the project, where we assess the performance of our models. To validate this performance, we let each model play 100 games until it dies. Then we compare the average score and the maximum score over these 100 games. To make sure that we are not exploring actions, i.e. randomly taking actions, we set the epsilon and the minimum epsilon to zero. The code chunk for the validation of the model is then given by:
model_v = 'v1' # or v2
dqn_agent.eps_min = 0.0
dqn_agent.epsilon = 0.0
rewards_list = []
score_list = []
game_env.reset()
dones_count = 0
max_score = 0
current_game_score = 0
while dones_count < 100:
    if model_v == 'v1':
        state = game_env.get_state()
    elif model_v == 'v2':
        state = (game_env.get_conv_state(), game_env.get_state())
    else:
        print('no model version specified')
        break
action = dqn_agent.choose_action(state, game_env.get_action_space())
reward, done, score = game_env.play_step(action)
max_score = max(max_score, score)
current_game_score = max(current_game_score, score)
if done:
dones_count += 1
score_list.append(current_game_score)
current_game_score = 0
rewards_list.append(reward)
print('avg score: {}'.format(np.mean(score_list)))
print('max score: {}'.format(max_score))
Conclusion¶
Over the 100 games played to validate the models, we find the following average and maximum scores for each model:

|  | v1 model | v2 model |
| --- | --- | --- |
| avg score | 22.28 | 27.16 |
| max score | 47 | 72 |
This shows that our second model clearly outperforms our first model, but keep in mind that it also takes a lot longer to train, while the first model's results aren't all that bad. We see a clear trade-off between performance and the time required to reach an optimum. In practice it is always difficult to decide which is more important: efficiency or accuracy. There is no single answer to that question that always applies, except for: it depends.
We really enjoyed diving into Reinforcement Learning, and we hope our previous article and the code in this notebook will help you get started with Reinforcement Learning as well!