# Coding 13: Reinforcement Learning

This week we will train a deep network that learns to race in SuperTuxKart using RL or gradient-free optimization.

<img src="https://a.fsdn.com/con/app/proj/supertuxkart/screenshots/500px-Hac.jpg/max/max/1" width="512"/>

RL will likely not work on the image, or any state that is too high dimensional (within the timeframe of this class). In order to simplify the problem, we'll use some features extracted from the input by the original autopilot. In addition, we will only look at gradient-free optimization techniques in this notebook.

## Installation

For this notebook you'll need `ray`, a parallelization package we will use to collect rollouts asynchronously.

In [None]:
!pip install PySuperTuxKart ray

## The Environment

The following code wraps the `pystk` library and provides a basic interface for playing the game. This is the same code we used in last week's exercise.

In [None]:
import pystk
import numpy as np
import random

from tqdm.notebook import tqdm


class PyTux(object):
    """
    Thin wrapper around `pystk` that runs a single-kart, single-lap race.

    Intended to be used as a context manager: entering starts a fresh race on
    `track`, exiting stops it. `rollout` then plays the race with an agent.
    """
    # Guards pystk.init so that only the first PyTux instance calls it.
    INITED = False

    def __init__(self, track, screen_width=128, screen_height=96, use_graphics=True):
        """
        track: track name handed to pystk.RaceConfig when the race is started
        screen_width, screen_height: size written into the graphics config
        use_graphics: use pystk.GraphicsConfig.hd() if True, .none() otherwise
        """
        self.race = None
        self.config = pystk.GraphicsConfig.hd() if use_graphics else pystk.GraphicsConfig.none()
        self.config.screen_width = screen_width
        self.config.screen_height = screen_height
        self.track = track

        if not PyTux.INITED:
            pystk.init(self.config)
            PyTux.INITED = True

    @staticmethod
    def _magical_auto_pilot(player, track, distance=20):
        """
        Returns a function that maps a single number (compared against 15 to
        pick the acceleration value -- presumably the current speed) to a tuple
        of magical (steering, acceleration, drift) values.
        This is used in the auto-pilot and meant to be hard to read ;)
        Feel free to get inspired by this if you can decipher it
        (it's probably not worth your time though)
        """
        __ = PyTux._point_on_track(player.kart.distance_down_track+distance, track)
        __ = __ - np.array(player.kart.location)
        _ = np.array(player.kart.front) - np.array(player.kart.location)
        _ = _ / max(np.linalg.norm(_), 1e-10)
        _ = np.cross([0,1,0], _)
        return lambda ___: (_.dot(__), int(___<15), abs(__.dot(_))>1)

    @staticmethod
    def _point_on_track(distance, track):
        """
        Return the point that lies `distance` down the track.

        distance: distance along the track; values beyond the last entry of
                  track.path_distance wrap around to the start
        track: object exposing `path_distance` (per-segment [start, end]
               distances) and `path_nodes` (per-segment [start, end] points)

        returns: the linear interpolation between the two end points of the
                 segment that contains the (wrapped) distance
        """
        # Wrap once and use the same value for the segment lookup AND the
        # interpolation. Interpolating with the unwrapped distance would
        # extrapolate far off the track whenever a look-ahead crosses the end.
        wrapped = distance % track.path_distance[-1, 1]
        node_idx = np.searchsorted(track.path_distance[..., 1], wrapped) % len(track.path_nodes)
        d = track.path_distance[node_idx]
        x = track.path_nodes[node_idx]
        t = (wrapped - d[0]) / (d[1] - d[0])
        return x[1] * t + x[0] * (1 - t)

    def clean(self):
        """
        Stop the current race, if there is one. Safe to call repeatedly.
        """
        if self.race is not None:
            self.race.stop()
            # Reset to None instead of deleting the attribute, so that a later
            # clean() / __enter__() can still test `self.race is not None`.
            self.race = None

    def __enter__(self):
        """
        Stop any previous race, then start a new single-kart, one-lap race on
        self.track under player control and advance it by one step.
        """
        self.clean()

        config = pystk.RaceConfig(num_kart=1, laps=1, track=self.track, step_size=0.1)
        config.players[0].controller = pystk.PlayerConfig.Controller.PLAYER_CONTROL

        self.race = pystk.Race(config)
        self.race.start()
        self.race.step()

        return self

    def __exit__(self, type, value, traceback):
        """
        Stop the race when leaving the `with` block.
        """
        self.clean()

    def rollout(self, agent, max_frames=1000, use_image=False):
        """
        Play the current race with `agent` for at most `max_frames` frames.

        agent: an object that implements the act method; it is called with the
               keyword arguments `feature` and `speed` (and `image` if
               use_image is set) and must return the action to step with
        max_frames: maximum number of frames to play for (at least 1)
        use_image: also pass the rendered image of the first player to the agent

        returns: tuple of (index of the last frame, reward, did the agent finish)
                 The reward is 200 - 100 * (t + 1) / max_frames if the lap was
                 finished at frame t, and otherwise the percentage of the track
                 covered (100 * distance_down_track / track length).
        raises: ValueError if max_frames is smaller than 1
        """
        if max_frames < 1:
            # Without at least one frame there is no state to build a result from.
            raise ValueError('max_frames must be at least 1')

        state = pystk.WorldState()
        track = pystk.Track()

        time_bonus = 0

        for t in range(max_frames):
            state.update()
            track.update()

            player = state.players[0]

            # Evaluates to 0 on the last frame, i.e. whenever the loop runs out.
            time_bonus = 100 * (max_frames - t - 1) / max_frames
            # Terminate if the kart finishes a lap.
            if np.isclose(player.kart.overall_distance / track.length, 1.0, atol=2e-3):
                return t, 200 - 100 * (t + 1) / max_frames, True

            # Features for the agent: for look-ahead points 5, 10, 15 and 20
            # down the track, the offset from the kart projected onto
            # `left_dir` (the cross product of the y axis and the kart's
            # forward direction).
            loc = np.array(player.kart.location)
            front_dir = np.array(player.kart.front) - loc
            front_dir = front_dir / max(np.linalg.norm(front_dir), 1e-10)
            left_dir = np.cross([0,1,0], front_dir)
            kwargs = {
                'feature': [left_dir.dot(np.array(PyTux._point_on_track(player.kart.distance_down_track+d, track))-loc) for d in [5, 10, 15, 20]],
                'speed': np.linalg.norm(player.kart.velocity)
            }
            if use_image:
                kwargs['image'] = np.array(self.race.render_data[0].image)
            action = agent.act(**kwargs)
            self.race.step(action)
        return t, 100 * player.kart.distance_down_track / track.length + time_bonus, False

import ray

@ray.remote
class RayPyTux(PyTux):
    """
    Ray actor variant of PyTux.

    The constructor does not call PyTux.__init__: it initialises pystk itself
    and starts the race straight away. The race is then restarted at the
    beginning of every rollout, so the context-manager hooks are no-ops.
    """

    def __init__(self, track, screen_width=128, screen_height=96, use_graphics=True):
        """
        track: track name handed to pystk.RaceConfig
        screen_width, screen_height: size written into the graphics config
        use_graphics: use pystk.GraphicsConfig.hd() if True, .none() otherwise
        """
        if use_graphics:
            graphics = pystk.GraphicsConfig.hd()
        else:
            graphics = pystk.GraphicsConfig.none()
        graphics.screen_width = screen_width
        graphics.screen_height = screen_height
        pystk.init(graphics)

        # Single kart, single lap, driven by the actions passed to step().
        setup = pystk.RaceConfig(num_kart=1, laps=1, track=track, step_size=0.1)
        setup.players[0].controller = pystk.PlayerConfig.Controller.PLAYER_CONTROL

        race = pystk.Race(setup)
        self.race = race
        race.start()
        race.step()

    def clean(self):
        """Does nothing; the race is never stopped by this class."""

    def __enter__(self):
        """Does nothing (and returns None)."""

    def __exit__(self, type, value, traceback):
        """Does nothing."""

    def rollout(self, agent, max_frames=1000, use_image=False):
        """
        Restart the race, advance it by one step, and then run the regular
        PyTux rollout with the given arguments.
        """
        race = self.race
        race.restart()
        race.step()
        result = super().rollout(agent, max_frames, use_image)
        return result

# Name of the track handed to the environments created in the cells below.
TRACK = 'lighthouse'
# NOTE(review): not referenced by any code visible in this notebook --
# presumably a frame/time budget for TRACK; confirm before relying on it.
TRACK_TIME = 700

## The Agent

The agent wrapper is, once again, the same as last week's. The only change here is that we store some high-level features in addition to the images.

In [None]:
class AgentWrapper(object):
    """
    Transparent recorder around an agent.

    Every call to `act` is forwarded to the wrapped agent, and the inputs and
    the resulting action are stored so that a run can be
    - turned into a data set
    - replayed as a video
    """
    # Attributes read from each returned action, in the order they are stored.
    ACTIONS = ['steer', 'acceleration', 'brake', 'drift']

    def __init__(self, agent, noise=0):
        """
        agent: object with an act(feature, speed) method
        noise: accepted but not used by this class
        """
        self.agent = agent

        # One entry per recorded frame.
        self.images, self.features, self.actions = [], [], []

    def act(self, feature, speed, image=None):
        """
        Ask the wrapped agent for an action and log the frame.

        The image (if given) is stored as a copy, the inputs as a
        (feature, speed) tuple, and the action as a list of its ACTIONS
        attributes. The action itself is returned unchanged.
        """
        chosen = self.agent.act(feature, speed)

        if image is not None:
            self.images.append(image.copy())
        self.features.append((feature, speed))

        recorded = []
        for name in self.ACTIONS:
            recorded.append(getattr(chosen, name))
        self.actions.append(recorded)

        return chosen

    def show(self):
        """
        Display the recorded images as a looping video.
        Call on the last line of a cell to visualize the most recent run.
        """
        from moviepy.editor import ImageSequenceClip
        from IPython.display import display

        clip = ImageSequenceClip(self.images, fps=15)
        widget = clip.ipython_display(width=512, autoplay=True, loop=True, max_duration=200)
        display(widget)

## Tensorboard

In [None]:
import torch.utils.tensorboard as tb

# Directory that TensorBoard is pointed at below (via --logdir).
log_dir = 'rl_log'

# IPython magics: load the TensorBoard notebook extension, then launch it on log_dir.
%load_ext tensorboard
%tensorboard --logdir {log_dir} --reload_interval 1

## Policy Network

Now we can finally define our model. The model takes in a set of computed features and the current speed of the kart and decides what action to take.

In [None]:
import torch


class Policy(torch.nn.Module):
    """
    Policy network skeleton (exercise).

    `forward` and `act` are left for you to implement. Until then they raise
    NotImplementedError, so that this cell still runs and a missing
    implementation fails with a clear message.
    """

    def __init__(self, c_in=4):
        """
        c_in: number of input features (see the note below)
        """
        super().__init__()
        # Implement the policy network -- don't get fancy here. We can get good
        # results with a _very_ simple policy network. You may want to use
        # separate networks for steering and acceleration. I would recommend
        # avoiding the drift and brake actions for now. Note that your network
        # will be called with both the features and the current speed of the
        # kart -- your network architecture should take this into account.
        
        # NOTE: c_in=4 is appropriate for the set of features that are tracked
        # by the code above. If you want to, you are welcome to mess with that
        # code to gather more or fewer features, and then you'll need to set
        # c_in accordingly.

    def forward(self, feature, speed):
        """
        feature: the computed track features
        speed: the current speed of the kart

        raises: NotImplementedError until the exercise is completed
        """
        # TODO
        raise NotImplementedError('Policy.forward is not implemented yet')

    def act(self, feature, speed, image=None):
        """
        Given a set of features and a speed, return the action to take.
        `image` is accepted for compatibility with the rollout code.

        raises: NotImplementedError until the exercise is completed
        """
        # TODO: Given a set of features and a speed, take an action
        raise NotImplementedError('Policy.act is not implemented yet')


## Training

The first training approach we'll try is the simplest. We'll just sample random policies and then keep track of the one which performs the best. This will work best if your model has few parameters.

In [None]:
import time
from copy import deepcopy


def train_v0(model, device, epochs=100):
    """
    Random search: sample `epochs` random policies and keep the best one.

    model: policy whose architecture is used; it is not modified
    device: accepted for interface compatibility; not used here
    epochs: number of random policies to evaluate

    returns: a copy of the sampled policy with the highest rollout reward
             (a copy of `model` itself if epochs is 0)
    """
    # logger = tb.SummaryWriter(log_dir + '/{}'.format(time.strftime('%H-%M-%S')), flush_secs=1)
    env = RayPyTux.remote(TRACK, use_graphics=False)
    best_model = deepcopy(model)
    best_reward = float('-inf')

    # Work on a private copy so the caller's model is left untouched.
    candidate = deepcopy(model)

    for epoch in range(epochs):

        # Get a random model: redraw every parameter from a standard normal.
        for p in candidate.parameters():
            p.data.normal_()

        with torch.no_grad():
            agent = AgentWrapper(candidate)
            _, reward, _ = ray.get(env.rollout.remote(agent, use_image=False, max_frames=200))

        # logger.add_scalar('train/reward', reward, global_step=epoch)

        # Keep a snapshot of the best policy seen so far; `candidate` itself
        # is overwritten in the next iteration.
        if reward > best_reward:
            best_reward = reward
            best_model = deepcopy(candidate)

        print(f'Current Reward: {reward:6.2f}  (best {best_reward:6.2f})')

    return best_model

# Train your model.
# Run on the GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Start from a freshly constructed policy and replace it with the result of
# the random search.
model = Policy()
model = train_v0(model, device, epochs=20)

After training, you can use the cell at the bottom of this notebook to see what your controller is doing.

Now let's try something slightly more interesting. Starting from the best policy we found using `train_v0`, we'll sample some random perturbations and apply those perturbations to the policy parameters. We'll keep track of the best-performing result and return it. By doing this multiple times, we can "walk" through the parameter space to better and better models. The `train_v1` function performs one step of this process -- it samples some number of perturbations near a given policy and returns the best one. We can then use `train_v1` repeatedly to train the policy. You may find it useful to change the noise scale over time.

In [None]:
def train_v1(model, device, epochs=10, noise_scale=1):
    """
    One step of random local search around `model`.

    Evaluates the input model, then `epochs` random perturbations of its
    parameters, and returns the best-performing policy among them.

    model: policy to start from; its parameters are perturbed in place while
           searching and restored before returning (even on error)
    device: accepted for interface compatibility; noise is created on the
            device of each parameter
    epochs: number of perturbations to evaluate
    noise_scale: standard deviation of the Gaussian noise added to each parameter

    returns: a copy of the best policy found (a copy of the input model if no
             perturbation achieved a higher reward)
    """
    # logger = tb.SummaryWriter(log_dir + '/{}'.format(time.strftime('%H-%M-%S')), flush_secs=1)
    env = RayPyTux.remote(TRACK, use_graphics=False)
    original_parameters = [p.data.clone() for p in model.parameters()]
    best_model = deepcopy(model)

    def evaluate():
        # Reward of one rollout with the model's current parameters.
        with torch.no_grad():
            agent = AgentWrapper(model)
            _, reward, _ = ray.get(env.rollout.remote(agent, use_image=False, max_frames=500))
        return reward

    # The input model is the baseline every perturbation has to beat.
    best_reward = evaluate()

    try:
        for _ in range(epochs):

            # Sample a random perturbation and apply it to the model.
            # randn_like keeps the noise on the same device and in the same
            # dtype as the parameter it is added to.
            for op, p in zip(original_parameters, model.parameters()):
                noise = noise_scale * torch.randn_like(op)
                p.data[...] = op + noise

            reward = evaluate()

            print(reward)

            if reward > best_reward:
                best_reward = reward
                best_model = deepcopy(model)
    finally:
        # Restore the original parameters to the model
        for op, p in zip(original_parameters, model.parameters()):
            p.data[...] = op

    return best_model

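# Train your model: call train_v1 (repeatedly, if you like) and store the
# result in `better_model`, which the test cell at the bottom of this notebook uses.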

## Fully Autonomous Driving

Now that all that has been taken care of, it's time to finally test out your trained model.

In [None]:
# Test out your model!
with PyTux(TRACK) as env:
    # Wrap the trained policy so the frames of the run are recorded.
    agent = AgentWrapper(better_model)
    with torch.no_grad():
        steps, score, finished = env.rollout(agent, use_image=True, max_frames=600)
    print('Time: %d Score: %.2f Success: %d' % (steps, score, finished))

# Play back the recorded run.
agent.show()