# Model.py

import torch
from torch import nn
import numpy as np

def argmax(x):
    """Return the index of the maximum of ``x`` with random tie breaking.

    Variant of ``np.argmax`` / ``torch.argmax`` that, when several entries
    share the maximum value, picks one of them uniformly at random using
    NumPy's global random state.

    Args:
        x: A ``torch.Tensor``. For tensors with more than one dimension the
            random pick is made among the first-axis indices of the maximal
            entries (the first element of ``torch.where``).

    Returns:
        A 0-dim integer ``torch.Tensor`` holding the selected index, placed
        on the same device as ``x``. If the random pick cannot be made
        (for example the maximum is NaN, so no entry compares equal to it),
        the result of ``torch.argmax(x)`` is returned instead.
    """
    try:
        # First-axis indices of every entry equal to the maximum.
        ties = torch.where(x == torch.max(x))[0]
        if ties.numel() > 0:
            # Move the indices to host memory explicitly: np.random.choice
            # cannot consume a non-CPU tensor, which previously made the
            # random tie breaking silently degrade to plain argmax.
            pick = np.random.choice(ties.cpu().numpy())
            return torch.tensor(pick, device=x.device)
    except (RuntimeError, TypeError, ValueError, IndexError):
        # Best-effort: fall through to the deterministic argmax below.
        # Only ordinary failures are caught, so KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        pass
    return torch.argmax(x)
class MLP(nn.Module):
    """Simple multi-layer perceptron usable as a policy or a value network.

    The shared trunk is ``input_dim -> 64 -> 8`` with ReLU activations.
    Depending on ``value`` exactly one head is created:

    * ``value=True``: ``Linear(8, 1)`` followed by ReLU, so the predicted
      value is never negative.
    * ``value=False``: ``Linear(8, output_dim)`` followed by a softmax over
      the last (action) dimension.

    Args:
        input_dim: Number of input features.
        output_dim: Number of outputs of the policy head (ignored by the
            value head, which always has a single output).
        value: If true build a value network, otherwise a policy network.
        shots: Not used by this class; kept so existing callers that pass
            it keep working.
    """

    def __init__(self, input_dim, output_dim, value=False, shots=50):
        super().__init__()
        self.value = value

        self.hidden_layers = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 8),
            nn.ReLU(),
        )

        if self.value:
            self.value_layer = nn.Sequential(
                nn.Linear(8, 1),
                nn.ReLU(),
            )
        else:
            self.policy_layer = nn.Sequential(
                nn.Linear(8, output_dim),
                # Normalise over the action axis (the last one). forward()
                # prepends an axis, so for an already batched input dim=1
                # would be the batch axis; dim=-1 is correct in both cases
                # and identical to dim=1 for a single 1-D input.
                nn.Softmax(dim=-1),
            )

    def forward(self, x, device=None):
        """Run the network on ``x``.

        Args:
            x: Input features; anything ``torch.tensor`` accepts (list,
                NumPy array, ...) or a ``torch.Tensor``. The input is
                converted to ``float32`` and a leading axis is added.
                Tensor inputs are detached, so no gradient flows back
                into ``x``.
            device: Device to run on. Defaults to the device of this
                module's parameters.

        Returns:
            Value network: the head output with the added leading axis
            removed (shape ``(1,)`` for a 1-D input).
            Policy network: probabilities that sum to 1 along the last
            axis (shape ``(1, output_dim)`` for a 1-D input).
        """
        if device is None:
            device = next(self.parameters()).device

        if torch.is_tensor(x):
            # Same result as torch.tensor(x, ...) but without the
            # copy-construct UserWarning that call emits for tensors.
            x = x.detach().to(device=device, dtype=torch.float32)
        else:
            x = torch.tensor(x, dtype=torch.float32, device=device)

        x = self.hidden_layers(x.unsqueeze(0))
        if self.value:
            return self.value_layer(x)[0]
        return self.policy_layer(x)