ActorCritic.py
import random

import gym
import numpy as np
import torch
from matplotlib import pyplot as plt

from env.environment import CustomEnv
from functionApproximator import NonLinearApproximatorOfStateValuesWithNN
from policy import ParameterizedPiWithNN
from utility import evaluation


def ActorCritic(seed):
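    """Train a one-step actor-critic agent on CustomEnv-v0 for a fixed number of
    steps under the given random seed, and save the per-epoch maximum episode
    returns to disk."""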
    random.seed(seed)  # seed Python's built-in RNG for reproducibility
    torch.manual_seed(seed)
    np.random.seed(seed)
    # Initialize environment, state-value approximator, parameterized policy
    env = gym.make('CustomEnv-v0')
    V = NonLinearApproximatorOfStateValuesWithNN(alpha=1e-5, stateLow=env.stateLow, stateHigh=env.stateHigh)
    pi = ParameterizedPiWithNN(alpha=1e-5, state_dims=env.stateDimension, num_actions=env.numActions)
    # Initialize training parameters
    INF = 999
    epochSteps = 600
    numEpochs = 500
    maxSteps = epochSteps * numEpochs

    # Max episode returns per epoch (one epoch = epochSteps environment steps).
    # Each entry is the highest episode return (from the starting state) observed
    # among all episodes executed during that epoch. E.g. maxEpRets = [-44, -21, -7.5]
    # means three epochs have run and -44 was the best episode return during the
    # first epoch. Note that an epoch may end while the agent is still in the
    # middle of an episode.
    maxEpRets = []

    # Initialize counters
    t = 0             # total number of steps the agent has executed so far
    localT = 0        # number of steps within the current epoch
    maxReward = -INF  # maximal non-zero reward encountered within the current epoch
    while True:
        state = env.reset()
        done = False
        while not done:
            # Choose A from the parameterized policy
            action = pi.action(state)
            # Take A and observe R and S'
            nextState, reward, done, info = env.step(action)
            # Compute the one-step return target R + gamma * V(S'; w)
            # and the TD error delta = target - V(S; w)
            target = reward + env.gamma * V(nextState)
            delta = target - V(state)
            # Update the state-value function approximator toward the target
            V.update(s=state, G=target)
            # Update the parameterized policy using the TD error
            pi.update(s=state, a=action, gamma_t=env.gamma ** t, delta=delta)
            state = nextState
            # increment local time step within the current epoch
            localT += 1
            if reward != 0:  # end of one episode
                maxReward = max(maxReward, reward)
            if localT == epochSteps:  # one epoch is full
                # Under our special reward design, the maximal non-zero reward is also
                # the highest episode return among all episodes within this epoch.
                maxEpRets.append(maxReward)
                print(str(len(maxEpRets)) + "th epoch is finished. localT = " + str(localT))
                print("global t = " + str(t + 1))
                maxReward = -INF
                localT = 0
            # increment global time step
            t += 1
            if t == maxSteps:
                break
        if t == maxSteps:
            break
    # Save the learning curve (np.savetxt writes plain text, despite the .npy extension)
    np.savetxt("ActorCritic_maxEpRet_seed_" + str(seed) + ".npy", maxEpRets)
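

# Optional sketch, not part of the original training loop: overlay the per-seed
# learning curves saved above. The function name and default seed range are
# illustrative; it relies only on numpy and the matplotlib import at the top of
# this file.
def plotMaxEpRets(seeds=range(5)):
    """Plot the per-epoch maximum episode returns saved by ActorCritic()."""
    for seed in seeds:
        # np.savetxt above writes plain text, so np.loadtxt reads it back directly
        curve = np.loadtxt("ActorCritic_maxEpRet_seed_" + str(seed) + ".npy")
        plt.plot(curve, label="seed " + str(seed))
    plt.xlabel("epoch (600 steps each)")
    plt.ylabel("max episode return")
    plt.legend()
    plt.show()
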
if __name__ == "__main__":
    for seed in range(5):
        print("initialize ActorCritic with seed = " + str(seed))
        ActorCritic(seed)