Hi,
I am trying to complete Exercise 2.2, where start_training.py needs to call sarsa.py. I am confused about how I should get action2 within start_training.py.
Thanks & Regards,
Soumik Basu
Hi @BlueCap,
Could you share your rosject with me, please?
This is the best way to reproduce the error.
I do have start_training.py, which used to call qlearn.py; now I need to call sarsa.py. In sarsa.py, the learn(self, state1, action1, reward, state2, action2) method needs one more parameter, action2. In start_training.py, how can I get the calculation for that action2? I have sketched my current guess at the modified loop after the sarsa.py listing below.
start_training.py ==>
#!/usr/bin/env python

import gym
import time
import numpy
import random
import time
import qlearn
from gym import wrappers
import rospy
import rospkg
from functools import reduce
from openai_ros.task_envs.cartpole_stay_up import stay_up

if __name__ == '__main__':

    rospy.init_node('cartpole_gym', anonymous=True, log_level=rospy.WARN)

    # Create the Gym environment
    env = gym.make('CartPoleStayUp-v0')
    rospy.loginfo("Gym environment done")

    # Set the logging system
    rospack = rospkg.RosPack()
    pkg_path = rospack.get_path('cartpole_v0_training')
    outdir = pkg_path + '/training_results'
    env = wrappers.Monitor(env, outdir, force=True)
    rospy.loginfo("Monitor Wrapper started")

    last_time_steps = numpy.ndarray(0)

    # Loads parameters from the ROS param server
    # Parameters are stored in a yaml file inside the config directory
    # They are loaded at runtime by the launch file
    Alpha = rospy.get_param("/cartpole_v0/alpha")
    Epsilon = rospy.get_param("/cartpole_v0/epsilon")
    Gamma = rospy.get_param("/cartpole_v0/gamma")
    epsilon_discount = rospy.get_param("/cartpole_v0/epsilon_discount")
    nepisodes = rospy.get_param("/cartpole_v0/nepisodes")
    nsteps = rospy.get_param("/cartpole_v0/nsteps")
    running_step = rospy.get_param("/cartpole_v0/running_step")

    # Initialises the algorithm that we are going to use for learning
    qlearn = qlearn.QLearn(actions=range(env.action_space.n),
                           alpha=Alpha, gamma=Gamma, epsilon=Epsilon)
    initial_epsilon = qlearn.epsilon

    start_time = time.time()
    highest_reward = 0

    # Starts the main training loop: the one about the episodes to do
    for x in range(nepisodes):
        rospy.logdebug("############### START EPISODE => " + str(x))

        cumulated_reward = 0
        done = False
        if qlearn.epsilon > 0.05:
            qlearn.epsilon *= epsilon_discount

        # Initialize the environment and get first state of the robot
        observation = env.reset()
        state = ''.join(map(str, observation))

        # Show on screen the actual situation of the robot
        # for each episode, we test the robot for nsteps
        for i in range(nsteps):
            rospy.loginfo("############### Start Step => " + str(i))
            # Pick an action based on the current state
            action = qlearn.chooseAction(state)
            rospy.loginfo("Next action is: %d", action)
            # Execute the action in the environment and get feedback
            observation, reward, done, info = env.step(action)
            rospy.loginfo(str(observation) + " " + str(reward))
            cumulated_reward += reward
            if highest_reward < cumulated_reward:
                highest_reward = cumulated_reward

            nextState = ''.join(map(str, observation))

            # Make the algorithm learn based on the results
            #rospy.logwarn("############### State we were => " + str(state))
            #rospy.logwarn("############### Action that we took => " + str(action))
            #rospy.logwarn("############### Reward that action gave => " + str(reward))
            #rospy.logwarn("############### State in which we will start next step => " + str(nextState))
            qlearn.learn(state, action, reward, nextState)

            if not(done):
                state = nextState
            else:
                rospy.loginfo("DONE")
                last_time_steps = numpy.append(last_time_steps, [int(i + 1)])
                break
            rospy.loginfo("############### END Step =>" + str(i))
            #raw_input("Next Step...PRESS KEY")
            #rospy.sleep(2.0)

        m, s = divmod(int(time.time() - start_time), 60)
        h, m = divmod(m, 60)
        rospy.logwarn(("EP: " + str(x + 1) + " - [alpha: " + str(round(qlearn.alpha, 2)) + " - gamma: " + str(round(qlearn.gamma, 2)) + " - epsilon: " + str(round(qlearn.epsilon, 2)) + "] - Reward: " + str(cumulated_reward) + " Time: %d:%02d:%02d" % (h, m, s)))

    rospy.loginfo(("\n|" + str(nepisodes) + "|" + str(qlearn.alpha) + "|" + str(qlearn.gamma) + "|" + str(initial_epsilon) + "*" + str(epsilon_discount) + "|" + str(highest_reward) + "| PICTURE |"))

    l = last_time_steps.tolist()
    l.sort()

    #print("Parameters: a="+str)
    rospy.loginfo("Overall score: {:0.2f}".format(last_time_steps.mean()))
    rospy.loginfo("Best 100 score: {:0.2f}".format(reduce(lambda x, y: x + y, l[-100:]) / len(l[-100:])))

    env.close()
sarsa.py ==>
import random


class Sarsa:
    def __init__(self, actions, epsilon=0.1, alpha=0.2, gamma=0.9):
        self.q = {}
        self.epsilon = epsilon
        self.alpha = alpha
        self.gamma = gamma
        self.actions = actions

    def getQ(self, state, action):
        return self.q.get((state, action), 0.0)

    def learnQ(self, state, action, reward, value):
        oldv = self.q.get((state, action), None)
        if oldv is None:
            self.q[(state, action)] = reward
        else:
            self.q[(state, action)] = oldv + self.alpha * (value - oldv)

    def chooseAction(self, state):
        if random.random() < self.epsilon:
            action = random.choice(self.actions)
        else:
            q = [self.getQ(state, a) for a in self.actions]
            maxQ = max(q)
            count = q.count(maxQ)
            if count > 1:
                best = [i for i in range(len(self.actions)) if q[i] == maxQ]
                i = random.choice(best)
            else:
                i = q.index(maxQ)
            action = self.actions[i]
        return action

    def learn(self, state1, action1, reward, state2, action2):
        qnext = self.getQ(state2, action2)
        self.learnQ(state1, action1, reward, reward + self.gamma * qnext)
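My current guess is that action2 is simply the action for the next step: since SARSA is on-policy, I would choose it from nextState with the same chooseAction() that picked the first action, pass it to learn(), and then reuse it as the action of the following step instead of picking a fresh one. Here is a rough sketch of how I think the episode loop in start_training.py would change (the instance name sarsa and the variable nextAction are just my own placeholders, and the reward/epsilon/logging bookkeeping from the original script is omitted):

import sarsa

sarsa = sarsa.Sarsa(actions=range(env.action_space.n),
                    alpha=Alpha, gamma=Gamma, epsilon=Epsilon)

for x in range(nepisodes):
    observation = env.reset()
    state = ''.join(map(str, observation))

    # SARSA is on-policy, so the first action is chosen before the step loop
    action = sarsa.chooseAction(state)

    for i in range(nsteps):
        # Execute the current action and observe the result
        observation, reward, done, info = env.step(action)
        nextState = ''.join(map(str, observation))

        # action2 is the next action, chosen from nextState with the same
        # epsilon-greedy policy that will actually be followed
        nextAction = sarsa.chooseAction(nextState)

        # learn(state1, action1, reward, state2, action2)
        sarsa.learn(state, action, reward, nextState, nextAction)

        if not(done):
            # carry the already-chosen action into the next step
            state = nextState
            action = nextAction
        else:
            break

Is that the right idea? From what I can tell, the only difference from qlearn.learn() is that Q-learning bootstraps from the best action in nextState internally, while sarsa.learn() has to be told which action will actually be taken next.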