
[Reinforcement Learning / review article / without TensorFlow] Policy Gradient (CartPole)


JaykayChoi  2017. 4. 8. 11:38  





This is a reimplementation of the example covered in [Reinforcement Learning] Policy Gradient (CartPole)
https://medium.com/@awjuliani/super-simple-reinforcement-learning-tutorial-part-2-ded33892c724
with the TensorFlow parts rewritten using only Python and NumPy.

Python 3.6
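
The network maps the 4-dimensional observation through 10 ReLU hidden units to a single sigmoid output p, which is treated as the probability of pushing the cart to the right (action 1). Each step the action is sampled from that Bernoulli distribution, and the quantity stored per step, action - probability, is the derivative of log π(action) with respect to the network's output score. The small sketch below (my own helper names, not part of the post's code) checks that identity numerically:

import numpy as np

def sigmoid(s):
    return 1.0 / (1.0 + np.exp(-s))

def log_prob(s, a):
    # log-likelihood of action a under a Bernoulli policy with p = sigmoid(s)
    p = sigmoid(s)
    return a * np.log(p) + (1 - a) * np.log(1 - p)

s, a, eps = 0.3, 1, 1e-6
numeric = (log_prob(s + eps, a) - log_prob(s - eps, a)) / (2 * eps)
analytic = a - sigmoid(s)   # the value the script below stores in arrError
print(numeric, analytic)    # both come out around 0.4256

Multiplying this per-step quantity by the discounted return and backpropagating it through the network gives the gradient that backpropagation() accumulates and update() applies.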

import numpy as np

import gym

 

 

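# Policy network with one hidden layer: 4 observation inputs -> 10 ReLU units
# -> a single sigmoid output giving the probability of action 1 (push right).
# Weights are trained with a REINFORCE-style policy gradient that is accumulated
# in gradient buffers and applied in update().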
class NN:

    def __init__(self):

 

        self.numHiddenLayerNeurons = 10

        self.learningRate = 1e-2

        self.discountFactorForReward = 0.99

        self.inputDimension = 4

 

        self.W1 = self.HeInitialization(self.inputDimension, self.numHiddenLayerNeurons)

        self.W2 = self.HeInitialization(self.numHiddenLayerNeurons, 1)

 

        self.W1GradientBuffer = np.zeros_like(self.W1)

        self.W2GradientBuffer = np.zeros_like(self.W2)

 

 

    def sigmoid(self,x):

        return 1.0 / (1.0 + np.exp(-x))

 

 

    def dsigmoid(self,x):

        return x * (1. - x)

 

 

    def tanh(self,x):

        return np.tanh(x)

 

 

    def dtanh(self,x):

        return 1.0 - x * x

 

 

    def ReLU(self, x):

        return x * (x > 0)

 

 

    def dReLU(self,x):

        return 1.0 * (x > 0)

 

 

    def softmax(self, x):

        if x.ndim == 1:

            x = x.reshape([1, x.size])

        modifiedX = x - np.max(x, 1).reshape([x.shape[0], 1])

        exps = np.exp(modifiedX)

        return exps / np.sum(exps, axis=1).reshape([exps.shape[0], 1])

 

 

    def XavierInitialization(self, NumIn, NumOut):

        return np.random.randn(NumIn, NumOut) / np.sqrt(NumIn)

 

 

    def HeInitialization(self, NumIn, NumOut):

        return np.random.randn(NumIn, NumOut) / np.sqrt(NumIn / 2)

 

 

    def feedForward(self, x):

        y1 = self.ReLU(np.matmul(x, self.W1))

        score = np.matmul(y1, self.W2)

        probability = self.sigmoid(score)

 

        return y1, probability

 

 

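    # error is (action - probability) per step; weighting it by the normalized
    # discounted return and backpropagating through the network yields the
    # policy-gradient estimate, which is accumulated in the gradient buffers.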
    def backpropagation(self, x, error, y1, reward):

        discountedReward = self.discountReward(reward)

        discountedReward -= np.mean(discountedReward)

        discountedReward /= np.std(discountedReward)

        error *= discountedReward

 

        # equivalently: dY2 = np.matmul(error, self.W2.T)

        dY2 = np.outer(error, self.W2)

        dY1 = self.dReLU(y1)

        dW1 = np.matmul(x.T, (dY2 * dY1))

 

        dW2 = np.matmul(y1.T, error)

 

        self.W1GradientBuffer += dW1

        self.W2GradientBuffer += dW2

 

 

    def update(self):

        self.W1 += self.learningRate * self.W1GradientBuffer

        self.W2 += self.learningRate * self.W2GradientBuffer

        self.W1GradientBuffer = np.zeros_like(self.W1)

        self.W2GradientBuffer = np.zeros_like(self.W2)

 

 

    def discountReward(self, r):

        discounted_r = np.zeros_like(r)

        running_add = 0

        for t in reversed(range(0, r.size)):

            running_add = running_add * self.discountFactorForReward + r[t]

            discounted_r[t] = running_add

        return discounted_r

 

 

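# Training loop: play episodes with the current policy, compute gradients at the
# end of each episode, and apply the accumulated update every batchSize episodes.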
if __name__ == '__main__':

 

    batchSize = 5

 

    env = gym.make('CartPole-v0')

    observation = env.reset()

 

    net = NN()

 

    arrX, arrReward, arrY1, arrError = [], [], [], []

    rewardSum = 0

    episodeIndex = 1

 


 

    while episodeIndex <= 10000:

        x = np.reshape(observation, [1, net.inputDimension])

        y1, probability = net.feedForward(x)

        action = 1 if np.random.uniform() < probability else 0  # might need e-greedy exploration here

 

        arrX.append(x)

        arrY1.append(y1)

        arrError.append(action - probability)

 

        observation, reward, done, info = env.step(action)

 

        rewardSum += reward

 

        arrReward.append(reward)

 

        if done:

            episodeIndex += 1

 

            episodeX = np.vstack(arrX)

            episodeReward = np.vstack(arrReward)

            episodeY1 = np.vstack(arrY1)

            episodeError = np.vstack(arrError)

            arrX, arrReward, arrY1, arrError = [], [], [], []

 

            net.backpropagation(episodeX, episodeError, episodeY1, episodeReward)

 

            if episodeIndex % batchSize == 0:

                net.update()

 

 

                print('Average reward over the last %d episodes: %f.' % (batchSize, rewardSum / batchSize))

 

                if rewardSum / batchSize >= 200:

                    print("Task solved in", episodeIndex, 'episodes!')

                    break

 

                rewardSum = 0

 

            observation = env.reset()

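discountReward() converts the per-step rewards (1 for every step the pole stays up) into discounted returns by a backward pass over the episode, and backpropagation() then standardizes them to zero mean and unit variance, so steps early in a long episode are reinforced and steps just before failure are discouraged. A minimal standalone sketch of that computation, using an assumed toy 5-step episode:

import numpy as np

gamma = 0.99
rewards = np.array([1.0, 1.0, 1.0, 1.0, 1.0])   # toy episode: 5 steps, reward 1 each

# backward recursion, same as NN.discountReward
discounted = np.zeros_like(rewards)
running_add = 0.0
for t in reversed(range(rewards.size)):
    running_add = running_add * gamma + rewards[t]
    discounted[t] = running_add
print(discounted)    # roughly [4.901  3.9404 2.9701 1.99   1.    ]

# normalization applied in backpropagation(): zero mean, unit variance
discounted -= np.mean(discounted)
discounted /= np.std(discounted)
print(discounted)    # early steps positive, late steps negative

Since CartPole-v0 caps an episode at 200 steps, the stopping condition rewardSum / batchSize >= 200 is only met once every episode in a batch reaches that cap.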
