import numpy as np
import tensorflow as tf
# Tiny policy network: one scalar input feature -> two action logits
# (one logit per discrete action: "step down" / "step up").
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(2, input_shape=(1,)))
def play(env, model, step=100, max_steps=200, bound=1000):
    """Run one episode of a guessing game, collecting REINFORCE-style gradients.

    At each step the policy `model` maps an encoded observation to two
    logits, an action index is sampled, and the current guess `action`
    is nudged down (index 0) or up (index 1) by `step`, clamped to
    [-bound, bound].

    Args:
        env: environment with `reset()` and Gym-style
            `step(np.ndarray) -> (observation, reward, done, info)`;
            `info` is assumed to carry the hidden target under the key
            'number'.  # NOTE(review): schema inferred from usage below.
        model: Keras model producing shape (1, 2) logits for a (1, 1) input.
        step: magnitude of each guess adjustment (default 100, as before).
        max_steps: maximum number of environment steps (default 200).
        bound: symmetric clamp applied to the guess (default 1000).

    Returns:
        dict with keys:
            'actions': guesses taken after each non-terminal step,
            'rewards': per-step rewards (may be one longer than 'actions'
                       because the terminal step records a reward but no
                       new action),
            'grads':   per-step gradients of -log pi(a|s) w.r.t. the
                       model's trainable weights,
            'target':  info['number'] from the last step taken.
    """
    env.reset()
    action = 0.0
    grads = []
    rewards = []
    actions = []
    info = {}  # overwritten on the first step; max_steps >= 1 is assumed
    for _ in range(max_steps):
        observation, reward, done, info = env.step(np.array([action]))
        rewards.append(reward)
        if done:
            break
        # Encode the observation as a single signed feature.
        # Presumably observation == 1 means "guess too low" — TODO confirm
        # against the environment's documentation.
        x = np.array([[1.0 if observation == 1 else -1.0]])
        with tf.GradientTape() as tape:
            logits = model(x)
            # Sample one action from the categorical policy; sampling is
            # non-differentiable, gradients flow only through the logits.
            index = tf.random.categorical(logits, num_samples=1)
            index = index[:, 0]
            # Cross-entropy of the sampled action == -log pi(a|s); its
            # gradient is the REINFORCE direction (before reward weighting).
            neg_logprob = tf.nn.sparse_softmax_cross_entropy_with_logits(
                index, logits)
        grad = tape.gradient(neg_logprob, model.trainable_weights)
        grads.append(grad)
        # Move the guess toward the sampled direction, clamped to the bound.
        if index.numpy()[0] == 0:
            action -= step
        else:
            action += step
        action = max(-bound, min(bound, action))
        actions.append(action)
    return {
        'actions': actions,
        'rewards': rewards,
        'grads': grads,
        'target': info['number'],
    }
import matplotlib.pyplot as plt

# Run one episode and plot the guess trajectory against the hidden target.
result = play(env, model)

actions = result['actions']
target = result['target']

t = np.arange(len(actions))
plt.plot(t, actions)
# Dashed reference line at the target value, spanning exactly the plotted
# range (the original hard-coded x=199 even when the episode ended early).
plt.plot([0, max(len(actions) - 1, 0)], [target, target],
         linestyle='dashed', color='lightgrey')
plt.ylim(-1000, 1000)
