optimizer = tf.keras.optimizers.SGD(learning_rate=0.05)
model = tf.keras.Sequential([
    tf.keras.layers.Dense(2, input_shape=(1,))
])
model.trainable_weights
[<tf.Variable 'dense_3/kernel:0' shape=(1, 2) dtype=float32, numpy=array([[0.9812864 , 0.59330356]], dtype=float32)>,
 <tf.Variable 'dense_3/bias:0' shape=(2,) dtype=float32, numpy=array([0., 0.], dtype=float32)>]
tf.nn.softmax(model(np.array([[1.0], [-1.0]])))
<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[0.02947669, 0.97052336],
[0.97409624, 0.02590378]], dtype=float32)>
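Each row of the softmax output sums to 1, so it can be read as a probability distribution over the two possible actions for the corresponding input. A minimal sketch of how an action could be drawn from such a distribution (the sample_action helper below is hypothetical, not part of the original code):

import numpy as np
import tensorflow as tf

def sample_action(model, observation):
    # Forward pass: two logits for a single scalar observation, then softmax.
    probs = tf.nn.softmax(model(np.array([[observation]], dtype=np.float32)))
    # Renormalise in float64 to guard against float32 rounding, then sample.
    p = probs.numpy()[0].astype('float64')
    return int(np.random.choice(len(p), p=p / p.sum()))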
result = play(env, model)
plt.plot(t, result['actions'])
plt.plot([0, 199], [result['target'], result['target']], linestyle='dashed', color='lightgrey')
plt.ylim(-1000, 1000)
(-1000.0, 1000.0)
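The play helper itself is defined earlier; purely for orientation, a rough sketch of the kind of rollout it is assumed to perform could look like the code below. Only the returned keys ('actions', 'rewards', 'grads', 'target') and the 200-step episode length are taken from the calls above; the Gym-style environment interface, the env.target attribute, and the use of tf.GradientTape to record a per-step policy-gradient term are assumptions, not the original implementation:

import numpy as np
import tensorflow as tf

def play(env, model, steps=200):
    # Roll out one episode, recording actions, rewards and, for every step,
    # the gradient of the negative log-probability of the chosen action.
    observation = env.reset()
    actions, rewards, grads = [], [], []
    for _ in range(steps):
        with tf.GradientTape() as tape:
            probs = tf.nn.softmax(model(np.array([[observation]], dtype=np.float32)))
            p = probs.numpy()[0].astype('float64')
            action = int(np.random.choice(2, p=p / p.sum()))
            # Negative log-probability, so that the gradient-descent update
            # later on increases the probability of well-rewarded actions.
            loss = -tf.math.log(probs[0, action])
        grads.append(tape.gradient(loss, model.trainable_weights))
        observation, reward, done, _ = env.step(action)
        actions.append(action)
        rewards.append(reward)
        if done:
            break
    return {'actions': actions, 'rewards': rewards,
            'grads': grads, 'target': env.target}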

# Drop the first reward so each step's gradients pair with the reward
# that followed the corresponding action.
rewards = result['rewards'][1:]
for reward, grad in zip(rewards, result['grads']):
    # Scale the step's gradients by how far its reward exceeds the 0.5
    # baseline, then let SGD apply the resulting update.
    update = [(reward - 0.5) * g for g in grad]
    optimizer.apply_gradients(zip(update, model.trainable_weights))
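A loop like this can be repeated over many episodes to keep improving the policy. A hypothetical outer training loop, assuming the env, model, optimizer and play from above are in scope (the 50-episode count is illustrative, not from the original):

for episode in range(50):
    result = play(env, model)
    # Same REINFORCE-style update as above, once per episode.
    rewards = result['rewards'][1:]
    for reward, grad in zip(rewards, result['grads']):
        update = [(reward - 0.5) * g for g in grad]
        optimizer.apply_gradients(zip(update, model.trainable_weights))
    print(episode, sum(result['rewards']))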