import numpy as np
import tensorflow as tf
# Tiny policy network: one scalar input feature -> two action logits
# (one logit per discrete action: "step down" / "step up").
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(2, input_shape=(1,)))
def play(env, model, step=100, max_steps=200, bound=1000):
    """Run one episode of a guessing game, collecting REINFORCE-style gradients.

    At each step the policy `model` maps an encoded observation to two
    logits, an action index is sampled, and the current guess `action`
    is nudged down (index 0) or up (index 1) by `step`, clamped to
    [-bound, bound].

    Args:
        env: environment with `reset()` and Gym-style
            `step(np.ndarray) -> (observation, reward, done, info)`;
            `info` is assumed to carry the hidden target under the key
            'number'.  # NOTE(review): schema inferred from usage below.
        model: Keras model producing shape (1, 2) logits for a (1, 1) input.
        step: magnitude of each guess adjustment (default 100, as before).
        max_steps: maximum number of environment steps (default 200).
        bound: symmetric clamp applied to the guess (default 1000).

    Returns:
        dict with keys:
            'actions': guesses taken after each non-terminal step,
            'rewards': per-step rewards (may be one longer than 'actions'
                       because the terminal step records a reward but no
                       new action),
            'grads':   per-step gradients of -log pi(a|s) w.r.t. the
                       model's trainable weights,
            'target':  info['number'] from the last step taken.
    """
    env.reset()
    action = 0.0
    grads = []
    rewards = []
    actions = []
    info = {}  # overwritten on the first step; max_steps >= 1 is assumed
    for _ in range(max_steps):
        observation, reward, done, info = env.step(np.array([action]))
        rewards.append(reward)
        if done:
            break
        # Encode the observation as a single signed feature.
        # Presumably observation == 1 means "guess too low" — TODO confirm
        # against the environment's documentation.
        x = np.array([[1.0 if observation == 1 else -1.0]])
        with tf.GradientTape() as tape:
            logits = model(x)
            # Sample one action from the categorical policy; sampling is
            # non-differentiable, gradients flow only through the logits.
            index = tf.random.categorical(logits, num_samples=1)
            index = index[:, 0]
            # Cross-entropy of the sampled action == -log pi(a|s); its
            # gradient is the REINFORCE direction (before reward weighting).
            neg_logprob = tf.nn.sparse_softmax_cross_entropy_with_logits(
                index, logits)
        grad = tape.gradient(neg_logprob, model.trainable_weights)
        grads.append(grad)
        # Move the guess toward the sampled direction, clamped to the bound.
        if index.numpy()[0] == 0:
            action -= step
        else:
            action += step
        action = max(-bound, min(bound, action))
        actions.append(action)
    return {
        'actions': actions,
        'rewards': rewards,
        'grads': grads,
        'target': info['number'],
    }
import matplotlib.pyplot as plt

# Run one episode and plot the guess trajectory against the hidden target.
result = play(env, model)

actions = result['actions']
target = result['target']

t = np.arange(len(actions))
plt.plot(t, actions)
# Dashed reference line at the target value, spanning exactly the plotted
# range (the original hard-coded x=199 even when the episode ended early).
plt.plot([0, max(len(actions) - 1, 0)], [target, target],
         linestyle='dashed', color='lightgrey')
plt.ylim(-1000, 1000)
