Commit 3287af7d authored by 高雅喆's avatar 高雅喆

“test”

parent 2e92b377
This diff is collapsed.
from tf_agents.environments import suite_gym
from tf_agents.environments import tf_py_environment
from tf_agents.networks import q_network
import tensorflow as tf
from tf_agents.agents.dqn import dqn_agent
from tf_agents.policies import random_tf_policy
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.trajectories import trajectory
from tf_agents.utils import common
from tf_agents.environments import py_environment
from tf_agents.specs import array_spec
import numpy as np
from tf_agents.trajectories import time_step as ts
tf.compat.v1.enable_v2_behavior()
class CardGameEnv(py_environment.PyEnvironment):
    """Single-player card game, loosely modelled on Blackjack.

    The observation is the running sum of the cards drawn so far, as an int32
    array of shape ``(1,)``.  Action ``0`` draws a card worth 1..10 (uniformly
    at random); action ``1`` ends the round.  An episode terminates when the
    player ends the round or when the sum reaches 21 or more.  The terminal
    reward is ``sum - 21`` when the sum is at most 21 and ``-21`` otherwise;
    every non-terminal step has reward 0.
    """

    def __init__(self):
        # Run the base-class constructor so whatever state PyEnvironment
        # keeps for itself is set up before the environment is used.
        super(CardGameEnv, self).__init__()
        self._action_spec = array_spec.BoundedArraySpec(
            shape=(), dtype=np.int32, minimum=0, maximum=1, name='action')
        self._observation_spec = array_spec.BoundedArraySpec(
            shape=(1,), dtype=np.int32, minimum=0, name='observation')
        self._state = 0
        self._episode_ended = False

    def action_spec(self):
        """Return the spec of the scalar action (0 = draw, 1 = end round)."""
        return self._action_spec

    def observation_spec(self):
        """Return the spec of the observation (the current card sum)."""
        return self._observation_spec

    def _reset(self):
        """Start a new episode with an empty hand and return its first step."""
        self._state = 0
        self._episode_ended = False
        return ts.restart(np.array([self._state], dtype=np.int32))

    def _step(self, action):
        """Apply ``action`` and return the resulting time step.

        Raises:
            ValueError: if ``action`` is neither 0 nor 1.
        """
        if self._episode_ended:
            # The last action ended the episode. Ignore the current action
            # and start a new episode.
            return self.reset()

        if action == 1:
            self._episode_ended = True
        elif action == 0:
            self._state += np.random.randint(1, 11)
        else:
            raise ValueError('`action` should be 0 or 1.')

        if self._state >= 21:
            # Reaching or passing 21 ends the round as well, so episodes
            # cannot go on forever.  Record it, otherwise the next call would
            # keep drawing on top of an already finished episode instead of
            # resetting.
            self._episode_ended = True

        observation = np.array([self._state], dtype=np.int32)
        if self._episode_ended:
            reward = self._state - 21 if self._state <= 21 else -21
            return ts.termination(observation, reward)
        return ts.transition(observation, reward=0.0, discount=1.0)
# Hyperparameters for data collection, training and evaluation.
num_iterations =3000 # @param
initial_collect_steps = 1000 # @param
collect_steps_per_iteration = 1 # @param
replay_buffer_capacity = 100000 # @param
# Passed to QNetwork below as fc_layer_params.
fc_layer_params = (100,)
batch_size = 64 # @param
learning_rate = 1e-3 # @param
log_interval = 200 # @param
num_eval_episodes = 10 # @param
eval_interval = 1000 # @param
# Sanity check: build one environment, print its specs, then print the result
# of a reset followed by a single step with action 1.
env = CardGameEnv()
env.reset()
print('Observation Spec:')
print(env.time_step_spec().observation)
print('Action Spec:')
print(env.action_spec())
time_step = env.reset()
print('Time step:')
print(time_step)
action = 1
next_time_step = env.step(action)
print('Next time step:')
print(next_time_step)
# Separate environment instances for training and evaluation, each wrapped
# in a TFPyEnvironment.
train_py_env = CardGameEnv()
eval_py_env = CardGameEnv()
train_env = tf_py_environment.TFPyEnvironment(train_py_env)
eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)
# Agent: a DQN agent whose Q-network is built from the training environment's
# observation and action specs.
q_net = q_network.QNetwork(
train_env.observation_spec(),
train_env.action_spec(),
fc_layer_params=fc_layer_params)
optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)
# Counter handed to the agent as train_step_counter; the training loop reads
# it back through tf_agent.train_step_counter.
train_step_counter = tf.compat.v2.Variable(0)
tf_agent = dqn_agent.DqnAgent(
train_env.time_step_spec(),
train_env.action_spec(),
q_network=q_net,
optimizer=optimizer,
td_errors_loss_fn=dqn_agent.element_wise_squared_loss,
train_step_counter=train_step_counter)
tf_agent.initialize()
# Policies: the agent's evaluation and collection policies, plus a random
# policy used for the initial replay-buffer fill.
eval_policy = tf_agent.policy
collect_policy = tf_agent.collect_policy
random_policy = random_tf_policy.RandomTFPolicy(train_env.time_step_spec(),
train_env.action_spec())
#Metrics and Evaluation
def compute_avg_return(environment, policy, num_episodes=10):
    """Return the mean undiscounted episode return of ``policy``.

    Runs ``num_episodes`` full episodes on ``environment`` (resetting it
    before each one), sums the rewards of every step, and averages the
    per-episode sums.  The averaged value is expected to expose ``.numpy()``
    and to be indexable; its first element is returned.
    """
    return_sum = 0.0
    for _episode in range(num_episodes):
        current = environment.reset()
        collected = 0.0
        while True:
            if current.is_last():
                break
            decision = policy.action(current)
            current = environment.step(decision.action)
            collected += current.reward
        return_sum += collected

    mean_return = return_sum / num_episodes
    return mean_return.numpy()[0]
# Replay buffer: stores data matching the agent's collect_data_spec, sized by
# the training environment's batch size and replay_buffer_capacity.
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
data_spec=tf_agent.collect_data_spec,
batch_size=train_env.batch_size,
max_length=replay_buffer_capacity)
#Data Collection
def collect_step(environment, policy):
    """Take one step in ``environment`` using ``policy`` and record it.

    The transition (current time step, chosen action step, resulting time
    step) is converted to a trajectory and appended to the module-level
    ``replay_buffer``.
    """
    before = environment.current_time_step()
    decision = policy.action(before)
    after = environment.step(decision.action)
    # Add trajectory to the replay buffer
    replay_buffer.add_batch(
        trajectory.from_transition(before, decision, after))
# Fill the replay buffer with experience gathered by the random policy.
for _ in range(initial_collect_steps):
    collect_step(train_env, random_policy)

# Dataset generates trajectories with shape [Bx2x...]
dataset = replay_buffer.as_dataset(
    num_parallel_calls=3,
    sample_batch_size=batch_size,
    num_steps=2,
).prefetch(3)
iterator = iter(dataset)

# Training the agent: wrap train() with common.function.
tf_agent.train = common.function(tf_agent.train)

# Reset the train step
tf_agent.train_step_counter.assign(0)

# Evaluate the agent's policy once before training.
avg_return = compute_avg_return(eval_env, tf_agent.policy, num_eval_episodes)
returns = [avg_return]

for _ in range(num_iterations):

    # Collect a few steps using collect_policy and save to the replay buffer.
    for _ in range(collect_steps_per_iteration):
        collect_step(train_env, tf_agent.collect_policy)

    # Sample a batch of data from the buffer and update the agent's network.
    experience, unused_info = next(iterator)
    train_loss = tf_agent.train(experience)

    step = tf_agent.train_step_counter.numpy()

    if step % log_interval == 0:
        print('step = {0}: loss = {1}'.format(step, train_loss.loss))

    if step % eval_interval == 0:
        avg_return = compute_avg_return(
            eval_env, tf_agent.policy, num_eval_episodes)
        print('step = {0}: Average Return = {1}'.format(step, avg_return))
        returns.append(avg_return)

# Training steps at which the entries of `returns` were recorded; printed
# here rather than plotted.
steps = range(0, num_iterations + 1, eval_interval)
print("-" * 100)
print(steps, returns)
from tf_agents.environments import suite_gym
from tf_agents.environments import tf_py_environment
from tf_agents.networks import q_network
import tensorflow as tf
from tf_agents.agents.dqn import dqn_agent
from tf_agents.policies import random_tf_policy
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.trajectories import trajectory
from tf_agents.utils import common
tf.compat.v1.enable_v2_behavior()
# CartPole-v0 environment, loaded by name through suite_gym below.
env_name = 'CartPole-v0'
# Hyperparameters for data collection, training and evaluation.
num_iterations =1000 # @param
initial_collect_steps = 1000 # @param
collect_steps_per_iteration = 1 # @param
replay_buffer_capacity = 100000 # @param
# Passed to QNetwork below as fc_layer_params.
fc_layer_params = (100,)
batch_size = 64 # @param
learning_rate = 1e-3 # @param
log_interval = 200 # @param
num_eval_episodes = 10 # @param
eval_interval = 1000 # @param
# Sanity check: load one environment, print its specs, then print the result
# of a reset followed by a single step with action 1.
env = suite_gym.load(env_name)
env.reset()
print('Observation Spec:')
print(env.time_step_spec().observation)
print('Action Spec:')
print(env.action_spec())
time_step = env.reset()
print('Time step:')
print(time_step)
action = 1
next_time_step = env.step(action)
print('Next time step:')
print(next_time_step)
# Separate environment instances for training and evaluation, each wrapped
# in a TFPyEnvironment.
train_py_env = suite_gym.load(env_name)
eval_py_env = suite_gym.load(env_name)
train_env = tf_py_environment.TFPyEnvironment(train_py_env)
eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)
# Agent: a DQN agent whose Q-network is built from the training environment's
# observation and action specs.
q_net = q_network.QNetwork(
train_env.observation_spec(),
train_env.action_spec(),
fc_layer_params=fc_layer_params)
optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)
# Counter handed to the agent as train_step_counter; the training loop reads
# it back through tf_agent.train_step_counter.
train_step_counter = tf.compat.v2.Variable(0)
tf_agent = dqn_agent.DqnAgent(
train_env.time_step_spec(),
train_env.action_spec(),
q_network=q_net,
optimizer=optimizer,
td_errors_loss_fn=dqn_agent.element_wise_squared_loss,
train_step_counter=train_step_counter)
tf_agent.initialize()
# Policies: the agent's evaluation and collection policies, plus a random
# policy used for the initial replay-buffer fill.
eval_policy = tf_agent.policy
collect_policy = tf_agent.collect_policy
random_policy = random_tf_policy.RandomTFPolicy(train_env.time_step_spec(),
train_env.action_spec())
#Metrics and Evaluation
def compute_avg_return(environment, policy, num_episodes=10):
    """Return the mean undiscounted episode return of ``policy``.

    Runs ``num_episodes`` full episodes on ``environment`` (resetting it
    before each one), printing every action chosen during evaluation, and
    averages the per-episode reward sums.  The averaged value is expected to
    expose ``.numpy()`` and to be indexable; its first element is returned.
    """
    return_sum = 0.0
    for _episode in range(num_episodes):
        current = environment.reset()
        collected = 0.0
        while True:
            if current.is_last():
                break
            decision = policy.action(current)
            # Debug trace of the action picked at each evaluation step.
            print("print eval action", "-" * 100)
            print(decision.action)
            current = environment.step(decision.action)
            collected += current.reward
        return_sum += collected

    mean_return = return_sum / num_episodes
    return mean_return.numpy()[0]
# Replay buffer: stores data matching the agent's collect_data_spec, sized by
# the training environment's batch size and replay_buffer_capacity.
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
data_spec=tf_agent.collect_data_spec,
batch_size=train_env.batch_size,
max_length=replay_buffer_capacity)
#Data Collection
def collect_step(environment, policy):
    """Take one step in ``environment`` using ``policy`` and record it.

    The transition (current time step, chosen action step, resulting time
    step) is converted to a trajectory and appended to the module-level
    ``replay_buffer``.
    """
    before = environment.current_time_step()
    decision = policy.action(before)
    after = environment.step(decision.action)
    # Add trajectory to the replay buffer
    replay_buffer.add_batch(
        trajectory.from_transition(before, decision, after))
# Fill the replay buffer with experience gathered by the random policy.
for _ in range(initial_collect_steps):
    collect_step(train_env, random_policy)

# Dataset generates trajectories with shape [Bx2x...]
dataset = replay_buffer.as_dataset(
    num_parallel_calls=3,
    sample_batch_size=batch_size,
    num_steps=2,
).prefetch(3)
iterator = iter(dataset)

# Training the agent: wrap train() with common.function.
tf_agent.train = common.function(tf_agent.train)

# Reset the train step
tf_agent.train_step_counter.assign(0)

# The pre-training evaluation is skipped in this script, so `returns` only
# receives the evaluations made inside the training loop.
returns = []

for _ in range(num_iterations):

    # Collect a few steps using collect_policy and save to the replay buffer.
    for _ in range(collect_steps_per_iteration):
        collect_step(train_env, tf_agent.collect_policy)

    # Sample a batch of data from the buffer and update the agent's network.
    experience, unused_info = next(iterator)
    train_loss = tf_agent.train(experience)

    step = tf_agent.train_step_counter.numpy()

    if step % log_interval == 0:
        print('step = {0}: loss = {1}'.format(step, train_loss.loss))

    if step % eval_interval == 0:
        avg_return = compute_avg_return(
            eval_env, tf_agent.policy, num_eval_episodes)
        print('step = {0}: Average Return = {1}'.format(step, avg_return))
        returns.append(avg_return)

# Evaluation step grid; computed but not used by the prints that follow.
steps = range(0, num_iterations + 1, eval_interval)
print("-" * 100)
print("after training returns")
print(returns)
def helloworld():
    """Print four greeting lines, then the loop counter for i = 1 and i = 2.

    For each ``i`` the function prints ``i <value>`` followed by the bare
    value on its own line.  Returns ``None``.
    """
    print("hello world")
    print("hello world2")
    print("hello world3")
    print("hello world4")
    # The locals a, b and c that used to be assigned here were never read,
    # so they have been dropped.
    for i in range(1, 3):
        print("i", i)
        print(i)
# Run the demo function once at import/execution time.
helloworld()
# Module-level placeholders; nothing in the visible code reads them.
a=1
b=2
c=3
\ No newline at end of file
import numpy as np
import tensorflow as tf
from tf_agents.environments import suite_gym
from tf_agents.environments import tf_py_environment
from tf_agents.policies import random_tf_policy
from tf_agents.metrics import tf_metrics
from tf_agents.drivers import dynamic_episode_driver
tf.compat.v1.enable_v2_behavior()
# TensorFlow drivers demo: run a random policy on CartPole-v0 with a
# DynamicEpisodeDriver and report episode/step counts.
env = suite_gym.load('CartPole-v0')
tf_env = tf_py_environment.TFPyEnvironment(env)
tf_policy = random_tf_policy.RandomTFPolicy(action_spec=tf_env.action_spec(),
time_step_spec=tf_env.time_step_spec())
# Metrics passed to the driver as observers; their results are printed below.
num_episodes = tf_metrics.NumberOfEpisodes()
env_steps = tf_metrics.EnvironmentSteps()
observers = [num_episodes, env_steps]
# The driver is configured with num_episodes=2 per run() call.
driver = dynamic_episode_driver.DynamicEpisodeDriver(
tf_env, tf_policy, observers, num_episodes=2)
# Initial driver.run will reset the environment and initialize the policy.
final_time_step, policy_state = driver.run()
print('final_time_step', final_time_step)
print('Number of Steps: ', env_steps.result().numpy())
print('Number of Episodes: ', num_episodes.result().numpy())
# Continue running from previous state
final_time_step, _ = driver.run(final_time_step, policy_state)
print('final_time_step', final_time_step)
print('Number of Steps: ', env_steps.result().numpy())
print('Number of Episodes: ', num_episodes.result().numpy())
\ No newline at end of file
import tensorflow as tf
from tf_agents.environments import py_environment
from tf_agents.specs import array_spec
import numpy as np
from tf_agents.trajectories import time_step as ts
tf.compat.v1.enable_v2_behavior()
class CardGameEnv(py_environment.PyEnvironment):
    """Single-player card game, loosely modelled on Blackjack.

    The observation is the running sum of the cards drawn so far, as an int32
    array of shape ``(1,)``.  Action ``0`` draws a card worth 1..10 (uniformly
    at random); action ``1`` ends the round.  An episode terminates when the
    player ends the round or when the sum reaches 21 or more.  The terminal
    reward is ``sum - 21`` when the sum is at most 21 and ``-21`` otherwise;
    every non-terminal step has reward 0.
    """

    def __init__(self):
        # Run the base-class constructor so whatever state PyEnvironment
        # keeps for itself is set up before the environment is used.
        super(CardGameEnv, self).__init__()
        self._action_spec = array_spec.BoundedArraySpec(
            shape=(), dtype=np.int32, minimum=0, maximum=1, name='action')
        self._observation_spec = array_spec.BoundedArraySpec(
            shape=(1,), dtype=np.int32, minimum=0, name='observation')
        self._state = 0
        self._episode_ended = False

    def action_spec(self):
        """Return the spec of the scalar action (0 = draw, 1 = end round)."""
        return self._action_spec

    def observation_spec(self):
        """Return the spec of the observation (the current card sum)."""
        return self._observation_spec

    def _reset(self):
        """Start a new episode with an empty hand and return its first step."""
        self._state = 0
        self._episode_ended = False
        return ts.restart(np.array([self._state], dtype=np.int32))

    def _step(self, action):
        """Apply ``action`` and return the resulting time step.

        Raises:
            ValueError: if ``action`` is neither 0 nor 1.
        """
        if self._episode_ended:
            # The last action ended the episode. Ignore the current action
            # and start a new episode.
            return self.reset()

        if action == 1:
            self._episode_ended = True
        elif action == 0:
            self._state += np.random.randint(1, 11)
        else:
            raise ValueError('`action` should be 0 or 1.')

        if self._state >= 21:
            # Reaching or passing 21 ends the round as well, so episodes
            # cannot go on forever.  Record it, otherwise the next call would
            # keep drawing on top of an already finished episode instead of
            # resetting.
            self._episode_ended = True

        observation = np.array([self._state], dtype=np.int32)
        if self._episode_ended:
            reward = self._state - 21 if self._state <= 21 else -21
            return ts.termination(observation, reward)
        return ts.transition(observation, reward=0.0, discount=1.0)
# Action ids accepted by CardGameEnv: 0 draws a card, 1 ends the round.
get_new_card_action = 0
end_round_action = 1

environment = CardGameEnv()

# Start an episode and begin accumulating reward from the first time step.
time_step = environment.reset()
print(time_step)
cumulative_reward = time_step.reward

# Draw three cards in a row, printing every resulting time step.
for _ in range(3):
    time_step = environment.step(get_new_card_action)
    print(time_step)
    cumulative_reward += time_step.reward

# Finish by ending the round and report the accumulated reward.
time_step = environment.step(end_round_action)
print(time_step)
cumulative_reward += time_step.reward

print('Final Reward = ', cumulative_reward)
\ No newline at end of file
from tf_agents.specs import array_spec
from tf_agents.policies import random_py_policy
import numpy as np
from tf_agents.policies import scripted_py_policy
# Random Python policy: request two actions from a RandomPyPolicy built on a
# bounded int32 action spec of shape (1,) with values in [0, 10].
action_spec = array_spec.BoundedArraySpec(shape=(1,), dtype=np.int32, minimum=0, maximum=10)
my_random_py_policy = random_py_policy.RandomPyPolicy(time_step_spec=None,
action_spec=action_spec)
# The policy is called with time_step=None in this demo.
time_step = None
action_step = my_random_py_policy.action(time_step)
print(action_step)
action_step = my_random_py_policy.action(time_step)
print(action_step)
print("*"*100)
# Scripted Python policy: replay a fixed script of (num_repeats, action)
# pairs.
action_spec = array_spec.BoundedArraySpec((2,), np.int32, -10, 10)
action_script = [(1, np.array([5, 2], dtype=np.int32)),
(0, np.array([0, 0], dtype=np.int32)), # Setting `num_repeats` to 0 will skip this action.
(2, np.array([1, 2], dtype=np.int32)),
(1, np.array([3, 4], dtype=np.int32))]
my_scripted_py_policy = scripted_py_policy.ScriptedPyPolicy(
time_step_spec=None, action_spec=action_spec, action_script=action_script)
policy_state = my_scripted_py_policy.get_initial_state()
time_step = None
print('Executing scripted policy...')
# Each call is fed the state returned by the previous call so the policy can
# advance through the script.
action_step = my_scripted_py_policy.action(time_step, policy_state)
print(action_step.action[0])
action_step= my_scripted_py_policy.action(time_step, action_step.state)
print(action_step.action[0])
action_step = my_scripted_py_policy.action(time_step, action_step.state)
print(action_step.action[0])
action_step = my_scripted_py_policy.action(time_step, action_step.state)
print(action_step.action[0])
# Start over from a fresh initial state.
print('Resetting my_scripted_py_policy...')
policy_state = my_scripted_py_policy.get_initial_state()
action_step = my_scripted_py_policy.action(time_step, policy_state)
print(action_step)
print("*"*100)
import tensorflow as tf
import numpy as np
from tf_agents import specs
from tf_agents.agents.dqn import dqn_agent
from tf_agents.drivers import dynamic_step_driver
from tf_agents.environments import suite_gym
from tf_agents.environments import tf_py_environment
from tf_agents.networks import q_network
from tf_agents.replay_buffers import py_uniform_replay_buffer
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step
tf.compat.v1.enable_v2_behavior()
# Creating the buffer: the data spec is a nested structure holding an action
# tensor and a (lidar, camera) pair.
data_spec = (
    tf.TensorSpec([3], tf.float32, 'action'),
    (
        tf.TensorSpec([5], tf.float32, 'lidar'),
        tf.TensorSpec([3, 2], tf.float32, 'camera'),
    ),
)

batch_size = 32
max_length = 1000

replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec, batch_size=batch_size, max_length=max_length)

# Writing to the buffer: build constant tensors (all 1s, 2s and 3s) matching
# each spec, then stack them batch_size times to form one batched item.
action = tf.constant(
    np.full(data_spec[0].shape.as_list(), 1, dtype=np.float32))
lidar = tf.constant(
    np.full(data_spec[1][0].shape.as_list(), 2, dtype=np.float32))
camera = tf.constant(
    np.full(data_spec[1][1].shape.as_list(), 3, dtype=np.float32))

values = (action, (lidar, camera))
values_batched = tf.nest.map_structure(
    lambda tensor: tf.stack([tensor] * batch_size), values)

replay_buffer.add_batch(values_batched)

# Reading from the buffer
# add more items to the buffer before reading
for _ in range(5):
    replay_buffer.add_batch(values_batched)

# Get one sample from the replay buffer with batch size 10 and 1 timestep:
sample = replay_buffer.get_next(sample_batch_size=10, num_steps=1)

# Convert the replay buffer to a tf.data.Dataset and iterate through it
dataset = replay_buffer.as_dataset(sample_batch_size=4, num_steps=2)
iterator = iter(dataset)

print("Iterator trajectories:")
trajectories = []
for _ in range(3):
    t, _ = next(iterator)
    trajectories.append(t)

print(tf.nest.map_structure(lambda item: item.shape, trajectories))

# Read all elements in the replay buffer:
trajectories = replay_buffer.gather_all()

print("Trajectories from gather all:")
print(tf.nest.map_structure(lambda item: item.shape, trajectories))

# PyUniformReplayBuffer
replay_buffer_capacity = 1000*32 # same capacity as the TFUniformReplayBuffer

py_replay_buffer = py_uniform_replay_buffer.PyUniformReplayBuffer(
    capacity=replay_buffer_capacity,
    data_spec=tensor_spec.to_nest_array_spec(data_spec))
# Using replay buffers during training
# Data collection: a DQN agent on CartPole-v0 whose collect policy feeds the
# replay buffer through a step driver.
env = suite_gym.load('CartPole-v0')
tf_env = tf_py_environment.TFPyEnvironment(env)

q_net = q_network.QNetwork(
    tf_env.time_step_spec().observation,
    tf_env.action_spec(),
    fc_layer_params=(100,),
)

agent = dqn_agent.DqnAgent(
    tf_env.time_step_spec(),
    tf_env.action_spec(),
    q_network=q_net,
    optimizer=tf.compat.v1.train.AdamOptimizer(0.001),
)

replay_buffer_capacity = 1000

replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    agent.collect_data_spec,
    batch_size=tf_env.batch_size,
    max_length=replay_buffer_capacity,
)

# Add an observer that adds to the replay buffer:
replay_observer = [replay_buffer.add_batch]

collect_steps_per_iteration = 10
# Note: this holds the value returned by run(), i.e. the driver is executed
# right here.
collect_op = dynamic_step_driver.DynamicStepDriver(
    tf_env,
    agent.collect_policy,
    observers=replay_observer,
    num_steps=collect_steps_per_iteration,
).run()

# Reading data for a train step
# Read the replay buffer as a Dataset,
# read batches of 4 elements, each with 2 timesteps:
dataset = replay_buffer.as_dataset(sample_batch_size=4, num_steps=2)
iterator = iter(dataset)

num_train_steps = 10

for _ in range(num_train_steps):
    trajectories, _ = next(iterator)
    loss = agent.train(experience=trajectories)
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment