“test”

3287af7d · 高雅喆 · 2e92b377 · 3287af7d · 3287af7d · 3287af7d
Commit 3287af7d authored Jul 08, 2019 by 高雅喆
9 changed files
--- a/gm_train_ddpg.py
+++ b/gm_train_ddpg.py
--- a/rl_21_test.py
+++ b/rl_21_test.py
+from tf_agents.environments import suite_gym
+from tf_agents.environments import tf_py_environment
+from tf_agents.networks import q_network
+import tensorflow as tf
+from tf_agents.agents.dqn import dqn_agent
+from tf_agents.policies import random_tf_policy
+from tf_agents.replay_buffers import tf_uniform_replay_buffer
+from tf_agents.trajectories import trajectory
+from tf_agents.utils import common
+from tf_agents.environments import py_environment
+from tf_agents.specs import array_spec
+import numpy as np
+from tf_agents.trajectories import time_step as ts
+tf.compat.v1.enable_v2_behavior()
+
+class CardGameEnv(py_environment.PyEnvironment):
+  """Single-player card game in which the total should approach 21.
+
+  Action 0 draws a card worth 1 to 10 and adds it to the running total, which
+  is the only observation. Action 1 ends the round. The episode terminates
+  when the round is ended or the total reaches 21 or more. The terminal reward
+  is `total - 21` (so at most 0) if the total does not exceed 21, and -21
+  otherwise; every non-terminal step has reward 0.
+  """
+
+  # Total the player tries to reach without exceeding it.
+  _TARGET = 21
+
+  def __init__(self):
+    # Run the base-class constructor so its own bookkeeping is initialised.
+    super(CardGameEnv, self).__init__()
+    self._action_spec = array_spec.BoundedArraySpec(
+        shape=(), dtype=np.int32, minimum=0, maximum=1, name='action')
+    self._observation_spec = array_spec.BoundedArraySpec(
+        shape=(1,), dtype=np.int32, minimum=0, name='observation')
+    self._state = 0
+    self._episode_ended = False
+
+  def action_spec(self):
+    """Return the spec of the scalar action: 0 (draw) or 1 (end round)."""
+    return self._action_spec
+
+  def observation_spec(self):
+    """Return the spec of the one-element observation (the running total)."""
+    return self._observation_spec
+
+  def _observation(self):
+    """Return the running total as a one-element int32 array."""
+    return np.array([self._state], dtype=np.int32)
+
+  def _reset(self):
+    """Start a new round with a total of 0 and return the first time step."""
+    self._state = 0
+    self._episode_ended = False
+    return ts.restart(self._observation())
+
+  def _step(self, action):
+    """Apply `action` and return the resulting time step.
+
+    Raises:
+      ValueError: if `action` is neither 0 nor 1.
+    """
+    if self._episode_ended:
+      # The last action ended the episode. Ignore the current action and start
+      # a new episode.
+      return self.reset()
+
+    if action == 1:
+      # The player ends the round.
+      self._episode_ended = True
+    elif action == 0:
+      # Draw a card worth 1..10 (the upper bound of randint is exclusive).
+      self._state += np.random.randint(1, 11)
+    else:
+      raise ValueError('`action` should be 0 or 1.')
+
+    if self._episode_ended or self._state >= self._TARGET:
+      # Reaching or passing the target also ends the episode; record that so
+      # the next call starts a new round instead of continuing this one.
+      self._episode_ended = True
+      if self._state <= self._TARGET:
+        reward = self._state - self._TARGET
+      else:
+        reward = -self._TARGET
+      return ts.termination(self._observation(), reward)
+
+    return ts.transition(self._observation(), reward=0.0, discount=1.0)
+
+
+# Hyperparameters for the DQN run on the card game.
+num_iterations =3000  # @param
+
+initial_collect_steps = 1000  # @param
+collect_steps_per_iteration = 1  # @param
+replay_buffer_capacity = 100000  # @param
+
+# Passed to the Q-network as its `fc_layer_params`.
+fc_layer_params = (100,)
+
+batch_size = 64  # @param
+learning_rate = 1e-3  # @param
+log_interval = 200  # @param
+
+num_eval_episodes = 10  # @param
+eval_interval = 1000  # @param
+
+
+# Smoke-test the raw Python environment: print its specs, reset it, and take
+# a single step.
+env = CardGameEnv()
+env.reset()
+print('Observation Spec:')
+print(env.time_step_spec().observation)
+print('Action Spec:')
+print(env.action_spec())
+
+time_step = env.reset()
+print('Time step:')
+print(time_step)
+
+# Action 1 ends the round in CardGameEnv.
+action = 1
+
+next_time_step = env.step(action)
+print('Next time step:')
+print(next_time_step)
+
+
+# Separate environment instances for training and for evaluation, each
+# wrapped in a TFPyEnvironment.
+train_py_env = CardGameEnv()
+eval_py_env = CardGameEnv()
+
+train_env = tf_py_environment.TFPyEnvironment(train_py_env)
+eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)
+
+
+
+# Agent: a DQN agent built around a Q-network sized from the training
+# environment's specs.
+q_net = q_network.QNetwork(
+    train_env.observation_spec(),
+    train_env.action_spec(),
+    fc_layer_params=fc_layer_params)
+
+optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)
+
+# Counter handed to the agent as its `train_step_counter`.
+train_step_counter = tf.compat.v2.Variable(0)
+
+tf_agent = dqn_agent.DqnAgent(
+    train_env.time_step_spec(),
+    train_env.action_spec(),
+    q_network=q_net,
+    optimizer=optimizer,
+    td_errors_loss_fn=dqn_agent.element_wise_squared_loss,
+    train_step_counter=train_step_counter)
+tf_agent.initialize()
+
+
+# Policies: the agent's evaluation and collection policies, plus a random
+# policy used to seed the replay buffer.
+
+eval_policy = tf_agent.policy
+collect_policy = tf_agent.collect_policy
+random_policy = random_tf_policy.RandomTFPolicy(train_env.time_step_spec(),
+                                                train_env.action_spec())
+
+
+#Metrics and Evaluation
+def compute_avg_return(environment, policy, num_episodes=10):
+  """Run `policy` for `num_episodes` episodes and return the mean return.
+
+  Each episode starts with `environment.reset()` and runs until the time step
+  reports `is_last()`. Rewards are summed without discounting.
+
+  Args:
+    environment: environment providing `reset()` and `step(action)`; its
+      rewards are expected to support `.numpy()` and indexing (presumably a
+      batched TF environment with batch size 1 - confirm against callers).
+    policy: policy providing `action(time_step)`.
+    num_episodes: number of episodes to average over; must be positive.
+
+  Returns:
+    The first element of the averaged return, as produced by `.numpy()[0]`.
+
+  Raises:
+    ValueError: if `num_episodes` is not positive.
+  """
+  if num_episodes <= 0:
+    # Otherwise the division below fails, or a plain float reaches `.numpy()`.
+    raise ValueError(
+        '`num_episodes` must be positive, got {}.'.format(num_episodes))
+
+  total_return = 0.0
+  for _ in range(num_episodes):
+    time_step = environment.reset()
+    episode_return = 0.0
+
+    while not time_step.is_last():
+      action_step = policy.action(time_step)
+      time_step = environment.step(action_step.action)
+      episode_return += time_step.reward
+    total_return += episode_return
+
+  avg_return = total_return / num_episodes
+  return avg_return.numpy()[0]
+
+
+
+# Replay buffer that stores the trajectories gathered by `collect_step`.
+replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
+    data_spec=tf_agent.collect_data_spec,
+    batch_size=train_env.batch_size,
+    max_length=replay_buffer_capacity)
+
+
+# Data collection
+def collect_step(environment, policy, buffer=None):
+  """Take one step in `environment` with `policy` and store the transition.
+
+  Args:
+    environment: environment providing `current_time_step()` and `step()`.
+    policy: policy providing `action(time_step)`.
+    buffer: object with an `add_batch(trajectory)` method. Defaults to the
+      module-level `replay_buffer`, which keeps existing two-argument calls
+      working unchanged.
+  """
+  if buffer is None:
+    buffer = replay_buffer
+
+  time_step = environment.current_time_step()
+  action_step = policy.action(time_step)
+  next_time_step = environment.step(action_step.action)
+  traj = trajectory.from_transition(time_step, action_step, next_time_step)
+
+  # Add trajectory to the replay buffer
+  buffer.add_batch(traj)
+
+
+# Seed the replay buffer with steps taken by the random policy.
+for _ in range(initial_collect_steps):
+  collect_step(train_env, random_policy)
+
+# Dataset generates trajectories with shape [Bx2x...]
+dataset = replay_buffer.as_dataset(
+    num_parallel_calls=3, sample_batch_size=batch_size, num_steps=2).prefetch(3)
+
+iterator = iter(dataset)
+
+
+# Training the agent
+# Replace the train method with its common.function-wrapped version.
+tf_agent.train = common.function(tf_agent.train)
+
+# Reset the train step
+tf_agent.train_step_counter.assign(0)
+
+# Evaluate the agent's policy once before training.
+avg_return = compute_avg_return(eval_env, tf_agent.policy, num_eval_episodes)
+returns = [avg_return]
+
+for _ in range(num_iterations):
+
+  # Collect a few steps using collect_policy and save to the replay buffer.
+  for _ in range(collect_steps_per_iteration):
+    collect_step(train_env, tf_agent.collect_policy)
+
+  # Sample a batch of data from the buffer and update the agent's network.
+  experience, unused_info = next(iterator)
+  train_loss = tf_agent.train(experience)
+
+  # Read the agent's train step counter to drive logging and evaluation.
+  step = tf_agent.train_step_counter.numpy()
+
+  if step % log_interval == 0:
+    print('step = {0}: loss = {1}'.format(step, train_loss.loss))
+
+  if step % eval_interval == 0:
+    avg_return = compute_avg_return(eval_env, tf_agent.policy, num_eval_episodes)
+    print('step = {0}: Average Return = {1}'.format(step, avg_return))
+    returns.append(avg_return)
+
+
+# Report the evaluation schedule and the collected returns (nothing is
+# plotted here; the values are only printed).
+steps = range(0, num_iterations + 1, eval_interval)
+print("-"*100)
+print(steps,returns)
+
+
+
--- a/rl_CartPole_test.py
+++ b/rl_CartPole_test.py
+from tf_agents.environments import suite_gym
+from tf_agents.environments import tf_py_environment
+from tf_agents.networks import q_network
+import tensorflow as tf
+from tf_agents.agents.dqn import dqn_agent
+from tf_agents.policies import random_tf_policy
+from tf_agents.replay_buffers import tf_uniform_replay_buffer
+from tf_agents.trajectories import trajectory
+from tf_agents.utils import common
+
+tf.compat.v1.enable_v2_behavior()
+
+# CartPole-v0 Environment
+env_name = 'CartPole-v0'
+
+# Hyperparameters for the DQN run.
+num_iterations =1000  # @param
+
+initial_collect_steps = 1000  # @param
+collect_steps_per_iteration = 1  # @param
+replay_buffer_capacity = 100000  # @param
+
+# Passed to the Q-network as its `fc_layer_params`.
+fc_layer_params = (100,)
+
+batch_size = 64  # @param
+learning_rate = 1e-3  # @param
+log_interval = 200  # @param
+
+num_eval_episodes = 10  # @param
+eval_interval = 1000  # @param
+
+
+# Smoke-test the raw environment: print its specs, reset it, and take a
+# single step.
+env = suite_gym.load(env_name)
+env.reset()
+print('Observation Spec:')
+print(env.time_step_spec().observation)
+print('Action Spec:')
+print(env.action_spec())
+
+time_step = env.reset()
+print('Time step:')
+print(time_step)
+
+# Arbitrary action used for the single test step.
+action = 1
+
+next_time_step = env.step(action)
+print('Next time step:')
+print(next_time_step)
+
+
+# Separate environment instances for training and for evaluation, each
+# wrapped in a TFPyEnvironment.
+train_py_env = suite_gym.load(env_name)
+eval_py_env = suite_gym.load(env_name)
+
+train_env = tf_py_environment.TFPyEnvironment(train_py_env)
+eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)
+
+
+
+
+# Agent: a DQN agent built around a Q-network sized from the training
+# environment's specs.
+q_net = q_network.QNetwork(
+    train_env.observation_spec(),
+    train_env.action_spec(),
+    fc_layer_params=fc_layer_params)
+
+optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)
+
+# Counter handed to the agent as its `train_step_counter`.
+train_step_counter = tf.compat.v2.Variable(0)
+
+tf_agent = dqn_agent.DqnAgent(
+    train_env.time_step_spec(),
+    train_env.action_spec(),
+    q_network=q_net,
+    optimizer=optimizer,
+    td_errors_loss_fn=dqn_agent.element_wise_squared_loss,
+    train_step_counter=train_step_counter)
+tf_agent.initialize()
+
+
+# Policies: the agent's evaluation and collection policies, plus a random
+# policy used to seed the replay buffer.
+
+eval_policy = tf_agent.policy
+collect_policy = tf_agent.collect_policy
+random_policy = random_tf_policy.RandomTFPolicy(train_env.time_step_spec(),
+                                                train_env.action_spec())
+
+
+#Metrics and Evaluation
+def compute_avg_return(environment, policy, num_episodes=10):
+  """Run `policy` for `num_episodes` episodes and return the mean return.
+
+  Each episode starts with `environment.reset()` and runs until the time step
+  reports `is_last()`. Rewards are summed without discounting. Every chosen
+  action is printed as it is taken.
+
+  Args:
+    environment: environment providing `reset()` and `step(action)`; its
+      rewards are expected to support `.numpy()` and indexing (presumably a
+      batched TF environment with batch size 1 - confirm against callers).
+    policy: policy providing `action(time_step)`.
+    num_episodes: number of episodes to average over; must be positive.
+
+  Returns:
+    The first element of the averaged return, as produced by `.numpy()[0]`.
+
+  Raises:
+    ValueError: if `num_episodes` is not positive.
+  """
+  if num_episodes <= 0:
+    # Otherwise the division below fails, or a plain float reaches `.numpy()`.
+    raise ValueError(
+        '`num_episodes` must be positive, got {}.'.format(num_episodes))
+
+  total_return = 0.0
+  for _ in range(num_episodes):
+    time_step = environment.reset()
+    episode_return = 0.0
+
+    while not time_step.is_last():
+      action_step = policy.action(time_step)
+      # Trace the action chosen at every evaluation step.
+      print("print eval action","-"*100)
+      print(action_step.action)
+      time_step = environment.step(action_step.action)
+      episode_return += time_step.reward
+    total_return += episode_return
+
+  avg_return = total_return / num_episodes
+  return avg_return.numpy()[0]
+
+
+# Replay buffer that stores the trajectories gathered by `collect_step`.
+replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
+    data_spec=tf_agent.collect_data_spec,
+    batch_size=train_env.batch_size,
+    max_length=replay_buffer_capacity)
+
+
+# Data collection
+def collect_step(environment, policy, buffer=None):
+  """Take one step in `environment` with `policy` and store the transition.
+
+  Args:
+    environment: environment providing `current_time_step()` and `step()`.
+    policy: policy providing `action(time_step)`.
+    buffer: object with an `add_batch(trajectory)` method. Defaults to the
+      module-level `replay_buffer`, which keeps existing two-argument calls
+      working unchanged.
+  """
+  if buffer is None:
+    buffer = replay_buffer
+
+  time_step = environment.current_time_step()
+  action_step = policy.action(time_step)
+  next_time_step = environment.step(action_step.action)
+  traj = trajectory.from_transition(time_step, action_step, next_time_step)
+
+  # Add trajectory to the replay buffer
+  buffer.add_batch(traj)
+
+
+# Seed the replay buffer with steps taken by the random policy.
+for _ in range(initial_collect_steps):
+  collect_step(train_env, random_policy)
+
+# Dataset generates trajectories with shape [Bx2x...]
+dataset = replay_buffer.as_dataset(
+    num_parallel_calls=3, sample_batch_size=batch_size, num_steps=2).prefetch(3)
+
+iterator = iter(dataset)
+
+
+# Training the agent
+# Replace the train method with its common.function-wrapped version.
+tf_agent.train = common.function(tf_agent.train)
+
+# Reset the train step
+tf_agent.train_step_counter.assign(0)
+
+# The evaluation before training is disabled in this script, so `returns`
+# starts empty and only holds evaluations made during training.
+returns = []
+
+for _ in range(num_iterations):
+
+  # Collect a few steps using collect_policy and save to the replay buffer.
+  for _ in range(collect_steps_per_iteration):
+    collect_step(train_env, tf_agent.collect_policy)
+
+  # Sample a batch of data from the buffer and update the agent's network.
+  experience, unused_info = next(iterator)
+  train_loss = tf_agent.train(experience)
+
+  # Read the agent's train step counter to drive logging and evaluation.
+  step = tf_agent.train_step_counter.numpy()
+
+  if step % log_interval == 0:
+    print('step = {0}: loss = {1}'.format(step, train_loss.loss))
+
+  if step % eval_interval == 0:
+    avg_return = compute_avg_return(eval_env, tf_agent.policy, num_eval_episodes)
+    print('step = {0}: Average Return = {1}'.format(step, avg_return))
+    returns.append(avg_return)
+
+
+# Report the collected returns (nothing is plotted here). `steps` is computed
+# but not printed; it has one more entry than `returns` because the
+# evaluation before training is skipped.
+steps = range(0, num_iterations + 1, eval_interval)
+print("-"*100)
+print("after training returns")
+print(returns)
+
+
+
--- a/test.py
+++ b/test.py
+def helloworld():
+    """Print the four greeting lines, one per line."""
+    for suffix in ("", "2", "3", "4"):
+        print("hello world" + suffix)
+
+
+# Sample module-level values; assigned once instead of three times over.
+a = 1
+b = 2
+c = 3
+
+for i in range(1, 3):
+    print("i", i)
+# `i` keeps its last loop value (2) after the loop ends.
+print(i)
+
+helloworld()
\ No newline at end of file
--- a/test_drivers.py
+++ b/test_drivers.py
+import numpy as np
+import tensorflow as tf
+from tf_agents.environments import suite_gym
+from tf_agents.environments import tf_py_environment
+from tf_agents.policies import random_tf_policy
+from tf_agents.metrics import tf_metrics
+from tf_agents.drivers import dynamic_episode_driver
+
+tf.compat.v1.enable_v2_behavior()
+
+
+# TensorFlow drivers: run a random policy on CartPole with an episode driver.
+env = suite_gym.load('CartPole-v0')
+tf_env = tf_py_environment.TFPyEnvironment(env)
+
+tf_policy = random_tf_policy.RandomTFPolicy(action_spec=tf_env.action_spec(),
+                                            time_step_spec=tf_env.time_step_spec())
+
+
+# Metrics passed to the driver as observers; their results are printed below.
+num_episodes = tf_metrics.NumberOfEpisodes()
+env_steps = tf_metrics.EnvironmentSteps()
+observers = [num_episodes, env_steps]
+driver = dynamic_episode_driver.DynamicEpisodeDriver(
+    tf_env, tf_policy, observers, num_episodes=2)
+
+# Initial driver.run will reset the environment and initialize the policy.
+final_time_step, policy_state = driver.run()
+
+print('final_time_step', final_time_step)
+print('Number of Steps: ', env_steps.result().numpy())
+print('Number of Episodes: ', num_episodes.result().numpy())
+
+
+# Continue running from previous state
+final_time_step, _ = driver.run(final_time_step, policy_state)
+
+# The same metric objects are read again after the second run.
+print('final_time_step', final_time_step)
+print('Number of Steps: ', env_steps.result().numpy())
+print('Number of Episodes: ', num_episodes.result().numpy())
\ No newline at end of file
--- a/test_environments.py
+++ b/test_environments.py
+import tensorflow as tf
+from tf_agents.environments import py_environment
+from tf_agents.specs import array_spec
+import numpy as np
+from tf_agents.trajectories import time_step as ts
+tf.compat.v1.enable_v2_behavior()
+
+class CardGameEnv(py_environment.PyEnvironment):
+  """Single-player card game in which the total should approach 21.
+
+  Action 0 draws a card worth 1 to 10 and adds it to the running total, which
+  is the only observation. Action 1 ends the round. The episode terminates
+  when the round is ended or the total reaches 21 or more. The terminal reward
+  is `total - 21` (so at most 0) if the total does not exceed 21, and -21
+  otherwise; every non-terminal step has reward 0.
+  """
+
+  # Total the player tries to reach without exceeding it.
+  _TARGET = 21
+
+  def __init__(self):
+    # Run the base-class constructor so its own bookkeeping is initialised.
+    super(CardGameEnv, self).__init__()
+    self._action_spec = array_spec.BoundedArraySpec(
+        shape=(), dtype=np.int32, minimum=0, maximum=1, name='action')
+    self._observation_spec = array_spec.BoundedArraySpec(
+        shape=(1,), dtype=np.int32, minimum=0, name='observation')
+    self._state = 0
+    self._episode_ended = False
+
+  def action_spec(self):
+    """Return the spec of the scalar action: 0 (draw) or 1 (end round)."""
+    return self._action_spec
+
+  def observation_spec(self):
+    """Return the spec of the one-element observation (the running total)."""
+    return self._observation_spec
+
+  def _observation(self):
+    """Return the running total as a one-element int32 array."""
+    return np.array([self._state], dtype=np.int32)
+
+  def _reset(self):
+    """Start a new round with a total of 0 and return the first time step."""
+    self._state = 0
+    self._episode_ended = False
+    return ts.restart(self._observation())
+
+  def _step(self, action):
+    """Apply `action` and return the resulting time step.
+
+    Raises:
+      ValueError: if `action` is neither 0 nor 1.
+    """
+    if self._episode_ended:
+      # The last action ended the episode. Ignore the current action and start
+      # a new episode.
+      return self.reset()
+
+    if action == 1:
+      # The player ends the round.
+      self._episode_ended = True
+    elif action == 0:
+      # Draw a card worth 1..10 (the upper bound of randint is exclusive).
+      self._state += np.random.randint(1, 11)
+    else:
+      raise ValueError('`action` should be 0 or 1.')
+
+    if self._episode_ended or self._state >= self._TARGET:
+      # Reaching or passing the target also ends the episode; record that so
+      # the next call starts a new round instead of continuing this one.
+      self._episode_ended = True
+      if self._state <= self._TARGET:
+        reward = self._state - self._TARGET
+      else:
+        reward = -self._TARGET
+      return ts.termination(self._observation(), reward)
+
+    return ts.transition(self._observation(), reward=0.0, discount=1.0)
+
+
+# Action codes understood by CardGameEnv: 0 draws a card, 1 ends the round.
+get_new_card_action = 0
+end_round_action = 1
+
+# Reset the environment and start accumulating reward from the first step.
+environment = CardGameEnv()
+time_step = environment.reset()
+print(time_step)
+cumulative_reward = time_step.reward
+
+# Request a new card three times, printing each resulting time step.
+for _ in range(3):
+  time_step = environment.step(get_new_card_action)
+  print(time_step)
+  cumulative_reward += time_step.reward
+
+# Finally send the end-of-round action and report the summed reward.
+time_step = environment.step(end_round_action)
+print(time_step)
+cumulative_reward += time_step.reward
+print('Final Reward = ', cumulative_reward)
\ No newline at end of file
--- a/test_policies.py
+++ b/test_policies.py
+from tf_agents.specs import array_spec
+from tf_agents.policies import random_py_policy
+import numpy as np
+
+from tf_agents.policies import scripted_py_policy
+
+# Random Python policy: request two actions and print each action step.
+action_spec = array_spec.BoundedArraySpec(
+    shape=(1,), dtype=np.int32, minimum=0, maximum=10)
+my_random_py_policy = random_py_policy.RandomPyPolicy(
+    time_step_spec=None, action_spec=action_spec)
+time_step = None
+for _ in range(2):
+  action_step = my_random_py_policy.action(time_step)
+  print(action_step)
+
+print("*" * 100)
+
+# Scripted Python policy: driven by a list of (num_repeats, action) pairs.
+action_spec = array_spec.BoundedArraySpec((2,), np.int32, -10, 10)
+action_script = [
+    (1, np.array([5, 2], dtype=np.int32)),
+    # Setting `num_repeats` to 0 will skip this action.
+    (0, np.array([0, 0], dtype=np.int32)),
+    (2, np.array([1, 2], dtype=np.int32)),
+    (1, np.array([3, 4], dtype=np.int32)),
+]
+
+my_scripted_py_policy = scripted_py_policy.ScriptedPyPolicy(
+    time_step_spec=None, action_spec=action_spec, action_script=action_script)
+
+policy_state = my_scripted_py_policy.get_initial_state()
+time_step = None
+print('Executing scripted policy...')
+# Request four actions, feeding each returned state into the next call.
+next_state = policy_state
+for _ in range(4):
+  action_step = my_scripted_py_policy.action(time_step, next_state)
+  print(action_step.action[0])
+  next_state = action_step.state
+
+
+# Start over from a fresh initial state and print the first action step.
+print('Resetting my_scripted_py_policy...')
+policy_state = my_scripted_py_policy.get_initial_state()
+action_step = my_scripted_py_policy.action(time_step, policy_state)
+print(action_step)
+print("*" * 100)
+
+
+
+
+
--- a/test_replay_buffers.py
+++ b/test_replay_buffers.py
+import tensorflow as tf
+import numpy as np
+
+from tf_agents import specs
+from tf_agents.agents.dqn import dqn_agent
+from tf_agents.drivers import dynamic_step_driver
+from tf_agents.environments import suite_gym
+from tf_agents.environments import tf_py_environment
+from tf_agents.networks import q_network
+from tf_agents.replay_buffers import py_uniform_replay_buffer
+from tf_agents.replay_buffers import tf_uniform_replay_buffer
+from tf_agents.specs import tensor_spec
+from tf_agents.trajectories import time_step
+
+tf.compat.v1.enable_v2_behavior()
+
+# Creating the buffer
+
+# Nested spec of one stored item: an action plus a (lidar, camera) pair.
+data_spec =  (
+        tf.TensorSpec([3], tf.float32, 'action'),
+        (
+            tf.TensorSpec([5], tf.float32, 'lidar'),
+            tf.TensorSpec([3, 2], tf.float32, 'camera')
+        )
+)
+
+batch_size = 32
+max_length = 1000
+
+replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
+    data_spec,
+    batch_size=batch_size,
+    max_length=max_length)
+
+# Writing to the buffer
+# Build constant tensors matching each spec's shape (filled with 1, 2 and 3).
+action = tf.constant(1 * np.ones(
+    data_spec[0].shape.as_list(), dtype=np.float32))
+lidar = tf.constant(
+    2 * np.ones(data_spec[1][0].shape.as_list(), dtype=np.float32))
+camera = tf.constant(
+    3 * np.ones(data_spec[1][1].shape.as_list(), dtype=np.float32))
+
+values = (action, (lidar, camera))
+# Stack `batch_size` copies of every tensor to form one batch.
+values_batched = tf.nest.map_structure(lambda t: tf.stack([t] * batch_size),
+                                       values)
+
+replay_buffer.add_batch(values_batched)
+
+
+# Reading from the buffer
+# add more items to the buffer before reading
+for _ in range(5):
+    replay_buffer.add_batch(values_batched)
+
+# Get one sample from the replay buffer with batch size 10 and 1 timestep:
+
+sample = replay_buffer.get_next(sample_batch_size=10, num_steps=1)
+
+# Convert the replay buffer to a tf.data.Dataset and iterate through it
+dataset = replay_buffer.as_dataset(
+    sample_batch_size=4,
+    num_steps=2)
+
+iterator = iter(dataset)
+print("Iterator trajectories:")
+trajectories = []
+# Pull three elements from the dataset, keeping only the first item of each.
+for _ in range(3):
+    t, _ = next(iterator)
+    trajectories.append(t)
+
+print(tf.nest.map_structure(lambda t: t.shape, trajectories))
+
+# Read all elements in the replay buffer:
+trajectories = replay_buffer.gather_all()
+
+print("Trajectories from gather all:")
+print(tf.nest.map_structure(lambda t: t.shape, trajectories))
+
+
+# PyUniformReplayBuffer
+replay_buffer_capacity = 1000*32 # same capacity as the TFUniformReplayBuffer
+
+py_replay_buffer = py_uniform_replay_buffer.PyUniformReplayBuffer(
+    capacity=replay_buffer_capacity,
+    data_spec=tensor_spec.to_nest_array_spec(data_spec))
+
+
+# Using replay buffers during training
+# Data collection
+env = suite_gym.load('CartPole-v0')
+tf_env = tf_py_environment.TFPyEnvironment(env)
+
+q_net = q_network.QNetwork(
+    tf_env.time_step_spec().observation,
+    tf_env.action_spec(),
+    fc_layer_params=(100,))
+
+agent = dqn_agent.DqnAgent(
+    tf_env.time_step_spec(),
+    tf_env.action_spec(),
+    q_network=q_net,
+    optimizer=tf.compat.v1.train.AdamOptimizer(0.001))
+# Initialize the agent before it is used for collection and training.
+agent.initialize()
+
+replay_buffer_capacity = 1000
+
+replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
+    agent.collect_data_spec,
+    batch_size=tf_env.batch_size,
+    max_length=replay_buffer_capacity)
+
+# Add an observer that adds to the replay buffer:
+replay_observer = [replay_buffer.add_batch]
+
+# Run the collect policy for a fixed number of steps; every step is passed to
+# the observer above. `collect_op` holds whatever `run()` returns.
+collect_steps_per_iteration = 10
+collect_op = dynamic_step_driver.DynamicStepDriver(
+  tf_env,
+  agent.collect_policy,
+  observers=replay_observer,
+  num_steps=collect_steps_per_iteration).run()
+
+# Reading data for a train step
+# Read the replay buffer as a Dataset,
+# read batches of 4 elements, each with 2 timesteps:
+dataset = replay_buffer.as_dataset(
+    sample_batch_size=4,
+    num_steps=2)
+
+iterator = iter(dataset)
+
+num_train_steps = 10
+
+# Each dataset element is a pair; only its first item is fed to the agent.
+for _ in range(num_train_steps):
+  trajectories, _ = next(iterator)
+  loss = agent.train(experience=trajectories)
+
--- a/tf_agents_test.py
+++ b/tf_agents_test.py