Commit 2e92b377 authored by 高雅喆

“test”

parent 0346a3f1
# coding=utf-8
# Copyright 2018 The TF-Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
r"""Train and Eval DDPG.
To run:
```bash
tensorboard --logdir $HOME/tmp/ddpg/gym/HalfCheetah-v2/ --port 2223 &
python tf_agents/agents/ddpg/examples/v2/train_eval.py \
--root_dir=$HOME/tmp/ddpg/gym/HalfCheetah-v2/ \
--num_iterations=2000000 \
--alsologtostderr
```
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import time
from absl import app
from absl import flags
from absl import logging
import gin
import tensorflow as tf
from tf_agents.agents.ddpg import actor_network
from tf_agents.agents.ddpg import critic_network
from tf_agents.agents.ddpg import ddpg_agent
from tf_agents.drivers import dynamic_step_driver
from tf_agents.environments import parallel_py_environment
from tf_agents.environments import suite_mujoco
from tf_agents.environments import tf_py_environment
from tf_agents.environments import py_environment
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.utils import common
import abc
import numpy as np
from tf_agents.environments import tf_environment
from tf_agents.environments import utils
from tf_agents.specs import array_spec
from tf_agents.environments import wrappers
from tf_agents.environments import suite_gym
from tf_agents.trajectories import time_step as ts
flags.DEFINE_string('root_dir', '~/tmp/ddpg/gym/HalfCheetah-v2/',
'Root directory for writing logs/summaries/checkpoints.')
flags.DEFINE_integer('num_iterations', 100000,
'Total number of train/eval iterations to perform.')
flags.DEFINE_multi_string('gin_file', None, 'Paths to the gin-config files.')
flags.DEFINE_multi_string('gin_param', None, 'Gin binding parameters.')
FLAGS = flags.FLAGS
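# A continuous-action variant of the card-game environment from the TF-Agents
# environments tutorial, adapted so the DDPG agent below (which requires a
# continuous action spec) can drive it: an action below 1.0 draws a card, an
# action of 1.0 or more ends the round, and the terminal reward measures how
# close the final state is to 21 (with -21 for going bust).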
class CardGameEnv(py_environment.PyEnvironment):
def __init__(self):
self._action_spec = array_spec.BoundedArraySpec(
shape=(), dtype=np.float32, minimum=0.0, maximum=1.0, name='action')
self._observation_spec = array_spec.BoundedArraySpec(
shape=(1,), dtype=np.float32, minimum=0.0, name='observation')
self._state = 0.0
self._episode_ended = False
self._current_time_step = self._reset()
def action_spec(self):
return self._action_spec
def observation_spec(self):
return self._observation_spec
def _reset(self):
self._state = 0.0
self._episode_ended = False
return ts.restart(np.array([self._state], dtype=np.float32))
def _step(self, action):
if self._episode_ended:
# The last action ended the episode. Ignore the current action and start
# a new episode.
return self.reset()
# Make sure episodes don't go on forever.
if action >= 1.0:
self._episode_ended = True
elif action < 1.0:
new_card = np.random.randint(1, 11)
self._state += new_card
else:
raise ValueError('`action` should be 0 or 1.')
if self._episode_ended or self._state >= 21:
reward = self._state - 21 if self._state <= 21 else -21
return ts.termination(np.array([self._state], dtype=np.float32), reward)
else:
return ts.transition(
np.array([self._state], dtype=np.float32), reward=0.0, discount=1.0)
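# Optional sanity check (an addition, not part of the original script): the
# environment can be validated against its specs by running a few episodes with
# random actions. `utils` is already imported above from tf_agents.environments.
# utils.validate_py_environment(CardGameEnv(), episodes=5)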
@gin.configurable
def train_eval(
root_dir,
env_name='',
env_load_fn=suite_mujoco.load,
num_iterations=2000000,
actor_fc_layers=(400, 300),
critic_obs_fc_layers=(400,),
critic_action_fc_layers=None,
critic_joint_fc_layers=(300,),
# Params for collect
initial_collect_steps=1,
collect_steps_per_iteration=1,
num_parallel_environments=1,
replay_buffer_capacity=100000,
ou_stddev=0.2,
ou_damping=0.15,
# Params for target update
target_update_tau=0.05,
target_update_period=5,
# Params for train
train_steps_per_iteration=1,
batch_size=64,
actor_learning_rate=1e-4,
critic_learning_rate=1e-3,
dqda_clipping=None,
td_errors_loss_fn=tf.compat.v1.losses.huber_loss,
gamma=0.995,
reward_scale_factor=1.0,
gradient_clipping=None,
use_tf_functions=True,
# Params for eval
num_eval_episodes=10,
eval_interval=10,
# Params for checkpoints, summaries, and logging
log_interval=10,
summary_interval=10,
summaries_flush_secs=10,
debug_summaries=False,
summarize_grads_and_vars=False,
eval_metrics_callback=None):
"""A simple train and eval for DDPG."""
# tensorboard log
root_dir = os.path.expanduser(root_dir)
train_dir = os.path.join(root_dir, 'train')
eval_dir = os.path.join(root_dir, 'eval')
train_summary_writer = tf.compat.v2.summary.create_file_writer(
train_dir, flush_millis=summaries_flush_secs * 1000)
train_summary_writer.set_as_default()
eval_summary_writer = tf.compat.v2.summary.create_file_writer(
eval_dir, flush_millis=summaries_flush_secs * 1000)
eval_metrics = [
tf_metrics.AverageReturnMetric(buffer_size=num_eval_episodes),
tf_metrics.AverageEpisodeLengthMetric(buffer_size=num_eval_episodes)
]
# initialize
env = CardGameEnv()
global_step = tf.compat.v1.train.get_or_create_global_step()
with tf.compat.v2.summary.record_if(
lambda: tf.math.equal(global_step % summary_interval, 0)):
tf_env = tf_py_environment.TFPyEnvironment(env)
eval_tf_env = tf_py_environment.TFPyEnvironment(env)
actor_net = actor_network.ActorNetwork(
tf_env.time_step_spec().observation,
tf_env.action_spec(),
fc_layer_params=actor_fc_layers,
)
critic_net_input_specs = (tf_env.time_step_spec().observation,
tf_env.action_spec())
critic_net = critic_network.CriticNetwork(
critic_net_input_specs,
observation_fc_layer_params=critic_obs_fc_layers,
action_fc_layer_params=critic_action_fc_layers,
joint_fc_layer_params=critic_joint_fc_layers,
)
tf_agent = ddpg_agent.DdpgAgent(
tf_env.time_step_spec(),
tf_env.action_spec(),
actor_network=actor_net,
critic_network=critic_net,
actor_optimizer=tf.compat.v1.train.AdamOptimizer(
learning_rate=actor_learning_rate),
critic_optimizer=tf.compat.v1.train.AdamOptimizer(
learning_rate=critic_learning_rate),
ou_stddev=ou_stddev,
ou_damping=ou_damping,
target_update_tau=target_update_tau,
target_update_period=target_update_period,
dqda_clipping=dqda_clipping,
td_errors_loss_fn=td_errors_loss_fn,
gamma=gamma,
reward_scale_factor=reward_scale_factor,
gradient_clipping=gradient_clipping,
debug_summaries=debug_summaries,
summarize_grads_and_vars=summarize_grads_and_vars,
train_step_counter=global_step)
tf_agent.initialize()
train_metrics = [
tf_metrics.NumberOfEpisodes(),
tf_metrics.EnvironmentSteps(),
tf_metrics.AverageReturnMetric(),
tf_metrics.AverageEpisodeLengthMetric(),
]
eval_policy = tf_agent.policy
collect_policy = tf_agent.collect_policy
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
tf_agent.collect_data_spec,
batch_size=tf_env.batch_size,
max_length=replay_buffer_capacity)
initial_collect_driver = dynamic_step_driver.DynamicStepDriver(
tf_env,
collect_policy,
observers=[replay_buffer.add_batch],
num_steps=initial_collect_steps)
collect_driver = dynamic_step_driver.DynamicStepDriver(
tf_env,
collect_policy,
observers=[replay_buffer.add_batch] + train_metrics,
num_steps=collect_steps_per_iteration)
if use_tf_functions:
initial_collect_driver.run = common.function(initial_collect_driver.run)
collect_driver.run = common.function(collect_driver.run)
tf_agent.train = common.function(tf_agent.train)
# Collect initial replay data.
logging.info(
'Initializing replay buffer by collecting experience for %d steps with '
'a random policy.', initial_collect_steps)
initial_collect_driver.run()
results = metric_utils.eager_compute(
eval_metrics,
eval_tf_env,
eval_policy,
num_episodes=num_eval_episodes,
train_step=global_step,
summary_writer=eval_summary_writer,
summary_prefix='Metrics',
)
if eval_metrics_callback is not None:
eval_metrics_callback(results, global_step.numpy())
metric_utils.log_metrics(eval_metrics)
time_step = None
policy_state = collect_policy.get_initial_state(tf_env.batch_size)
timed_at_step = global_step.numpy()
time_acc = 0
# Dataset generates trajectories with shape [Bx2x...]
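# num_steps=2 makes each sample a pair of adjacent time steps, which is what the
# DDPG agent needs to form its TD target from (s_t, a_t, r_t, s_{t+1}).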
dataset = replay_buffer.as_dataset(
num_parallel_calls=3,
sample_batch_size=batch_size,
num_steps=2).prefetch(3)
iterator = iter(dataset)
# train and eval
for _ in range(num_iterations):
start_time = time.time()
time_step, policy_state = collect_driver.run(
time_step=time_step,
policy_state=policy_state,
)
for _ in range(train_steps_per_iteration):
experience, _ = next(iterator)
train_loss = tf_agent.train(experience)
action_step = eval_policy.action(time_step)
print("print eval action", "-" * 100)
print(action_step.action)
time_acc += time.time() - start_time
if global_step.numpy() % log_interval == 0:
logging.info('step = %d, loss = %f', global_step.numpy(),
train_loss.loss)
steps_per_sec = (global_step.numpy() - timed_at_step) / time_acc
logging.info('%.3f steps/sec', steps_per_sec)
tf.compat.v2.summary.scalar(
name='global_steps_per_sec', data=steps_per_sec, step=global_step)
timed_at_step = global_step.numpy()
time_acc = 0
for train_metric in train_metrics:
train_metric.tf_summaries(
train_step=global_step, step_metrics=train_metrics[:2])
if global_step.numpy() % eval_interval == 0:
results = metric_utils.eager_compute(
eval_metrics,
eval_tf_env,
eval_policy,
num_episodes=num_eval_episodes,
train_step=global_step,
summary_writer=eval_summary_writer,
summary_prefix='Metrics',
)
if eval_metrics_callback is not None:
eval_metrics_callback(results, global_step.numpy())
metric_utils.log_metrics(eval_metrics)
return train_loss
def main(_):
tf.compat.v1.enable_v2_behavior()
logging.set_verbosity(logging.INFO)
gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_param)
train_eval(FLAGS.root_dir, num_iterations=FLAGS.num_iterations)
if __name__ == '__main__':
app.run(main)
from tf_agents.environments import suite_gym
from tf_agents.environments import tf_py_environment
from tf_agents.networks import q_network
import tensorflow as tf
from tf_agents.agents.dqn import dqn_agent
from tf_agents.policies import random_tf_policy
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.trajectories import trajectory
from tf_agents.utils import common
from tf_agents.environments import py_environment
from tf_agents.specs import array_spec
import numpy as np
from tf_agents.trajectories import time_step as ts
tf.compat.v1.enable_v2_behavior()
class CardGameEnv(py_environment.PyEnvironment):
def __init__(self):
self._action_spec = array_spec.BoundedArraySpec(
shape=(), dtype=np.int32, minimum=0, maximum=1, name='action')
self._observation_spec = array_spec.BoundedArraySpec(
shape=(1,), dtype=np.int32, minimum=0, name='observation')
self._state = 0
self._episode_ended = False
def action_spec(self):
return self._action_spec
def observation_spec(self):
return self._observation_spec
def _reset(self):
self._state = 0
self._episode_ended = False
return ts.restart(np.array([self._state], dtype=np.int32))
def _step(self, action):
if self._episode_ended:
# The last action ended the episode. Ignore the current action and start
# a new episode.
return self.reset()
# Make sure episodes don't go on forever.
if action == 1:
self._episode_ended = True
elif action == 0:
new_card = np.random.randint(1, 11)
# print("random card")
# print(new_card)
# print("state")
# print(self._state)
self._state += new_card
else:
raise ValueError('`action` should be 0 or 1.')
if self._episode_ended or self._state >= 21:
reward = self._state - 21 if self._state <= 21 else -21
return ts.termination(np.array([self._state], dtype=np.int32), reward)
else:
return ts.transition(
np.array([self._state], dtype=np.int32), reward=0.0, discount=1.0)
num_iterations = 3000 # @param
initial_collect_steps = 1000 # @param
collect_steps_per_iteration = 1 # @param
replay_buffer_capacity = 100000 # @param
fc_layer_params = (100,)
batch_size = 64 # @param
learning_rate = 1e-3 # @param
log_interval = 200 # @param
num_eval_episodes = 10 # @param
eval_interval = 1000 # @param
env = CardGameEnv()
env.reset()
print('Observation Spec:')
print(env.time_step_spec().observation)
print('Action Spec:')
print(env.action_spec())
time_step = env.reset()
print('Time step:')
print(time_step)
action = 1
next_time_step = env.step(action)
print('Next time step:')
print(next_time_step)
train_py_env = CardGameEnv()
eval_py_env = CardGameEnv()
train_env = tf_py_environment.TFPyEnvironment(train_py_env)
eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)
#agent
q_net = q_network.QNetwork(
train_env.observation_spec(),
train_env.action_spec(),
fc_layer_params=fc_layer_params)
optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)
train_step_counter = tf.compat.v2.Variable(0)
tf_agent = dqn_agent.DqnAgent(
train_env.time_step_spec(),
train_env.action_spec(),
q_network=q_net,
optimizer=optimizer,
td_errors_loss_fn=dqn_agent.element_wise_squared_loss,
train_step_counter=train_step_counter)
tf_agent.initialize()
#policy
eval_policy = tf_agent.policy
collect_policy = tf_agent.collect_policy
random_policy = random_tf_policy.RandomTFPolicy(train_env.time_step_spec(),
train_env.action_spec())
#Metrics and Evaluation
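# The average return is the sum of rewards obtained while running a policy for
# an episode, averaged over several episodes; it is the metric used to evaluate
# the policy throughout these examples.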
def compute_avg_return(environment, policy, num_episodes=10):
total_return = 0.0
for _ in range(num_episodes):
time_step = environment.reset()
episode_return = 0.0
while not time_step.is_last():
action_step = policy.action(time_step)
# print("eval action","-"*100)
# print(action_step.action)
time_step = environment.step(action_step.action)
episode_return += time_step.reward
total_return += episode_return
avg_return = total_return / num_episodes
return avg_return.numpy()[0]
#Replay Buffer
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
data_spec=tf_agent.collect_data_spec,
batch_size=train_env.batch_size,
max_length=replay_buffer_capacity)
#Data Collection
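# collect_step executes the policy for one step in the environment and packs the
# (time_step, action_step, next_time_step) triple into a Trajectory, which is
# the format the replay buffer stores.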
def collect_step(environment, policy):
time_step = environment.current_time_step()
action_step = policy.action(time_step)
next_time_step = environment.step(action_step.action)
traj = trajectory.from_transition(time_step, action_step, next_time_step)
# Add trajectory to the replay buffer
replay_buffer.add_batch(traj)
for _ in range(initial_collect_steps):
collect_step(train_env, random_policy)
# Dataset generates trajectories with shape [Bx2x...]
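# num_steps=2 samples pairs of adjacent time steps, from which the DQN agent
# reads (observation, action, reward, next observation) to compute its TD loss.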
dataset = replay_buffer.as_dataset(
num_parallel_calls=3, sample_batch_size=batch_size, num_steps=2).prefetch(3)
iterator = iter(dataset)
#Training the agent
tf_agent.train = common.function(tf_agent.train)
# Reset the train step
tf_agent.train_step_counter.assign(0)
# Evaluate the agent's policy once before training.
avg_return = compute_avg_return(eval_env, tf_agent.policy, num_eval_episodes)
returns = [avg_return]
# returns = []
for _ in range(num_iterations):
# Collect a few steps using collect_policy and save to the replay buffer.
for _ in range(collect_steps_per_iteration):
collect_step(train_env, tf_agent.collect_policy)
# Sample a batch of data from the buffer and update the agent's network.
experience, unused_info = next(iterator)
train_loss = tf_agent.train(experience)
step = tf_agent.train_step_counter.numpy()
if step % log_interval == 0:
print('step = {0}: loss = {1}'.format(step, train_loss.loss))
if step % eval_interval == 0:
avg_return = compute_avg_return(eval_env, tf_agent.policy, num_eval_episodes)
print('step = {0}: Average Return = {1}'.format(step, avg_return))
returns.append(avg_return)
#plots
steps = range(0, num_iterations + 1, eval_interval)
print("-"*100)
print(steps,returns)
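# A minimal plotting sketch (an addition, assuming matplotlib is installed); the
# original DQN tutorial visualizes average return against training step like this:
# import matplotlib.pyplot as plt
# plt.plot(steps, returns)
# plt.ylabel('Average Return')
# plt.xlabel('Step')
# plt.savefig('card_game_average_return.png')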
from tf_agents.environments import suite_gym
from tf_agents.environments import tf_py_environment
from tf_agents.networks import q_network
import tensorflow as tf
from tf_agents.agents.dqn import dqn_agent
from tf_agents.policies import random_tf_policy
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.trajectories import trajectory
from tf_agents.utils import common
tf.compat.v1.enable_v2_behavior()
# CartPole-v0 Environment
env_name = 'CartPole-v0'
num_iterations = 1000 # @param
initial_collect_steps = 1000 # @param
collect_steps_per_iteration = 1 # @param
replay_buffer_capacity = 100000 # @param
fc_layer_params = (100,)
batch_size = 64 # @param
learning_rate = 1e-3 # @param
log_interval = 200 # @param
num_eval_episodes = 10 # @param
eval_interval = 1000 # @param
env = suite_gym.load(env_name)
env.reset()
print('Observation Spec:')
print(env.time_step_spec().observation)
print('Action Spec:')
print(env.action_spec())
time_step = env.reset()
print('Time step:')
print(time_step)
action = 1
next_time_step = env.step(action)
print('Next time step:')
print(next_time_step)
train_py_env = suite_gym.load(env_name)
eval_py_env = suite_gym.load(env_name)
train_env = tf_py_environment.TFPyEnvironment(train_py_env)
eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)
#agent
q_net = q_network.QNetwork(
train_env.observation_spec(),
train_env.action_spec(),
fc_layer_params=fc_layer_params)
optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)
train_step_counter = tf.compat.v2.Variable(0)
tf_agent = dqn_agent.DqnAgent(
train_env.time_step_spec(),
train_env.action_spec(),
q_network=q_net,
optimizer=optimizer,
td_errors_loss_fn=dqn_agent.element_wise_squared_loss,
train_step_counter=train_step_counter)
tf_agent.initialize()
#policy
eval_policy = tf_agent.policy
collect_policy = tf_agent.collect_policy
random_policy = random_tf_policy.RandomTFPolicy(train_env.time_step_spec(),
train_env.action_spec())
#Metrics and Evaluation
def compute_avg_return(environment, policy, num_episodes=10):
total_return = 0.0
for _ in range(num_episodes):
time_step = environment.reset()
episode_return = 0.0
while not time_step.is_last():
action_step = policy.action(time_step)
print("print eval action","-"*100)
print(action_step.action)
time_step = environment.step(action_step.action)
episode_return += time_step.reward
total_return += episode_return
avg_return = total_return / num_episodes
return avg_return.numpy()[0]
#Replay Buffer
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
data_spec=tf_agent.collect_data_spec,
batch_size=train_env.batch_size,
max_length=replay_buffer_capacity)
#Data Collection
def collect_step(environment, policy):
time_step = environment.current_time_step()
action_step = policy.action(time_step)
next_time_step = environment.step(action_step.action)
traj = trajectory.from_transition(time_step, action_step, next_time_step)
# Add trajectory to the replay buffer
replay_buffer.add_batch(traj)
for _ in range(initial_collect_steps):
collect_step(train_env, random_policy)
# Dataset generates trajectories with shape [Bx2x...]
dataset = replay_buffer.as_dataset(
num_parallel_calls=3, sample_batch_size=batch_size, num_steps=2).prefetch(3)
iterator = iter(dataset)
#Training the agent
tf_agent.train = common.function(tf_agent.train)
# Reset the train step
tf_agent.train_step_counter.assign(0)
# Evaluate the agent's policy once before training.
# avg_return = compute_avg_return(eval_env, tf_agent.policy, num_eval_episodes)
# returns = [avg_return]
# print("before training returns")
# print(returns)
returns = []
for _ in range(num_iterations):
# Collect a few steps using collect_policy and save to the replay buffer.
for _ in range(collect_steps_per_iteration):
collect_step(train_env, tf_agent.collect_policy)
# Sample a batch of data from the buffer and update the agent's network.
experience, unused_info = next(iterator)
train_loss = tf_agent.train(experience)
step = tf_agent.train_step_counter.numpy()
if step % log_interval == 0:
print('step = {0}: loss = {1}'.format(step, train_loss.loss))
if step % eval_interval == 0:
avg_return = compute_avg_return(eval_env, tf_agent.policy, num_eval_episodes)
print('step = {0}: Average Return = {1}'.format(step, avg_return))
returns.append(avg_return)
#plots
steps = range(0, num_iterations + 1, eval_interval)
print("-"*100)
print("after training returns")
print(returns)
def helloworld():
print("hello world")
print("hello world2")
print("hello world3")
print("hello world4")
a=1
b=2
c=3
for i in range(1,3):
print("i",i)
print(i)
a=1
b=2
c=3
helloworld()
a=1
b=2
c=3
import numpy as np
import tensorflow as tf
from tf_agents.environments import suite_gym
from tf_agents.environments import tf_py_environment
from tf_agents.policies import random_tf_policy
from tf_agents.metrics import tf_metrics
from tf_agents.drivers import dynamic_episode_driver
tf.compat.v1.enable_v2_behavior()
#TensorFlow Drivers
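# A driver runs a policy in an environment for a given number of steps or
# episodes and forwards each experienced trajectory to its observers (here two
# metrics); DynamicEpisodeDriver stops after num_episodes complete episodes.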
env = suite_gym.load('CartPole-v0')
tf_env = tf_py_environment.TFPyEnvironment(env)
tf_policy = random_tf_policy.RandomTFPolicy(action_spec=tf_env.action_spec(),
time_step_spec=tf_env.time_step_spec())
num_episodes = tf_metrics.NumberOfEpisodes()
env_steps = tf_metrics.EnvironmentSteps()
observers = [num_episodes, env_steps]
driver = dynamic_episode_driver.DynamicEpisodeDriver(
tf_env, tf_policy, observers, num_episodes=2)
# Initial driver.run will reset the environment and initialize the policy.
final_time_step, policy_state = driver.run()
print('final_time_step', final_time_step)
print('Number of Steps: ', env_steps.result().numpy())
print('Number of Episodes: ', num_episodes.result().numpy())
# Continue running from previous state
final_time_step, _ = driver.run(final_time_step, policy_state)
print('final_time_step', final_time_step)
print('Number of Steps: ', env_steps.result().numpy())
print('Number of Episodes: ', num_episodes.result().numpy())
import tensorflow as tf
from tf_agents.environments import py_environment
from tf_agents.specs import array_spec
import numpy as np
from tf_agents.trajectories import time_step as ts
tf.compat.v1.enable_v2_behavior()
class CardGameEnv(py_environment.PyEnvironment):
def __init__(self):
self._action_spec = array_spec.BoundedArraySpec(
shape=(), dtype=np.int32, minimum=0, maximum=1, name='action')
self._observation_spec = array_spec.BoundedArraySpec(
shape=(1,), dtype=np.int32, minimum=0, name='observation')
self._state = 0
self._episode_ended = False
def action_spec(self):
return self._action_spec
def observation_spec(self):
return self._observation_spec
def _reset(self):
self._state = 0
self._episode_ended = False
return ts.restart(np.array([self._state], dtype=np.int32))
def _step(self, action):
if self._episode_ended:
# The last action ended the episode. Ignore the current action and start
# a new episode.
return self.reset()
# Make sure episodes don't go on forever.
if action == 1:
self._episode_ended = True
elif action == 0:
new_card = np.random.randint(1, 11)
self._state += new_card
else:
raise ValueError('`action` should be 0 or 1.')
if self._episode_ended or self._state >= 21:
reward = self._state - 21 if self._state <= 21 else -21
return ts.termination(np.array([self._state], dtype=np.int32), reward)
else:
return ts.transition(
np.array([self._state], dtype=np.int32), reward=0.0, discount=1.0)
get_new_card_action = 0
end_round_action = 1
environment = CardGameEnv()
time_step = environment.reset()
print(time_step)
cumulative_reward = time_step.reward
for _ in range(3):
time_step = environment.step(get_new_card_action)
print(time_step)
cumulative_reward += time_step.reward
time_step = environment.step(end_round_action)
print(time_step)
cumulative_reward += time_step.reward
print('Final Reward = ', cumulative_reward)
from tf_agents.specs import array_spec
from tf_agents.policies import random_py_policy
import numpy as np
from tf_agents.policies import scripted_py_policy
#Random Python Policy
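# RandomPyPolicy ignores the time_step and samples actions uniformly from the
# bounds of the given action spec, so it can be called with time_step=None.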
action_spec = array_spec.BoundedArraySpec(shape=(1,), dtype=np.int32, minimum=0, maximum=10)
my_random_py_policy = random_py_policy.RandomPyPolicy(time_step_spec=None,
action_spec=action_spec)
time_step = None
action_step = my_random_py_policy.action(time_step)
print(action_step)
action_step = my_random_py_policy.action(time_step)
print(action_step)
print("*"*100)
#Scripted Python Policy
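# ScriptedPyPolicy replays a fixed list of (num_repeats, action) pairs; the
# policy state tracks the position in the script, so each call to action() must
# be passed the state returned by the previous call. An entry with num_repeats=0
# is skipped.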
action_spec = array_spec.BoundedArraySpec((2,), np.int32, -10, 10)
action_script = [(1, np.array([5, 2], dtype=np.int32)),
(0, np.array([0, 0], dtype=np.int32)), # Setting `num_repeats` to 0 will skip this action.
(2, np.array([1, 2], dtype=np.int32)),
(1, np.array([3, 4], dtype=np.int32))]
my_scripted_py_policy = scripted_py_policy.ScriptedPyPolicy(
time_step_spec=None, action_spec=action_spec, action_script=action_script)
policy_state = my_scripted_py_policy.get_initial_state()
time_step = None
print('Executing scripted policy...')
action_step = my_scripted_py_policy.action(time_step, policy_state)
print(action_step.action[0])
action_step= my_scripted_py_policy.action(time_step, action_step.state)
print(action_step.action[0])
action_step = my_scripted_py_policy.action(time_step, action_step.state)
print(action_step.action[0])
action_step = my_scripted_py_policy.action(time_step, action_step.state)
print(action_step.action[0])
print('Resetting my_scripted_py_policy...')
policy_state = my_scripted_py_policy.get_initial_state()
action_step = my_scripted_py_policy.action(time_step, policy_state)
print(action_step)
print("*"*100)
import tensorflow as tf
import numpy as np
from tf_agents import specs
from tf_agents.agents.dqn import dqn_agent
from tf_agents.drivers import dynamic_step_driver
from tf_agents.environments import suite_gym
from tf_agents.environments import tf_py_environment
from tf_agents.networks import q_network
from tf_agents.replay_buffers import py_uniform_replay_buffer
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step
tf.compat.v1.enable_v2_behavior()
#Creating the buffer
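# TFUniformReplayBuffer stores nests of tensors matching data_spec; batch_size
# is the number of parallel items written per add_batch call and max_length is
# the number of such batched rows kept, so the buffer holds
# batch_size * max_length items in total.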
data_spec = (
tf.TensorSpec([3], tf.float32, 'action'),
(
tf.TensorSpec([5], tf.float32, 'lidar'),
tf.TensorSpec([3, 2], tf.float32, 'camera')
)
)
batch_size = 32
max_length = 1000
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
data_spec,
batch_size=batch_size,
max_length=max_length)
#Writing to the buffer
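# add_batch expects every tensor to carry a leading batch dimension of size
# batch_size, which is why the single values below are stacked 32 times before
# being added.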
action = tf.constant(1 * np.ones(
data_spec[0].shape.as_list(), dtype=np.float32))
lidar = tf.constant(
2 * np.ones(data_spec[1][0].shape.as_list(), dtype=np.float32))
camera = tf.constant(
3 * np.ones(data_spec[1][1].shape.as_list(), dtype=np.float32))
values = (action, (lidar, camera))
values_batched = tf.nest.map_structure(lambda t: tf.stack([t] * batch_size),
values)
replay_buffer.add_batch(values_batched)
#Reading from the buffer
# add more items to the buffer before reading
for _ in range(5):
replay_buffer.add_batch(values_batched)
# Get one sample from the replay buffer with batch size 10 and 1 timestep:
sample = replay_buffer.get_next(sample_batch_size=10, num_steps=1)
# Convert the replay buffer to a tf.data.Dataset and iterate through it
dataset = replay_buffer.as_dataset(
sample_batch_size=4,
num_steps=2)
iterator = iter(dataset)
print("Iterator trajectories:")
trajectories = []
for _ in range(3):
t, _ = next(iterator)
trajectories.append(t)
print(tf.nest.map_structure(lambda t: t.shape, trajectories))
# Read all elements in the replay buffer:
trajectories = replay_buffer.gather_all()
print("Trajectories from gather all:")
print(tf.nest.map_structure(lambda t: t.shape, trajectories))
#PyUniformReplayBuffer
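# PyUniformReplayBuffer has the same interface but stores data in numpy arrays
# on the host instead of TF variables; its capacity is given in individual
# items, so 1000 * 32 matches the TF buffer above.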
replay_buffer_capacity = 1000*32 # same capacity as the TFUniformReplayBuffer
py_replay_buffer = py_uniform_replay_buffer.PyUniformReplayBuffer(
capacity=replay_buffer_capacity,
data_spec=tensor_spec.to_nest_array_spec(data_spec))
#Using replay buffers during training
#Data collection
env = suite_gym.load('CartPole-v0')
tf_env = tf_py_environment.TFPyEnvironment(env)
q_net = q_network.QNetwork(
tf_env.time_step_spec().observation,
tf_env.action_spec(),
fc_layer_params=(100,))
agent = dqn_agent.DqnAgent(
tf_env.time_step_spec(),
tf_env.action_spec(),
q_network=q_net,
optimizer=tf.compat.v1.train.AdamOptimizer(0.001))
replay_buffer_capacity = 1000
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
agent.collect_data_spec,
batch_size=tf_env.batch_size,
max_length=replay_buffer_capacity)
# Add an observer that adds to the replay buffer:
replay_observer = [replay_buffer.add_batch]
collect_steps_per_iteration = 10
collect_op = dynamic_step_driver.DynamicStepDriver(
tf_env,
agent.collect_policy,
observers=replay_observer,
num_steps=collect_steps_per_iteration).run()
#Reading data for a train step
# Read the replay buffer as a Dataset,
# read batches of 4 elements, each with 2 timesteps:
dataset = replay_buffer.as_dataset(
sample_batch_size=4,
num_steps=2)
iterator = iter(dataset)
num_train_steps = 10
for _ in range(num_train_steps):
trajectories, _ = next(iterator)
loss = agent.train(experience=trajectories)
# coding=utf-8
# Copyright 2018 The TF-Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
r"""Train and Eval DDPG.
To run:
```bash
tensorboard --logdir ./data/tmp/ddpg/gym/HalfCheetah-v2/ --port 2223 &
python tf_agents/agents/ddpg/examples/v2/train_eval.py \
--root_dir=$HOME/tmp/ddpg/gym/HalfCheetah-v2/ \
--num_iterations=2000000 \
--alsologtostderr
```
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import time
from absl import app
from absl import flags
from absl import logging
import gin
import tensorflow as tf
from tf_agents.agents.ddpg import actor_network
from tf_agents.agents.ddpg import critic_network
from tf_agents.agents.ddpg import ddpg_agent
from tf_agents.drivers import dynamic_step_driver
from tf_agents.environments import parallel_py_environment
from tf_agents.environments import suite_mujoco
from tf_agents.environments import tf_py_environment
from tf_agents.environments import py_environment
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.utils import common
import abc
import numpy as np
from tf_agents.environments import tf_environment
from tf_agents.environments import utils
from tf_agents.specs import array_spec
from tf_agents.environments import wrappers
from tf_agents.environments import suite_gym
from tf_agents.trajectories import time_step as ts
flags.DEFINE_string('root_dir', './data/tmp/ddpg/gym/HalfCheetah-v2/',
'Root directory for writing logs/summaries/checkpoints.')
flags.DEFINE_integer('num_iterations', 100000,
'Total number of train/eval iterations to perform.')
flags.DEFINE_multi_string('gin_file', None, 'Paths to the gin-config files.')
flags.DEFINE_multi_string('gin_param', None, 'Gin binding parameters.')
FLAGS = flags.FLAGS
class CardGameEnv(py_environment.PyEnvironment):
def __init__(self):
self._action_spec = array_spec.BoundedArraySpec(
shape=(), dtype=np.float32, minimum=0.0, maximum=10.0, name='action')
self._observation_spec = array_spec.BoundedArraySpec(
shape=(1,), dtype=np.float32, minimum=0.0, name='observation')
self._state = 0.0
self._episode_ended = False
self._current_time_step = self._reset()
def action_spec(self):
return self._action_spec
def observation_spec(self):
return self._observation_spec
def _reset(self):
self._state = 0.0
self._episode_ended = False
return ts.restart(np.array([self._state], dtype=np.float32))
def _step(self, action):
if self._episode_ended:
# The last action ended the episode. Ignore the current action and start
# a new episode.
return self.reset()
# Make sure episodes don't go on forever.
if action >= 1.0:
self._episode_ended = True
elif action < 1.0:
new_card = np.random.randint(1, 11)
self._state += new_card
else:
raise ValueError('`action` should be 0 or 1.')
if self._episode_ended or self._state >= 21:
reward = self._state - 21 if self._state <= 21 else -21
return ts.termination(np.array([self._state], dtype=np.float32), reward)
else:
return ts.transition(
np.array([self._state], dtype=np.float32), reward=0.0, discount=1.0)
@gin.configurable
def train_eval(
root_dir,
env_name='',
env_load_fn=suite_mujoco.load,
num_iterations=2000000,
actor_fc_layers=(400, 300),
critic_obs_fc_layers=(400,),
critic_action_fc_layers=None,
critic_joint_fc_layers=(300,),
# Params for collect
initial_collect_steps=1,
collect_steps_per_iteration=1,
num_parallel_environments=1,
replay_buffer_capacity=100000,
ou_stddev=0.2,
ou_damping=0.15,
# Params for target update
target_update_tau=0.05,
target_update_period=5,
# Params for train
train_steps_per_iteration=1,
batch_size=64,
actor_learning_rate=1e-4,
critic_learning_rate=1e-3,
dqda_clipping=None,
td_errors_loss_fn=tf.compat.v1.losses.huber_loss,
gamma=0.995,
reward_scale_factor=1.0,
gradient_clipping=None,
use_tf_functions=True,
# Params for eval
num_eval_episodes=10,
eval_interval=10,
# Params for checkpoints, summaries, and logging
log_interval=10,
summary_interval=10,
summaries_flush_secs=10,
debug_summaries=False,
summarize_grads_and_vars=False,
eval_metrics_callback=None):
"""A simple train and eval for DDPG."""
# tensorboard log
root_dir = os.path.expanduser(root_dir)
train_dir = os.path.join(root_dir, 'train')
eval_dir = os.path.join(root_dir, 'eval')
train_summary_writer = tf.compat.v2.summary.create_file_writer(
train_dir, flush_millis=summaries_flush_secs * 1000)
train_summary_writer.set_as_default()
eval_summary_writer = tf.compat.v2.summary.create_file_writer(
eval_dir, flush_millis=summaries_flush_secs * 1000)
eval_metrics = [
tf_metrics.AverageReturnMetric(buffer_size=num_eval_episodes),
tf_metrics.AverageEpisodeLengthMetric(buffer_size=num_eval_episodes)
]
# initialize
env = CardGameEnv()
global_step = tf.compat.v1.train.get_or_create_global_step()
with tf.compat.v2.summary.record_if(
lambda: tf.math.equal(global_step % summary_interval, 0)):
tf_env = tf_py_environment.TFPyEnvironment(env)
eval_tf_env = tf_py_environment.TFPyEnvironment(env)
actor_net = actor_network.ActorNetwork(
tf_env.time_step_spec().observation,
tf_env.action_spec(),
fc_layer_params=actor_fc_layers,
)
critic_net_input_specs = (tf_env.time_step_spec().observation,
tf_env.action_spec())
critic_net = critic_network.CriticNetwork(
critic_net_input_specs,
observation_fc_layer_params=critic_obs_fc_layers,
action_fc_layer_params=critic_action_fc_layers,
joint_fc_layer_params=critic_joint_fc_layers,
)
tf_agent = ddpg_agent.DdpgAgent(
tf_env.time_step_spec(),
tf_env.action_spec(),
actor_network=actor_net,
critic_network=critic_net,
actor_optimizer=tf.compat.v1.train.AdamOptimizer(
learning_rate=actor_learning_rate),
critic_optimizer=tf.compat.v1.train.AdamOptimizer(
learning_rate=critic_learning_rate),
ou_stddev=ou_stddev,
ou_damping=ou_damping,
target_update_tau=target_update_tau,
target_update_period=target_update_period,
dqda_clipping=dqda_clipping,
td_errors_loss_fn=td_errors_loss_fn,
gamma=gamma,
reward_scale_factor=reward_scale_factor,
gradient_clipping=gradient_clipping,
debug_summaries=debug_summaries,
summarize_grads_and_vars=summarize_grads_and_vars,
train_step_counter=global_step)
tf_agent.initialize()
train_metrics = [
tf_metrics.NumberOfEpisodes(),
tf_metrics.EnvironmentSteps(),
tf_metrics.AverageReturnMetric(),
tf_metrics.AverageEpisodeLengthMetric(),
]
eval_policy = tf_agent.policy
collect_policy = tf_agent.collect_policy
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
tf_agent.collect_data_spec,
batch_size=tf_env.batch_size,
max_length=replay_buffer_capacity)
initial_collect_driver = dynamic_step_driver.DynamicStepDriver(
tf_env,
collect_policy,
observers=[replay_buffer.add_batch],
num_steps=initial_collect_steps)
collect_driver = dynamic_step_driver.DynamicStepDriver(
tf_env,
collect_policy,
observers=[replay_buffer.add_batch] + train_metrics,
num_steps=collect_steps_per_iteration)
if use_tf_functions:
initial_collect_driver.run = common.function(initial_collect_driver.run)
collect_driver.run = common.function(collect_driver.run)
tf_agent.train = common.function(tf_agent.train)
# Collect initial replay data.
logging.info(
'Initializing replay buffer by collecting experience for %d steps with '
'a random policy.', initial_collect_steps)
initial_collect_driver.run()
results = metric_utils.eager_compute(
eval_metrics,
eval_tf_env,
eval_policy,
num_episodes=num_eval_episodes,
train_step=global_step,
summary_writer=eval_summary_writer,
summary_prefix='Metrics',
)
if eval_metrics_callback is not None:
eval_metrics_callback(results, global_step.numpy())
metric_utils.log_metrics(eval_metrics)
time_step = None
policy_state = collect_policy.get_initial_state(tf_env.batch_size)
timed_at_step = global_step.numpy()
time_acc = 0
# Dataset generates trajectories with shape [Bx2x...]
dataset = replay_buffer.as_dataset(
num_parallel_calls=3,
sample_batch_size=batch_size,
num_steps=2).prefetch(3)
iterator = iter(dataset)
# train and eval
for _ in range(num_iterations):
start_time = time.time()
time_step, policy_state = collect_driver.run(
time_step=time_step,
policy_state=policy_state,
)
for _ in range(train_steps_per_iteration):
experience, _ = next(iterator)
train_loss = tf_agent.train(experience)
action_step = eval_policy.action(time_step)
print("-" * 100)
print(int(action_step.action.numpy()[0]))
time_acc += time.time() - start_time
if global_step.numpy() % log_interval == 0:
logging.info('step = %d, loss = %f', global_step.numpy(),
train_loss.loss)
steps_per_sec = (global_step.numpy() - timed_at_step) / time_acc
logging.info('%.3f steps/sec', steps_per_sec)
tf.compat.v2.summary.scalar(
name='global_steps_per_sec', data=steps_per_sec, step=global_step)
timed_at_step = global_step.numpy()
time_acc = 0
for train_metric in train_metrics:
train_metric.tf_summaries(
train_step=global_step, step_metrics=train_metrics[:2])
if global_step.numpy() % eval_interval == 0:
results = metric_utils.eager_compute(
eval_metrics,
eval_tf_env,
eval_policy,
num_episodes=num_eval_episodes,
train_step=global_step,
summary_writer=eval_summary_writer,
summary_prefix='Metrics',
)
if eval_metrics_callback is not None:
eval_metrics_callback(results, global_step.numpy())
metric_utils.log_metrics(eval_metrics)
return train_loss
def main(_):
tf.compat.v1.enable_v2_behavior()
logging.set_verbosity(logging.INFO)
gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_param)
train_eval(FLAGS.root_dir, num_iterations=FLAGS.num_iterations)
if __name__ == '__main__':
app.run(main)