TorchCraftAI
A bot for machine learning research on StarCraft: Brood War
policygradienttrainer.h
/*
 * Copyright (c) 2017-present, Facebook, Inc.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

#include "metrics.h"
#include "sampler.h"
#include "trainer.h"
#include <autogradpp/autograd.h>

#include <queue>

namespace cpid {

struct BatchedPGReplayBufferFrame : ReplayBufferFrame {
  BatchedPGReplayBufferFrame(
      ag::Variant state,
      torch::Tensor action,
      float pAction,
      double reward)
      : state(state), action(action), pAction(pAction), reward(reward) {}

  ag::Variant state;
  torch::Tensor action;
  /// Probability of action according to the policy that was used to obtain
  /// this frame
  float pAction;
  /// Reward observed since taking previous action
  double reward;
};
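
// For illustration, a single frame as stored in the replay buffer could be
// assembled like this (a minimal sketch; the state shape, action index,
// probability and reward are made-up values):
//
//   auto frame = std::make_shared<BatchedPGReplayBufferFrame>(
//       ag::Variant(torch::randn({128})), // observed state
//       torch::tensor({2}),               // sampled action
//       0.25f,                            // p(action) under the behavior policy
//       1.0);                             // reward since the previous action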

/**
 * Off-policy policy gradient with a critic.
 * This Trainer implements two modes:
 * - Online:
 *   It does one update with the given batch size per node whenever it gets an
 *   episode. Therefore, one episode will always be new and the others will
 *   be from the replay buffer. THIS MODE IS UNTESTED
 * - Offline:
 *   Many threads are assumed to generate episodes in the background, and
 *   it does updates in a separate background thread.
 * In both modes, it will first update on new episodes at least once before
 * moving on to sampling from the replay buffer. If more episodes are generated
 * than it can update on, it will block until the next update. When the replay
 * buffer of episodes it has already updated on reaches maxBatchSize, it will
 * remove the oldest episode it has seen.
 *
 * Replayer format:
 * state, action, p(action), reward
 *
 * Model output:
 * Probability vector over actions: 1-dim Vector
 * Critic's value estimate: Double
 * (a sketch of a forward() producing this output appears after the class
 * declaration below)
 */
class BatchedPGTrainer : public Trainer {
  int batchSize_;
  std::size_t maxBatchSize_;
  double gamma_;
  bool onlineUpdates_ = false;

  std::shared_timed_mutex updateMutex_;
  // Games that were not used for updating the model yet
  std::deque<std::pair<GameUID, EpisodeKey>> newGames_;
  // Games that were already used for updating the model but which are still in
  // the replay buffer. This will be kept <= maxBatchSize_; older games will be
  // removed first.
  std::queue<std::pair<GameUID, EpisodeKey>> seenGames_;
  std::mutex newGamesMutex_;
  bool enoughEpisodes_ = false;
  int episodes_ = 0;

  void updateModel();

 protected:
  void stepEpisode(GameUID const&, EpisodeKey const&, ReplayBuffer::Episode&)
      override;

 public:
  ag::Variant forward(ag::Variant inp, EpisodeHandle const&) override;
  bool update() override;
  void doOnlineUpdatesInstead();

  inline int episodes() {
    return episodes_;
  }

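  // A trainer could be constructed roughly as follows (an illustrative
  // sketch; "MyPolicyModel" is a hypothetical ag::Container, the sampler
  // choice is arbitrary, and the hyper-parameters simply echo the defaults
  // below):
  //
  //   auto model = MyPolicyModel().make();
  //   ag::Optimizer optim = std::make_shared<torch::optim::SGD>(
  //       model->parameters(), torch::optim::SGDOptions(1e-2));
  //   auto trainer = std::make_shared<BatchedPGTrainer>(
  //       model,
  //       optim,
  //       std::make_unique<MultinomialSampler>(),
  //       /*gamma=*/0.99,
  //       /*batchSize=*/10,
  //       /*maxBatchSize=*/50);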
  BatchedPGTrainer(
      ag::Container model,
      ag::Optimizer optim,
      std::unique_ptr<BaseSampler> sampler,
      double gamma = 0.99,
      int batchSize = 10,
      std::size_t maxBatchSize = 50,
      std::unique_ptr<AsyncBatcher> batcher = nullptr);

  /**
   * Contract: the trainer output should be a map with keys: "action" for the
   * taken action, "V" for the state value, and "pAction" for the action
   * probability.
   */
  virtual std::shared_ptr<ReplayBufferFrame> makeFrame(
      ag::Variant trainerOutput,
      ag::Variant state,
      float reward) override;
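
  // For example (a sketch; "action" and "V" are from the contract above, and
  // "pAction" matches the frame field, assuming the sampler stored the action
  // probability under that key):
  //
  //   ag::Variant trainerOutput = ag::VariantDict{
  //       {"action", torch::tensor({2})},
  //       {"pAction", torch::tensor(0.25f)},
  //       {"V", torch::tensor(0.7f)}};
  //   auto frame = trainer->makeFrame(trainerOutput, state, /*reward=*/1.0f);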
  std::shared_ptr<Evaluator> makeEvaluator(
      size_t,
      std::unique_ptr<BaseSampler> sampler =
          std::make_unique<DiscreteMaxSampler>()) override;
};
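
// A model plugged into this trainer is expected to produce the output
// described in the class comment: a 1-dim probability vector over actions
// plus the critic's scalar value estimate. A minimal sketch of such a
// forward() result, assuming the sampler's default key names ("Pi" for the
// distribution, "V" for the value) and hypothetical intermediate tensors
// `logits` and `valueLogit`:
//
//   torch::Tensor probs = torch::softmax(logits, 0); // 1-dim vector over actions
//   torch::Tensor value = valueLogit.squeeze();      // critic's scalar estimate
//   return ag::VariantDict{{"Pi", probs}, {"V", value}};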
} // namespace cpid