This repository was archived by the owner on May 6, 2021. It is now read-only.

Commit b2a27f3

QRDQN implementation (#176)

Authored by Mobius1D, Prasidh Srikumar, and findmyway
* QRDQN implementation: initial implementation with a CartPole experiment and a few bugs.
* Fix TD errors: fix the mistake in TD_error.
* Working non-optimal QRDQN: has bugs but runs the experiment.
* Fixed a few errors: corrected errors in the huber loss and raw loss.
* Fixed tau.
* Fix notations and typos.
* Fix state used in calculation of quantiles.
* Fixed a few bugs: made quantile_huber_loss a separate function, changed reshaping, changed ensemble_num to quantile_num.
* Fixed a few issues.
* Fixed issues.
* Fix qrdqn.

Co-authored-by: Prasidh Srikumar <prsdhsk@gmail.com>
Co-authored-by: Jun Tian <[email protected]>
1 parent 022c1fd · commit b2a27f3

5 files changed: 206 additions & 2 deletions


src/algorithms/dqns/common.jl

Lines changed: 1 addition & 1 deletion

@@ -4,7 +4,7 @@

 const PERLearners = Union{PrioritizedDQNLearner,RainbowLearner,IQNLearner}

-function RLBase.update!(learner::Union{DQNLearner,REMDQNLearner,PERLearners}, t::AbstractTrajectory)
+function RLBase.update!(learner::Union{DQNLearner,QRDQNLearner,REMDQNLearner,PERLearners}, t::AbstractTrajectory)
     length(t[:terminal]) - learner.sampler.n <= learner.min_replay_history && return
     learner.update_step += 1
src/algorithms/dqns/dqns.jl

Lines changed: 1 addition & 0 deletions

@@ -1,6 +1,7 @@
 include("basic_dqn.jl")
 include("dqn.jl")
 include("prioritized_dqn.jl")
+include("qr_dqn.jl")
 include("rem_dqn.jl")
 include("rainbow.jl")
 include("iqn.jl")

src/algorithms/dqns/qr_dqn.jl

Lines changed: 138 additions & 0 deletions

@@ -0,0 +1,138 @@
export QRDQNLearner, quantile_huber_loss

function quantile_huber_loss(ŷ, y; κ = 1.0f0)
    N, B = size(y)
    # pairwise TD errors between target and predicted quantiles, size (N, N, B)
    Δ = reshape(y, N, 1, B) .- reshape(ŷ, 1, N, B)
    abs_error = abs.(Δ)
    quadratic = min.(abs_error, κ)
    linear = abs_error .- quadratic
    huber_loss = 0.5f0 .* quadratic .* quadratic .+ κ .* linear

    # midpoints of the N quantile fractions
    cum_prob = send_to_device(device(y), range(0.5f0 / N; length = N, step = 1.0f0 / N))
    loss = Zygote.dropgrad(abs.(cum_prob .- (Δ .< 0))) .* huber_loss
    mean(sum(loss; dims = 1))
end

mutable struct QRDQNLearner{Tq<:AbstractApproximator,Tt<:AbstractApproximator,Tf,R} <: AbstractLearner
    approximator::Tq
    target_approximator::Tt
    min_replay_history::Int
    update_freq::Int
    update_step::Int
    target_update_freq::Int
    sampler::NStepBatchSampler
    n_quantile::Int
    loss_func::Tf
    rng::R
    # for recording
    loss::Float32
end

"""
    QRDQNLearner(; kwargs...)

See paper: [Distributional Reinforcement Learning with Quantile Regression](https://arxiv.org/pdf/1710.10044.pdf)

# Keywords

- `approximator`::[`AbstractApproximator`](@ref): used to get the quantile values of a batch of states. The output should be of size `(n_quantile, n_action)` per state.
- `target_approximator`::[`AbstractApproximator`](@ref): similar to `approximator`, but used to estimate the quantile values of the next state batch.
- `γ::Float32=0.99f0`: discount rate.
- `batch_size::Int=32`
- `update_horizon::Int=1`: length of the update ('n' in the n-step update).
- `min_replay_history::Int=32`: number of transitions that should be experienced before updating the `approximator`.
- `update_freq::Int=1`: the frequency of updating the `approximator`.
- `n_quantile::Int=1`: the number of quantiles.
- `target_update_freq::Int=100`: the frequency of syncing the `target_approximator`.
- `stack_size::Union{Int,Nothing}=4`: use the recent `stack_size` frames to form a stacked state.
- `traces=SARTS`: set to `SLARTSL` if you apply it to an environment of `FULL_ACTION_SET`.
- `loss_func=`[`quantile_huber_loss`](@ref).
"""
function QRDQNLearner(;
    approximator,
    target_approximator,
    stack_size::Union{Int,Nothing} = nothing,
    γ::Float32 = 0.99f0,
    batch_size::Int = 32,
    update_horizon::Int = 1,
    min_replay_history::Int = 32,
    update_freq::Int = 1,
    n_quantile::Int = 1,
    target_update_freq::Int = 100,
    traces = SARTS,
    update_step = 0,
    loss_func = quantile_huber_loss,
    rng = Random.GLOBAL_RNG,
)
    copyto!(approximator, target_approximator)  # force sync
    sampler = NStepBatchSampler{traces}(;
        γ = γ,
        n = update_horizon,
        stack_size = stack_size,
        batch_size = batch_size,
    )

    N = n_quantile

    QRDQNLearner(
        approximator,
        target_approximator,
        min_replay_history,
        update_freq,
        update_step,
        target_update_freq,
        sampler,
        N,
        loss_func,
        rng,
        0.0f0,
    )
end

Flux.functor(x::QRDQNLearner) = (Q = x.approximator, Qₜ = x.target_approximator),
y -> begin
    x = @set x.approximator = y.Q
    x = @set x.target_approximator = y.Qₜ
    x
end

function (learner::QRDQNLearner)(env)
    s = send_to_device(device(learner.approximator), state(env))
    s = Flux.unsqueeze(s, ndims(s) + 1)
    q = reshape(learner.approximator(s), learner.n_quantile, :)
    # act greedily w.r.t. the mean over quantiles
    vec(mean(q, dims = 1)) |> send_to_host
end

function RLBase.update!(learner::QRDQNLearner, batch::NamedTuple)
    Q = learner.approximator
    Qₜ = learner.target_approximator
    γ = learner.sampler.γ
    n = learner.sampler.n
    batch_size = learner.sampler.batch_size
    N = learner.n_quantile
    D = device(Q)
    loss_func = learner.loss_func

    s, a, r, t, s′ = (send_to_device(D, batch[x]) for x in SARTS)
    a = CartesianIndex.(a, 1:batch_size)

    # pick the target quantiles of the greedy next action
    target_quantiles = reshape(Qₜ(s′), N, :, batch_size)
    qₜ = dropdims(mean(target_quantiles; dims = 1); dims = 1)
    aₜ = dropdims(argmax(qₜ, dims = 1); dims = 1)
    @views target_quantile_aₜ = target_quantiles[:, aₜ]
    # n-step distributional Bellman target
    y = reshape(r, 1, batch_size) .+ γ^n .* reshape(1 .- t, 1, batch_size) .* target_quantile_aₜ

    gs = gradient(params(Q)) do
        q = reshape(Q(s), N, :, batch_size)
        @views ŷ = q[:, a]

        loss = loss_func(ŷ, y)

        ignore() do
            learner.loss = loss
        end
        loss
    end

    update!(Q, gs)
end
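For intuition, here is a minimal CPU-only sketch of the quantile Huber loss above, with the Zygote and device plumbing stripped out. The name `quantile_huber_loss_cpu` and the toy sizes (N = 3 quantiles, batch size B = 2) are illustrative only, not part of this commit.

using Statistics: mean

# Mirrors quantile_huber_loss in qr_dqn.jl, minus send_to_device/dropgrad.
function quantile_huber_loss_cpu(ŷ, y; κ = 1.0f0)
    N, B = size(y)
    Δ = reshape(y, N, 1, B) .- reshape(ŷ, 1, N, B)  # pairwise TD errors, size (N, N, B)
    abs_error = abs.(Δ)
    quadratic = min.(abs_error, κ)
    linear = abs_error .- quadratic
    huber = 0.5f0 .* quadratic .* quadratic .+ κ .* linear
    τ̂ = range(0.5f0 / N; length = N, step = 1.0f0 / N)  # quantile midpoints
    loss = abs.(τ̂ .- (Δ .< 0)) .* huber  # τ̂ broadcasts along the first axis
    mean(sum(loss; dims = 1))
end

ŷ = rand(Float32, 3, 2)  # predicted quantiles, (n_quantile, batch_size)
y = rand(Float32, 3, 2)  # target quantiles, same shape
quantile_huber_loss_cpu(ŷ, y)  # scalar Float32 loss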
Lines changed: 65 additions & 0 deletions

@@ -0,0 +1,65 @@
function RLCore.Experiment(
    ::Val{:JuliaRL},
    ::Val{:QRDQN},
    ::Val{:CartPole},
    ::Nothing;
    save_dir = nothing,
    seed = 123,
)
    N = 10

    rng = StableRNG(seed)
    env = CartPoleEnv(; T = Float32, rng = rng)
    ns, na = length(state(env)), length(action_space(env))

    agent = Agent(
        policy = QBasedPolicy(
            learner = QRDQNLearner(
                approximator = NeuralNetworkApproximator(
                    model = Chain(
                        Dense(ns, 128, relu; initW = glorot_uniform(rng)),
                        Dense(128, 128, relu; initW = glorot_uniform(rng)),
                        Dense(128, N * na; initW = glorot_uniform(rng)),
                    ) |> cpu,
                    optimizer = ADAM(),
                ),
                target_approximator = NeuralNetworkApproximator(
                    model = Chain(
                        Dense(ns, 128, relu; initW = glorot_uniform(rng)),
                        Dense(128, 128, relu; initW = glorot_uniform(rng)),
                        Dense(128, N * na; initW = glorot_uniform(rng)),
                    ) |> cpu,
                ),
                stack_size = nothing,
                batch_size = 32,
                update_horizon = 1,
                min_replay_history = 100,
                update_freq = 1,
                target_update_freq = 100,
                n_quantile = N,
            ),
            explorer = EpsilonGreedyExplorer(
                kind = :exp,
                ϵ_stable = 0.01,
                decay_steps = 500,
                rng = rng,
            ),
        ),
        trajectory = CircularArraySARTTrajectory(
            capacity = 1000,
            state = Vector{Float32} => (ns,),
        ),
    )

    stop_condition = StopAfterStep(10_000)

    hook = ComposedHook(TotalRewardPerEpisode())

    description = """
    This experiment uses the `QRDQNLearner` method with three dense layers to approximate the quantile values.
    The testing environment is CartPoleEnv.
    """

    Experiment(agent, env, stop_condition, hook, description)
end
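For reference, a sketch of launching this experiment, assuming the package exports above are available and that an `Experiment` can be run directly with `run` (as the test/runtests.jl change below does):

using ReinforcementLearningZoo  # assumed entry point for RLCore, the learners, and the experiments

ex = RLCore.Experiment(Val(:JuliaRL), Val(:QRDQN), Val(:CartPole), nothing; seed = 123)
run(ex)  # trains for 10_000 steps on CartPole, recording total reward per episode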

test/runtests.jl

Lines changed: 1 addition & 1 deletion

@@ -33,7 +33,7 @@ end
 @testset "training" begin
     mktempdir() do dir
-        for method in (:BasicDQN, :BC, :DQN, :PrioritizedDQN, :Rainbow, :REMDQN, :IQN, :VPG)
+        for method in (:BasicDQN, :BC, :DQN, :PrioritizedDQN, :Rainbow, :QRDQN, :REMDQN, :IQN, :VPG)
             res = run(
                 Experiment(
                     Val(:JuliaRL),
