This repository was archived by the owner on May 6, 2021. It is now read-only.

Commit 022c1fd

albheim and findmyway authored
SAC multidimensional actions (#173)
* Switch sigma to log_sigma
* Replace SAC network with gaussian network
* Missed a logsigma spot...
* Remove unwanted prints
* Remove na from example
* Seems to be running multidim actions
* Remove StructArray
* Cleanup
* Cleanup
* Update src/algorithms/policy_gradient/sac.jl
  Co-authored-by: Jun Tian <[email protected]>
* Add more missed logsigma spots
* Undo erroneous change
* Track and log reward and entropy terms
* Add link to paper

Co-authored-by: Jun Tian <[email protected]>
1 parent 4a2417b commit 022c1fd

2 files changed

Lines changed: 33 additions & 16 deletions

File tree

src/algorithms/policy_gradient/sac.jl

Lines changed: 17 additions & 8 deletions
@@ -23,6 +23,9 @@ mutable struct SACPolicy{
     update_every::Int
     step::Int
     rng::R
+    # Logging
+    reward_term::Float32
+    entropy_term::Float32
 end

 """
@@ -49,6 +52,8 @@ end
 `policy` is expected to output a tuple `(μ, logσ)` of mean and
 log standard deviations for the desired action distributions, this
 can be implemented using a `GaussianNetwork` in a `NeuralNetworkApproximator`.
+
+Implemented based on http://arxiv.org/abs/1812.05905
 """
 function SACPolicy(;
     policy,
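
Note (not part of this commit): the `(μ, logσ)` interface described above corresponds to the squashed-Gaussian policy of the linked paper. Below is a minimal, self-contained sketch of how such a policy can sample an action and its log-probability with the reparameterization trick; the library's actual `GaussianNetwork`/`evaluate` may differ in details, and the helper name `sample_squashed` is hypothetical.

using Random

# Sketch only: reparameterized sampling for a squashed Gaussian policy
# (http://arxiv.org/abs/1812.05905). `model(s)` is assumed to return a
# tuple (μ, logσ), each of size (na, batch).
function sample_squashed(model, s; rng = Random.default_rng())
    μ, logσ = model(s)
    σ = exp.(logσ)
    ε = randn(rng, Float32, size(μ))
    u = μ .+ σ .* ε                       # reparameterization trick
    a = tanh.(u)                          # squash each action dimension into (-1, 1)
    # log π(a|s): diagonal Gaussian log-density plus the tanh change-of-variables term
    logπ = sum(-0.5f0 .* ((u .- μ) ./ σ) .^ 2 .- logσ .- 0.5f0 * log(2f0 * π), dims = 1) .-
           sum(log.(1f0 .- a .^ 2 .+ 1f-6), dims = 1)
    return a, logπ
end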
@@ -85,6 +90,8 @@ function SACPolicy(;
         update_every,
         step,
         rng,
+        0f0,
+        0f0,
     )
 end

@@ -99,12 +106,12 @@ function (p::SACPolicy)(env)
         s = state(env)
         s = Flux.unsqueeze(s, ndims(s) + 1)
         # trainmode:
-        action = evaluate(p, s)[1][] # returns action as scalar
+        action = dropdims(evaluate(p, s)[1], dims=2) # Single action vec, drop second dim

         # testmode:
         # if testing dont sample an action, but act deterministically by
         # taking the "mean" action
-        # action = p.policy(s)[1][] # returns action as scalar
+        # action = dropdims(p.policy(s)[1], dims=2)
     end
 end
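
Note on the shape change above (not part of the diff): with multidimensional actions, `evaluate(p, s)[1]` returns a matrix of size (na, batch) rather than a scalar, so the singleton batch dimension is dropped to hand the environment a plain action vector. A small illustrative sketch with made-up sizes:

using Flux

ns, na = 3, 2                             # hypothetical state/action dimensions
s = rand(Float32, ns)                     # raw state, size (ns,)
s = Flux.unsqueeze(s, ndims(s) + 1)       # size (ns, 1): a batch containing one state
a_batched = rand(Float32, na, 1)          # stand-in for evaluate(p, s)[1], size (na, 1)
a = dropdims(a_batched, dims = 2)         # size (na,): the action vector passed to the env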

@@ -137,17 +144,13 @@ function RLBase.update!(p::SACPolicy, batch::NamedTuple{SARTS})

     γ, ρ, α = p.γ, p.ρ, p.α

-    # !!! we have several assumptions here, need revisit when we have more complex environments
-    # state is vector
-    # action is scalar
     a′, log_π = evaluate(p, s′)
     q′_input = vcat(s′, a′)
     q′ = min.(p.target_qnetwork1(q′_input), p.target_qnetwork2(q′_input))

-    y = r .+ γ .* (1 .- t) .* vec((q′ .- α .* log_π))
+    y = r .+ γ .* (1 .- t) .* vec(q′ .- α .* log_π)

     # Train Q Networks
-    a = Flux.unsqueeze(a, 1)
     q_input = vcat(s, a)

     q_grad_1 = gradient(Flux.params(p.qnetwork1)) do
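
Note (not part of the diff): in the target computation above, `q′` and `log_π` come out of the networks as 1×batch matrices while `r` and `t` are length-batch vectors, so `vec` flattens the bracketed term before broadcasting. A shape sketch with dummy data:

# Illustrative only: the soft Bellman target y = r + γ(1 - t)(min Q′ - α⋅logπ)
γ, α = 0.99f0, 0.2f0
batch = 4
r = rand(Float32, batch)              # rewards, size (batch,)
t = zeros(Float32, batch)             # terminal flags, size (batch,)
q′ = rand(Float32, 1, batch)          # min of the two target critics, size (1, batch)
log_π = rand(Float32, 1, batch)       # log-probabilities from evaluate, size (1, batch)
y = r .+ γ .* (1 .- t) .* vec(q′ .- α .* log_π)   # size (batch,)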
@@ -166,7 +169,13 @@ function RLBase.update!(p::SACPolicy, batch::NamedTuple{SARTS})
         a, log_π = evaluate(p, s)
         q_input = vcat(s, a)
         q = min.(p.qnetwork1(q_input), p.qnetwork2(q_input))
-        mean(α .* log_π .- q)
+        reward = mean(q)
+        entropy = mean(log_π)
+        ignore() do
+            p.reward_term = reward
+            p.entropy_term = entropy
+        end
+        α * entropy - reward
     end
     update!(p.policy, p_grad)
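
Note (not part of the diff): the `ignore() do ... end` block above stashes the two terms on the policy purely for logging; with Zygote (Flux's AD backend) the enclosed code runs but is excluded from differentiation, so the returned loss is still `α * entropy - reward`. A minimal standalone sketch of the same pattern:

using Zygote

logged = Ref(0.0)
grad = Zygote.gradient(3.0) do x
    loss = x^2
    Zygote.ignore() do
        logged[] = loss       # side effect for logging; not differentiated
    end
    loss
end
# grad == (6.0,) and logged[] == 9.0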

src/experiments/rl_envs/JuliaRL_SAC_Pendulum.jl

Lines changed: 16 additions & 8 deletions
@@ -18,10 +18,11 @@ function RLCore.Experiment(
     low = A.left
     high = A.right
     ns = length(state(inner_env))
+    na = 1

     env = ActionTransformedEnv(
         inner_env;
-        action_mapping = x -> low + (x + 1) * 0.5 * (high - low),
+        action_mapping = x -> low + (x[1] + 1) * 0.5 * (high - low),
     )
     init = glorot_uniform(rng)
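
Note (not part of the diff): the policy now emits a length-1 action vector, so the mapping indexes `x[1]` before rescaling the squashed action from (-1, 1) to the environment's torque range [low, high] (roughly [-2, 2] for Pendulum). A quick check of the affine map, with illustrative bounds:

low, high = -2.0, 2.0    # illustrative; actual values come from the env's action space
action_mapping = x -> low + (x[1] + 1) * 0.5 * (high - low)
action_mapping([-1.0])   # -2.0, the lower bound
action_mapping([0.0])    #  0.0, the midpoint
action_mapping([1.0])    #  2.0, the upper bound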

@@ -31,15 +32,15 @@ function RLCore.Experiment(
                 Dense(ns, 30, relu),
                 Dense(30, 30, relu),
             ),
-            μ = Chain(Dense(30, 1, initW = init)),
-            logσ = Chain(Dense(30, 1, x -> clamp.(x, typeof(x)(-10), typeof(x)(2)), initW = init)),
+            μ = Chain(Dense(30, na, initW = init)),
+            logσ = Chain(Dense(30, na, x -> clamp.(x, typeof(x)(-10), typeof(x)(2)), initW = init)),
         ),
         optimizer = ADAM(0.003),
     )

     create_q_net() = NeuralNetworkApproximator(
         model = Chain(
-            Dense(ns + 1, 30, relu; initW = init),
+            Dense(ns + na, 30, relu; initW = init),
             Dense(30, 30, relu; initW = init),
             Dense(30, 1; initW = init),
         ),
@@ -58,15 +59,15 @@ function RLCore.Experiment(
             α = 0.2f0,
             batch_size = 64,
             start_steps = 1000,
-            start_policy = RandomPolicy(-1.0..1.0; rng = rng),
+            start_policy = RandomPolicy(Space([-1.0..1.0 for _ in 1:na]); rng = rng),
             update_after = 1000,
             update_every = 1,
             rng = rng,
         ),
         trajectory = CircularArraySARTTrajectory(
             capacity = 10000,
             state = Vector{Float32} => (ns,),
-            action = Float32 => (),
+            action = Vector{Float32} => (na,),
         ),
     )
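
Note (not part of the diff): the warm-up policy now samples from a `Space` of `na` intervals instead of a single interval, and the trajectory stores length-`na` action vectors, keeping the replay layout consistent with the new policy output. A hedged sketch of the idea, assuming the ReinforcementLearning.jl API of this period (the value of `na` here is made up):

using ReinforcementLearning, IntervalSets   # Space, RandomPolicy, and the .. interval syntax

na = 2                                                    # hypothetical action dimension
start_policy = RandomPolicy(Space([-1.0..1.0 for _ in 1:na]))
# Calling start_policy on an env draws one value per interval, i.e. a length-na
# vector in [-1, 1]^na, matching `action = Vector{Float32} => (na,)` in the trajectory.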

@@ -76,9 +77,16 @@ function RLCore.Experiment(
     hook = ComposedHook(
         total_reward_per_episode,
         time_per_step,
-        DoEveryNEpisode() do t, agent, env
+        DoEveryNStep() do t, agent, env
             with_logger(lg) do
-                @info "training" reward = total_reward_per_episode.rewards[end]
+                @info(
+                    "training",
+                    reward_term = agent.policy.reward_term,
+                    entropy_term = agent.policy.entropy_term,
+                )
+                if is_terminated(env)
+                    @info "training" reward = total_reward_per_episode.reward log_step_increment = 0
+                end
             end
         end,
     )
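
Note (not part of the diff): the hook now logs the reward and entropy terms every environment step and the episode return only when the episode terminates; `log_step_increment = 0` keeps that extra record on the same TensorBoard step. A minimal sketch of the pattern, assuming `lg` is a `TensorBoardLogger.TBLogger` as set up earlier in this experiment file (the log directory below is hypothetical):

using TensorBoardLogger, Logging

lg = TBLogger("tensorboard_logs/sac_demo")
with_logger(lg) do
    @info "training" reward_term = 1.0 entropy_term = -0.5      # advances the step counter
    @info "training" reward = 123.4 log_step_increment = 0      # logged at the same step
end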

0 commit comments
