Commit bcd9bea

speed up epsilon greedy policies
1 parent dfe12d5 commit bcd9bea

File tree

2 files changed: +50 -41 lines


src/epsilongreedypolicies.jl

Lines changed: 36 additions & 33 deletions
@@ -5,25 +5,6 @@ abstract type AbstractEpsilonGreedyPolicy end
         VeryOptimisticEpsilonGreedyPolicy,
         PesimisticEpsilonGreedyPolicy)
 
-
-for (typ, max, rel) in ((OptimisticEpsilonGreedyPolicy, maximumbelowInf, :(>=)),
-                        (VeryOptimisticEpsilonGreedyPolicy, maximum, :(==)),
-                        (PesimisticEpsilonGreedyPolicy, maximumbelowInf, :(==)))
-    @eval function getgreedystates(policy::$typ, values)
-        a = Int64[]
-        vmax = $max(values)
-        if isnan(vmax)
-            error("NaN encountered in getgreedystates: $values")
-        end
-        for (i, v) in enumerate(values)
-            if ($rel)(v, vmax)
-                push!(a, i)
-            end
-        end
-        a
-    end
-end
-
 const EpsilonGreedyPolicy = VeryOptimisticEpsilonGreedyPolicy
 export EpsilonGreedyPolicy
 
@@ -63,21 +44,43 @@ where never chosen before.
 """ PesimisticEpsilonGreedyPolicy
 
 
-function selectaction(policy::AbstractEpsilonGreedyPolicy, values)
-    if rand() < policy.ϵ
-        rand(1:length(values))
-    else
-        rand(getgreedystates(policy, values))
+for (typ, max, rel) in ((OptimisticEpsilonGreedyPolicy, maximumbelowInf, :(>=)),
+                        (VeryOptimisticEpsilonGreedyPolicy, maximum, :(==)),
+                        (PesimisticEpsilonGreedyPolicy, maximumbelowInf, :(==)))
+    @eval function selectaction(policy::$typ, values)
+        if rand() < policy.ϵ
+            rand(1:length(values))
+        else
+            vmax = $max(values)
+            c = 1
+            a = 1
+            for (i, v) in enumerate(values)
+                if ($rel)(v, vmax)
+                    if rand() < 1/c
+                        a = i
+                    end
+                    c += 1
+                end
+            end
+            a
+        end
     end
-end
-
-function getactionprobabilities(policy::AbstractEpsilonGreedyPolicy, values)
-    p = ones(length(values))/length(values) * policy.ϵ
-    a = getgreedystates(policy, values)
-    p2 = (1. - policy.ϵ)/length(a)
-    for i in a
-        p[i] =+ p2
+    @eval function getactionprobabilities(policy::$typ, values)
+        p = ones(length(values))/length(values) * policy.ϵ
+        vmax = $max(values)
+        c = 0
+        for v in values
+            if ($rel)(v, vmax)
+                c += 1
+            end
+        end
+        p2 = (1. - policy.ϵ)/c
+        for (i, v) in enumerate(values)
+            if ($rel)(v, vmax)
+                p[i] += p2
+            end
+        end
+        p
     end
-    p
 end
 

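Where the speed-up comes from: the old code built an index array of all greedy actions with getgreedystates and then sampled from it, allocating on every call. The new selectaction instead picks uniformly among the tied maxima in a single allocation-free pass, a size-1 reservoir sample: the c-th tie replaces the running candidate with probability 1/c, so the k-th of m ties ends up selected with probability (1/k) * (k/(k+1)) * ... * ((m-1)/m) = 1/m. Likewise getactionprobabilities now counts the greedy actions inline: every action receives ϵ/length(values), and the remaining 1 - ϵ mass is split evenly over the c greedy ones. A minimal standalone sketch of the sampling trick (argmaxrand and the plain maximum/== comparison are illustrative choices, not names from the package):

    # One-pass uniform tie-breaking: a size-1 reservoir sample over the
    # maximal entries. `argmaxrand` is an illustrative name only.
    function argmaxrand(values)
        vmax = maximum(values)
        a = 1    # current candidate index
        c = 1    # count of maximal entries seen so far, plus one
        for (i, v) in enumerate(values)
            if v == vmax
                # the c-th tie replaces the candidate with probability 1/c,
                # leaving each of m ties selected with probability 1/m
                if rand() < 1/c
                    a = i
                end
                c += 1
            end
        end
        a
    end

For example, [argmaxrand([0., 1., 0., 1.]) for _ in 1:10^5] should return 2 and 4 roughly equally often.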
test/epsilongreedypolicies.jl

Lines changed: 14 additions & 8 deletions
@@ -1,10 +1,8 @@
-getgreedystates = ReinforcementLearning.getgreedystates
-for (v, rO, rVO, r, rP) in (([-9., 12., Inf64], [2, 3], [3], [3], [2]),
-                            ([-9., -12.], [1], [1], [1], [1]),
-                            ([Inf64, Inf64], [1, 2], [1, 2], [1, 2], [1, 2]))
-    @test getgreedystates(OptimisticEpsilonGreedyPolicy(0.), v) == rO
-    @test getgreedystates(VeryOptimisticEpsilonGreedyPolicy(0.), v) == rVO
-    @test getgreedystates(PesimisticEpsilonGreedyPolicy(0.), v) == rP
+import ReinforcementLearning: selectaction
+
+function empiricalactionprop(p, v; n = 10^6)
+    res = [selectaction(p, v) for _ in 1:n]
+    map(x -> length(find(i -> i == x, res)), 1:length(v))./n
 end
 
 for (v, rO, rVO, r, rP) in (([-9., 12., Inf64], [0, .5, .5], [0, 0., 1.],
@@ -16,7 +14,15 @@ for (v, rO, rVO, r, rP) in (([-9., 12., Inf64], [0, .5, .5], [0, 0., 1.],
     @test getactionprobabilities(OptimisticEpsilonGreedyPolicy(0.), v) == rO
     @test getactionprobabilities(VeryOptimisticEpsilonGreedyPolicy(0.), v) == rVO
     @test getactionprobabilities(PesimisticEpsilonGreedyPolicy(0.), v) == rP
+    @test isapprox(empiricalactionprop(OptimisticEpsilonGreedyPolicy(0.), v),
+                   rO, atol = .05)
+    @test isapprox(empiricalactionprop(VeryOptimisticEpsilonGreedyPolicy(0.), v),
+                   rVO, atol = .05)
+    @test isapprox(empiricalactionprop(PesimisticEpsilonGreedyPolicy(0.), v),
+                   rP, atol = .05)
+    @test isapprox(empiricalactionprop(OptimisticEpsilonGreedyPolicy(.2), v),
+                   getactionprobabilities(OptimisticEpsilonGreedyPolicy(.2), v),
+                   atol = .05)
 end
 
 
-

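Note that the find call in empiricalactionprop is Julia 0.6 API; it was renamed findall in Julia 0.7 and removed in 1.0. On current Julia the same empirical frequency estimate could be written as, for example (a sketch under that assumption, not part of the commit):

    import ReinforcementLearning: selectaction

    # Fraction of n draws in which each action 1:length(v) was selected;
    # `count(==(x), res)` replaces the Julia-0.6 `find` idiom.
    function empiricalactionprop(p, v; n = 10^6)
        res = [selectaction(p, v) for _ in 1:n]
        [count(==(x), res) for x in 1:length(v)] ./ n
    end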