Commit 9cc6687

wip: running bandits experiments
1 parent 6e9df77 commit 9cc6687

5 files changed: +28 -22 lines

noderl/multiarm_bandits/BaseAgent.ts

Lines changed: 8 additions & 14 deletions
@@ -1,34 +1,33 @@
 import { argmax, full, zeros, range } from '../utils/lists'
 import { choice } from '../utils/random'
 
-export default class GreedyAgent {
-  protected arms_count: number[]
+export default class BaseAgent {
+  public name: string
+  protected arm_counts: number[]
   protected q_values: number[]
   protected actions: number[]
-  protected explored: number = 0
-  protected exploited: number = 0
 
   get estimates(): number[] {
     return this.q_values
   }
 
   get counts(): number[] {
-    return this.arms_count
+    return this.arm_counts
   }
 
   constructor(n_arms: number, init_value: number = 0) {
-    this.arms_count = zeros(n_arms)
+    this.name = `${this.constructor.name}_arms-${n_arms}_initvals-${init_value}`
+    this.arm_counts = zeros(n_arms)
     this.q_values = full(n_arms, init_value)
     this.actions = range(n_arms)
   }
 
   pull(arm: number) {
-    this.arms_count[arm]++
+    this.arm_counts[arm]++
     return arm
   }
 
   random_action(): number {
-    this.explored++
     const action = choice(this.actions)
     return this.pull(action)
   }
@@ -39,14 +38,9 @@ export default class GreedyAgent {
 
   optimize(action: number, reward: number) {
     const old_stimate = this.q_values[action]
-    const step_size = 1/this.arms_count[action]
+    const step_size = 1/this.arm_counts[action]
 
     // Algorithm in section 2.4 of Sutton & Barto book
     this.q_values[action] = old_stimate + step_size * (reward - old_stimate)
   }
-
-  explore_exploit_rate(): number[] {
-    const total = this.explored + this.exploited
-    return [this.explored/total, this.exploited/total]
-  }
 }
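
Note on the update rule: optimize implements the incremental sample-average estimate from section 2.4 of Sutton & Barto, Q_{n+1} = Q_n + (1/n)(R_n - Q_n), a running mean of each arm's rewards that needs no reward history. A minimal standalone sketch of the same rule (the helper name is illustrative, not part of this commit):

// Running mean via the incremental rule Q <- Q + (1/n)(R - Q).
// After n rewards, q equals the plain arithmetic mean.
function incrementalMean(rewards: number[]): number {
  let q = 0
  let n = 0
  for (const r of rewards) {
    n++
    q = q + (1 / n) * (r - q)  // same step as BaseAgent.optimize
  }
  return q
}

console.log(incrementalMean([1, 0, 1, 1]))  // 0.75, the mean of the four rewards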

noderl/multiarm_bandits/EpsilonGreegyAgent.ts

Lines changed: 10 additions & 0 deletions
@@ -4,17 +4,27 @@ import GreedyAgent from './GreedyAgent'
 
 export default class EpsilonGreedyAgent extends GreedyAgent {
   private eps: number
+  protected explored: number = 0
+  protected exploited: number = 0
 
   constructor(n_arms: number, eps: number, init_value: number = 0) {
     super(n_arms, init_value)
+    this.name += `_eps-${eps}`
     this.eps = eps
   }
 
   act(): number {
     if (uniform() < this.eps) {
+      this.explored++
       return this.random_action()
     } else {
+      this.exploited++
       return this.greedy_action()
     }
   }
+
+  explore_exploit_rate(): number[] {
+    const total = this.explored + this.exploited
+    return [this.explored/total, this.exploited/total]
+  }
 }
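
With this change the explore/exploit bookkeeping lives where the decision is made: act() explores with probability eps and exploits otherwise, so explore_exploit_rate() should converge to roughly [eps, 1 - eps]. A usage sketch, assuming a loop like the one in training.ts:

import EpsilonGreedyAgent from './EpsilonGreegyAgent'

// eps = 0.01: about 1% of actions are random exploration.
const agent = new EpsilonGreedyAgent(10, 0.01)
for (let t = 0; t < 100000; t++) {
  const action = agent.act()
  // ...step the environment here and call agent.optimize(action, reward)
}
console.log(agent.explore_exploit_rate())  // ≈ [0.01, 0.99]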

noderl/multiarm_bandits/GreedyAgent.ts

Lines changed: 0 additions & 1 deletion
@@ -5,7 +5,6 @@ export default class GreedyAgent extends BaseAgent
 
   greedy_action(): number {
     // Greedy action
-    this.exploited++
     const action = argmax(this.q_values)
     return this.pull(action)
   }

noderl/multiarm_bandits/UCB1Agent.ts

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@ export default class UCB1Agent extends BaseAgent
   ucb_action() {
     const plays = ++this.plays
     const values = this.q_values.map((estimate, i) => {
-      const arm_count = this.arms_count[i]
+      const arm_count = this.arm_counts[i]
       return estimate + Math.sqrt(2 * Math.log(plays)/ arm_count)
     })
 
noderl/multiarm_bandits/training.ts

Lines changed: 9 additions & 6 deletions
@@ -4,10 +4,12 @@ import GreedyAgent from './GreedyAgent'
 import EpsilonGreegyAgent from './EpsilonGreegyAgent'
 import UCB1Agent from './UCB1Agent'
 import { LOG_PATH } from '../config'
+import { full } from '../utils/lists'
 
-const PROB_DIST = [0.2, 0.5, 0.75]
-const REWARD_DIST = [1, 1, 1]
-const INIT_VALUE = 10
+const PROB_DIST = [0.2, 0.5, 0.75, 0.15, 0.01, 0.92, 0.88, 0.36, 0.79, 0.9]
+const N = PROB_DIST.length
+const REWARD_DIST = full(N, 1)
+const INIT_VALUE = 0
 const EPSILON = 0.01
 const EPISODES = 100000
 
@@ -18,16 +20,17 @@ const env = new BanditEnv(PROB_DIST, REWARD_DIST)
 // const agent = new EpsilonGreegyAgent(PROB_DIST.length, EPSILON, INIT_VALUE)
 const agent = new UCB1Agent(PROB_DIST.length, INIT_VALUE)
 
-const writer = tf.node.summaryFileWriter(`${LOG_PATH}/bandits`)
+const writer = tf.node.summaryFileWriter(`${LOG_PATH}/${agent.name}`)
 
 let total_reward = 0
 
 if (agent instanceof UCB1Agent) {
-  PROB_DIST.forEach((_, i) => {
+  // Initialization: try every arm once
+  for (let i = 0; i < N; i++) {
     const action = agent.pull(i)
     const reward = env.step(action)
     agent.optimize(action, reward)
-  })
+  }
 }
 
 for (let e = 0; e < EPISODES; e++) {
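
Two reading notes on this file. First, INIT_VALUE drops from 10 to 0, so the optimistic-initial-values trick is switched off now that UCB1 drives exploration explicitly. Second, the PROB_DIST/REWARD_DIST pair suggests a Bernoulli-style bandit (an assumption from usage here, not confirmed by the diff): arm i pays REWARD_DIST[i] with probability PROB_DIST[i], so its expected reward is their product and arm 5 (p = 0.92) is the one the agents should converge to:

const PROB_DIST = [0.2, 0.5, 0.75, 0.15, 0.01, 0.92, 0.88, 0.36, 0.79, 0.9]
// With REWARD_DIST all ones, expected reward per arm is just its probability.
const expected = PROB_DIST.map((p) => p * 1)
const best = expected.indexOf(Math.max(...expected))
console.log(best, expected[best])  // 5 0.92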
