gwern comments on The Absent-Minded Driver

gwern 19 Jan 2016 16:57 UTC

1 point

While I was at it, I thought I’d give some fancier algorithms a spin using Karpathy’s Reinforce.js RL library (GitHub). The DQN may be able to do much better since it can potentially observe the similarity of payoffs and infer the underlying function to maximize.

First a JS implementation of the Absent-Minded Driver simulation:

/**
 * Draw a single 0/1 sample.
 *
 * @param {number} [p=0.5] - Probability of returning 1.
 * @returns {number} 1 when a fresh Math.random() draw is below `p`, otherwise 0.
 */
function getRandBinary(p = 0.5) {
  // Declared locally: assigning to an undeclared `rand` leaks a global in
  // sloppy mode and throws a ReferenceError in strict mode / ES modules.
  const rand = Math.random();
  return rand >= p ? 0 : 1;
}
/**
 * Map the two binary decisions of one run to its payoff.
 *
 * @param {number} p1 - First decision flag; a value loosely equal to 1 ends the run with payoff 0.
 * @param {number} p2 - Second decision flag; only consulted when `p1` is not 1.
 * @returns {number} 0 when `p1` is 1; otherwise 4 when `p2` is 1, else 1.
 */
function absentMindedDriver(p1, p2) {
  // Loose equality is kept deliberately so that flags such as `true` or "1"
  // are treated exactly as before.
  if (p1 == 1) {
    return 0;
  }
  return p2 == 1 ? 4 : 1;
}
/**
 * Simulate a single run: take two independent binary draws with the same
 * probability and score them.
 *
 * @param {number} p - Probability forwarded to getRandBinary for each draw.
 * @returns {number} The payoff absentMindedDriver assigns to the two draws.
 */
function runDriver(p) {
  // Both draws are always taken, in this order, even when the first one
  // already decides the payoff. They are locals now; the original assigned
  // to undeclared `p1`/`p2`, creating globals (and throwing in strict mode).
  const p1 = getRandBinary(p);
  const p2 = getRandBinary(p);
  return absentMindedDriver(p1, p2);
}
/**
 * Run the simulation repeatedly and collect every payoff.
 *
 * @param {number} [p=1/3] - Probability forwarded to runDriver on every iteration.
 * @param {number} [iters=1000] - Number of runs to simulate.
 * @returns {number[]} The payoffs in the order they were produced; empty when `iters` is 0 or less.
 */
function runDrivers(p = (1 / 3), iters = 1000) {
  const rewards = [];
  // `let` keeps the counter local; the original's undeclared `i` was a shared
  // global (and a ReferenceError in strict mode / ES modules).
  for (let i = 0; i < iters; i++) {
    rewards.push(runDriver(p));
  }
  return rewards;
}

Now we can use Reinforce.js to set up and run a deep Q-learning agent (but not too deep since this runs in-browser and there’s no need for hundreds or thousands of neurons for a tiny problem like this):

// Load Reinforce.js by injecting a <script> element into the page.
// The file is fetched asynchronously, so the `RL` global used further down is
// only available once the script has finished loading — run the rest after that.
// NOTE(review): raw.githubusercontent.com is believed to serve files as
// text/plain with `X-Content-Type-Options: nosniff`, which browsers may refuse
// to execute as a script — confirm, or host rl.js somewhere that serves it as JavaScript.
const script = document.createElement("script");
// The closing quote was a typographic double-prime (″), which is a syntax error.
script.src = "https://raw.githubusercontent.com/karpathy/reinforcejs/master/lib/rl.js";
document.body.appendChild(script);

// Set up a DQN RL agent.
// The environment object only has to report the size of the state and action spaces.
const env = {
  // No state features: the problem is one-shot, so the agent observes nothing.
  getNumStates() { return 0; },
  // 100 discrete actions. The sample log includes "Action: 0", so actions are
  // 0-based indices (presumably 0-99, not 1-100 — confirm against the library docs).
  getMaxNumActions() { return 100; },
};

// Create the DQN agent: relatively small, since this is an easy problem;
// hold onto all data (replay size matches the 10100 steps run) & process heavily
// (100 learning steps per iteration).
// NOTE(review): no gamma or epsilon is set here, so the library defaults apply.
const spec = {
  num_hidden_units: 25,
  experience_add_every: 1,
  learning_steps_per_iteration: 100,
  experience_size: 10100,
};
// Declared explicitly; the original assigned to an undeclared `agent`.
const agent = new RL.DQNAgent(env, spec);

// OK, now let it free-run; we give it an extra 100 actions for equality with
// the MAB's initial 100 exploratory pulls.
for (let i = 0; i < 10100; i++) {
  const s = []; // MAB is one-shot, so no global state
  const action = agent.act(s);
  // Execute the action in the environment and get the reward: the action
  // index is divided by 100 to turn it into the probability given to runDriver.
  const reward = runDriver(action / 100);
  // Plain string delimiters here; the original used typographic quotes (“ ”),
  // which are not valid JavaScript string delimiters.
  console.log(`Action: ${action}; reward: ${reward}`);
  agent.learn(reward); // the agent improves its Q, policy, model, etc.
}
// …Action: 15; reward: 1
// Action: 34; reward: 0
// Action: 34; reward: 4
// Action: 21; reward: 4
// Action: 15; reward: 1
// Action: 43; reward: 0
// Action: 21; reward: 1
// Action: 43; reward: 1
// Action: 34; reward: 4
// Action: 15; reward: 1
// Action: 78; reward: 0
// Action: 15; reward: 0
// Action: 15; reward: 1
// Action: 34; reward: 1
// Action: 43; reward: 1
// Action: 34; reward: 4
// Action: 24; reward: 1
// Action: 0; reward: 1
// Action: 34; reward: 0
// Action: 3; reward: 1
// Action: 51; reward: 4
// Action: 3; reward: 1
// Action: 11; reward: 4
// Action: 3; reward: 1
// Action: 11; reward: 1
// Action: 43; reward: 4
// Action: 36; reward: 4
// Action: 36; reward: 0
// Action: 21; reward: 4
// Action: 77; reward: 0
// Action: 21; reward: 4
// Action: 26; reward: 4

Overall, I am not too impressed by the DQN. It looks like it is doing worse than the MAB was, despite having a much more flexible model to use. I don’t know if this reflects the better exploration of Thompson sampling compared to the DQN’s epsilon-greedy, or if my fiddling with the hyperparameters failed to help. It’s a small enough problem that a Bayesian NN is probably computationally feasible, but I dunno how to use those.