karpathy / convnetjs

Deep Learning in Javascript. Train Convolutional Neural Networks (or ordinary ones) in your browser.


average_loss_window has a value of NaN

RyleyGG opened this issue · comments

commented

Hello,

I've recently started using this library and it has been exciting so far. However, it appears that the backward() function isn't producing proper values, and as a result I'm not really able to train anything.

Specifically, the variable avcost gets a value of NaN. Tracing the issue back to the policy function, it appears that maxval is NaN because action_values.w contains NaN for all of its values, which in turn seems to be because all of the layers associated with value_net have Float64Arrays full of NaNs pretty much across the board. I'm using the layer setup found in the rldemo demonstration, so I'm not really sure how to progress past this.
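
For anyone who wants to reproduce the check, something along these lines works from the browser console (just a sketch; it assumes the Brain instance is reachable as brain and uses convnetjs's Net.getParamsAndGrads(), which the bundled build exposes):

    // Count NaN entries in every trainable parameter group of the value network.
    var pg = brain.value_net.getParamsAndGrads();
    for (var i = 0; i < pg.length; i++) {
        var w = pg[i].params, bad = 0;
        for (var j = 0; j < w.length; j++) { if (isNaN(w[j])) { bad++; } }
        console.log('param group ' + i + ': ' + bad + ' / ' + w.length + ' NaN');
    }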

Any help is appreciated. Sorry for any formatting issues, I don't regularly use GitHub.

Thanks.

commented

@NoamGaash Of course.

Here is the backward function I was referring to, found in build/deepqlearn.js

backward: function(reward) {
  this.latest_reward = reward;
  this.average_reward_window.add(reward);
  this.reward_window.shift();
  this.reward_window.push(reward);
  if(!this.learning) { return; }
  // various book-keeping
  this.age += 1;
  // it is time t+1 and we have to store (s_t, a_t, r_t, s_{t+1}) as new experience
  // (given that an appropriate number of state measurements already exist, of course)
  if(this.forward_passes > this.temporal_window + 1) {
    var e = new Experience();
    var n = this.window_size;
    e.state0 = this.net_window[n-2];
    e.action0 = this.action_window[n-2];
    e.reward0 = this.reward_window[n-2];
    e.state1 = this.net_window[n-1];
    if(this.experience.length < this.experience_size) {
      this.experience.push(e);
    } else {
      // replace. finite memory!
      var ri = convnetjs.randi(0, this.experience_size);
      this.experience[ri] = e;
    }
  }
  // learn based on experience, once we have some samples to go on
  // this is where the magic happens...
  if(this.experience.length > this.start_learn_threshold) {
    var avcost = 0.0;
    for(var k=0;k < this.tdtrainer.batch_size;k++) {
      var re = convnetjs.randi(0, this.experience.length);
      var e = this.experience[re];
      var x = new convnetjs.Vol(1, 1, this.net_inputs);
      x.w = e.state0;
      var maxact = this.policy(e.state1);
      var r = e.reward0 + this.gamma * maxact.value;
      var ystruct = {dim: e.action0, val: r};
      var loss = this.tdtrainer.train(x, ystruct);
      avcost += loss.loss;
    }
    avcost = avcost/this.tdtrainer.batch_size;
    this.average_loss_window.add(avcost);
  }
},

Specifically, these are the lines I believe are associated with my issue:

// learn based on experience, once we have some samples to go on
// this is where the magic happens...
if(this.experience.length > this.start_learn_threshold) {
  var avcost = 0.0;
  for(var k=0;k < this.tdtrainer.batch_size;k++) {
    var re = convnetjs.randi(0, this.experience.length);
    var e = this.experience[re];
    var x = new convnetjs.Vol(1, 1, this.net_inputs);
    x.w = e.state0;
    var maxact = this.policy(e.state1);
    var r = e.reward0 + this.gamma * maxact.value;
    var ystruct = {dim: e.action0, val: r};
    var loss = this.tdtrainer.train(x, ystruct);
    avcost += loss.loss;
  }
  avcost = avcost/this.tdtrainer.batch_size;
  this.average_loss_window.add(avcost);
}

After avcost += loss.loss, avcost has a value of NaN. loss is defined here:

var loss = this.tdtrainer.train(x, ystruct);

loss.cost_loss, loss.loss, and loss.softmax_loss all have a value of NaN for me. loss is partially dependent on the value of ystruct -- I found that ystruct.val was also returning NaN. The variable used for ystruct.val, r, is defined here:

var r = e.reward0 + this.gamma * maxact.value;

I found that e.reward0 and this.gamma both had the values they should, but maxact.value did not, again returning NaN. maxact is defined here:

var maxact = this.policy(e.state1);

e.state1 has a valid value from what I can tell, and so it is my belief that my issue lies somewhere in the policy function, which is defined here:

policy: function(s) {
  // compute the value of doing any action in this state
  // and return the argmax action and its value
  var svol = new convnetjs.Vol(1, 1, this.net_inputs);
  svol.w = s;
  var action_values = this.value_net.forward(svol);
  var maxk = 0;
  var maxval = action_values.w[0];
  for(var k=1;k<this.num_actions;k++) {
    if(action_values.w[k] > maxval) { maxk = k; maxval = action_values.w[k]; }
  }
  return {action:maxk, value:maxval};
},

One of the values policy returns is maxval, which is also getting a value of NaN. maxval is defined as var maxval = action_values.w[0];, and in my case action_values.w is returning all NaN values.
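
(Side note: once action_values.w[0] is NaN the loop can never recover, because any comparison involving NaN evaluates to false. A quick plain-JS illustration, not library code:)

    var maxval = NaN;
    console.log(5 > maxval);   // false
    console.log(NaN > maxval); // false
    // so the `if(action_values.w[k] > maxval)` branch never fires
    // and policy() returns {action: 0, value: NaN}.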

action_values is defined here: var action_values = this.value_net.forward(svol); svol appears to have correct data from what I can tell, whereas this.value_net does not. Logging this.value_net will list several layers, and breaking them down will show a result like this:
[screenshot: the layers of this.value_net expanded in the console, showing Float64Array entries that are all NaN]

So as you can see, plenty of NaN values. This is where I'm stumped. I'm assuming the issue is layer-related, but I'm using the exact same layer setup as the rldemo, so I'm not really sure where to go from here. Here is how I'm setting the network up:

    const temporal_window = 1; // amount of temporal memory. 0 = agent lives in-the-moment.
    const network_size = inputNum*temporal_window + actionNum*temporal_window + inputNum;

    // the value function network computes a value of taking any of the possible actions
    // given an input state. Here we specify one explicitly the hard way
    // but user could also equivalently instead use opt.hidden_layer_sizes = [20,20]
    // to just insert simple relu hidden layers.
    let layer_defs = [];
    layer_defs.push({type:'input', out_sx:1, out_sy:1, out_depth:network_size});
    layer_defs.push({type:'fc', num_neurons: 50, activation:'relu'});
    layer_defs.push({type:'fc', num_neurons: 50, activation:'relu'});
    layer_defs.push({type:'regression', num_neurons:actionNum});

    // options for the Temporal Difference learner that trains the above net
    // by backpropping the temporal difference learning rule.
    let tdtrainer_options = {learning_rate:0.001, momentum:0.0, batch_size:64, l2_decay:0.01};

    //opt is the set of all configurable options related to the bot
    let opt = {}; //Array of the various options
    opt.temporal_window = temporal_window; //The amount of "temporal memory" the AI has, in terms of "time steps"
    opt.experience_size = 1500; //size of experience replay memory
    opt.start_learn_threshold = 25; //number of examples in experience replay memory before AI begins learning
    opt.gamma = 0.7; //Determines how much the AI plans ahead, on a scale of 0 to 1.
    opt.learning_steps_total = 50000; //Number of total steps to learn for
    opt.learning_steps_burnin = 25; //For the above number of steps, how many should be completely random at the beginning of the learning process?
    opt.epsilon_min = 0.03; //Epsilon determines the amount of randomness the AI will implement over time. Set to 0 for AI to only use learned experiences deep into the learning process
    opt.epsilon_test_time = 0.03; //what epsilon to use at test time? (i.e. when learning is disabled)
    opt.layer_defs = layer_defs;
    opt.tdtrainer_options = tdtrainer_options;
    brain = new deepqlearn.Brain(inputNum, actionNum, opt);

The other properties of the brain object, such as average_reward_window, all seem to be working properly, and I'm not getting any warnings or errors in the console. Whenever I call brain.backward(), it is with a proper reward value such as 50 or -50.

By "minimal code example" I was referring to this definition:
https://stackoverflow.com/help/minimal-reproducible-example

I'm not sure I can reproduce your problem based on the information you gave me.

Have you verified that the brain.backward function receives numeric input of type 'number' (not a Vol)?

commented

@NoamGaash Yes, I'm sure the function is receiving an input of type number. I have validated this by checking the type of the reward immediately before calling brain.backward, as well as checking the value in the backward function itself; both cases returned 'number'.
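
(Concretely, the check looks roughly like this:)

    console.log(typeof reward, reward); // prints "number 50" or "number -50"
    brain.backward(reward);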

Also, I apologize for misunderstanding your last request. You should be able to reproduce the issue with this code:

<html>
<head>

    <script src = "build/convnet.js"></script>
    <script src = "build/deepqlearn.js"></script>
    <script src = "build/util.js"></script>
    <script src = "build/vis.js"></script>
    <script type = "text/javascript">

    inputNum = 4;
    actionNum = 5;
    let brain;
    let gameInfo = [];
    let decision;
    let reward = 0;
    let opt = {}; //Array of the various options

    function initNet()
    {
        const temporal_window = 1; // amount of temporal memory. 0 = agent lives in-the-moment. 
        const network_size = inputNum*temporal_window + actionNum*temporal_window + inputNum;

        // the value function network computes a value of taking any of the possible actions
        // given an input state. Here we specify one explicitly the hard way
        // but user could also equivalently instead use opt.hidden_layer_sizes = [20,20]
        // to just insert simple relu hidden layers.
        let layer_defs = [];
        layer_defs.push({type:'input', out_sx:1, out_sy:1, out_depth:network_size});
        layer_defs.push({type:'fc', num_neurons: 50, activation:'relu'});
        layer_defs.push({type:'fc', num_neurons: 50, activation:'relu'});
        layer_defs.push({type:'regression', num_neurons:actionNum});

        // options for the Temporal Difference learner that trains the above net
        // by backpropping the temporal difference learning rule.
        let tdtrainer_options = {learning_rate:0.001, momentum:0.0, batch_size:64, l2_decay:0.01};

        //opt is the set of all configurable options related to the bot
        
        opt.temporal_window = temporal_window; //The amount of "temporal memory" the AI has, in terms of "time steps"
        opt.experience_size = 1500; //size of experience replay memory
        opt.start_learn_threshold = 25; //number of examples in experience replay memory before AI begins learning
        opt.gamma = 1; //Determines how much the AI plans ahead, on a scale of 0 to 1.
        opt.learning_steps_total = 50000; //Number of total steps to learn for
        opt.learning_steps_burnin = 25; //For the above number of steps, how many should be completely random at the beginning of the learning process?
        opt.epsilon_min = 0; //Epsilon determines the amount of randomness the AI will implement over time. Set to 0 for AI to only use learned experiences deep into the learning process
        opt.epsilon_test_time = 0; //what epsilon to use at test time? (i.e. when learning is disabled)
        opt.layer_defs = layer_defs;
        opt.tdtrainer_options = tdtrainer_options;
        brain = new deepqlearn.Brain(inputNum, actionNum, opt);
    }
    </script>

</head>

<body>

<script type = 'text/javascript'>
    initNet();
    function refreshBot() //Refreshes the AI with new information regarding the gamestate and applies rewards
    {
        gameInfo = [Math.random(), Math.random(), Math.random()];
        for(k = 0; k<500; k++) 
        {
            console.log('Currently on run '+(k+1));
            decision = brain.forward(gameInfo); // returns index of chosen action
            reward = decision === 0 ? 1.0 : 0.0;
            brain.backward(reward);
            gameInfo[Math.floor(Math.random()*3)] += Math.random()*2-0.5;
        }

        console.log(brain.average_loss_window); //Returns NaN values
    }
    refreshBot();
    
</script>
</body>
</html>

Using the example above I am receiving NaN values in brain.average_loss_window. The initNet() function is exactly the same as in my actual code, but to keep the example minimal I simplified the refreshBot function based on the example given at the bottom of this page, and I am still getting NaN values, so I'm sure none of that code is the issue. Let me know if I can provide any more info.

Thanks.

I noticed you set inputNum = 4, and gameInfo = [Math.random(), Math.random(), Math.random()].

change inputNum to 3, or add a fourth element to gameInfo.
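
In code, either of these would do (just a sketch):

    // option A: declare the state size that is actually used
    inputNum = 3;
    // option B: give gameInfo a fourth numeric entry
    gameInfo = [Math.random(), Math.random(), Math.random(), Math.random()];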

here is a working example with 4 inputs

BTW - good luck with your Tetris project! sounds promising.

commented

@NoamGaash The inputNum = 4 was an artifact from my actual code that I forgot to change to match the loop I used in the example code :/.

Nonetheless, it helped me figure out the actual problem. The original set of inputs I was passing to brain.forward included a couple of different arrays, so it looked something like gameInfo = [2d array, array, 2d array, number]. Putting an array into the set of inputs seems to break something in the forward function, so without changing the source code the solution is to pass only numbers as inputs.
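
One way to do that is to flatten the nested arrays into a single list of numbers before the forward call. A sketch of what I mean (the helper is just illustrative, not part of convnetjs):

    // Illustrative helper: collapse nested arrays into one flat list of numbers
    // before handing the state to brain.forward().
    function flattenState(state) {
        var out = [];
        (function walk(v) {
            if (Array.isArray(v)) { for (var i = 0; i < v.length; i++) { walk(v[i]); } }
            else { out.push(Number(v)); }
        })(state);
        return out;
    }
    // e.g. flattenState([[1, 2], [3], 4]) -> [1, 2, 3, 4]
    // decision = brain.forward(flattenState(gameInfo));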

I also found some other miscellaneous issues in my code that were my own fault rather than the library's. Those have been fixed and it appears to be working properly now.

I appreciate it, and thanks for all the help!