Scaling Node.js « Shahzad Bhatti

September 24, 2012

Scaling Node.js

Filed under: Javascript — admin @ 12:22 pm

As I described in my earlier blog, I have been testing Node.js and Websockets for streaming data to large number of clients. I am evaluating Node.js and WebSockets in both pub/sub model such as clients connecting to receive streaming quotes and request/reply model where clients are invoking an API and waiting for reply. Below are some of my experience with those technologies so far:

Pub/Sub Server

The server code uses cluster module to provide multiprocessing support and each process starts a Websocket server and listens for incoming connections. I used socket.io library for WebSockets. The server keeps track of all clients that have subscribed and then send random data every 10-100 milli-seconds.
Here is basic implementation of the server:

 var MAX_THREADS = process.env.MAX_THREADS || 10;
 var MAX_ROWS = process.env.MAX_ROWS || 2;
 var nextClientId = 0;
 //
 var helper = require('helper')
 
 var pubsubController = require('pubsub_controller')(MAX_ROWS)
 var cluster = require('cluster');
 
 if (cluster.isMaster) {
    for (var i = 0; i < MAX_THREADS; i++) {
       cluster.fork();
    }
    cluster.on('exit', function(worker, code, signal) {
       console.log('worker ' + worker.process.pid + ' died');
    });
 } else {
    var socketsByAddr = {};
    var io = require('socket.io').listen(8124);
    io.set('log level', 1);
    io.set('transports', ['websocket']);
    io.set('force new connection', true);
    io.enable('browser client gzip');
 
    io.sockets.on('connection', function(socket) {
       socket.address = socket.handshake.address.address + ':' + socket.handshake.address.port;
       socket.key = socket.handshake.address.address + ':' + socket.handshake.address.port + ':' + (++nextClientId);
       socketsByAddr[socket.address] = true;
       pubsubController.subscribe(socket);
       socket.on('disconnect', function() {
          pubsubController.unsubscribe(socket);
          delete socketsByAddr[socket.address];
       });
 
       socket.on('end', function() {
       });
    });
 
    setInterval(function() {
       pubsubController.pushUpdates();
    }, helper.random(100) + 10);
 }
 
 process.on('uncaughtException', function(err) {
   console.log("GLOBAL " + err + "\n" + err.stack);
 });

Pub/Sub Client

The pub/sub client subscribes for data and receies data every 10-100 milli-seconds. The client code builds up 100 more connections every 15 seconds and it logs the response time information. On client side, I used socket.io-client library. The client calls 'subscribe-execs' to subscribe and then receive messages under 'receive-execs'. Upon receiving message, the client tracks various metrics such as response time, CPU, memory usage, etc. As, I am running both client and server on the same machine, it gives accurate representation of latency without network hops or clocks mismatch. Another thing to note is that the client passes 'force new connection' flag to force new socket for each client as by default all clients share same socket.

 var OUTLIER_SIZE = process.env.OUTLIER_SIZE || 10;
 
 var hwUsage = require('hardware_usage');
 var metrics = require('metrics')({}, OUTLIER_SIZE);
 hwUsage.start();
       
 var client = require('pubsub_client')(metrics, hwUsage);
 var clients = [];
 
 // create connections for this client
 var buildClients = function() {
    for (var i=0; i<100; i++) {
       var client = client.newClient(i+0);
       clients.push(client);
    }
    hwUsage.stop(function(usage) {
       console.log('clients,connected,', hwUsage.heading() + ',' + metrics.heading());
       console.log(clients.length + ',' + client.totalConnected() + ',' + usage.toString() + ',' + metrics.summary().toString());
    });
 };
 
 
 setInterval(buildClients, 15000);
 
 
 process.on('uncaughtException', function(err) {
   console.log("GLOBAL " + err + "\n" + err.stack);
 });

The actual client is defined as a module, e.g.:

 var io = require('socket.io-client');
 var helper = require('helper');
 var totalConnected = 0;
 
 module.exports = function(metrics, hwUsage) {
    var ExecClient = function(id) {
       this.id = id;
       this.client = io.connect('localhost', { port: 8124,
                               transports: ['websocket'],
                               'force new connection': true,
                               'sync on disconnect' : true,
                               'connect timeout': 500,
                               'reconnect': true,
                               'reconnection delay': 500,
                               'reopen delay': 500,
                               'max reconnection attempts': 5});
    };
    
    // 
    ExecClient.prototype.setupConnectListener = function() {
       if (typeof this.client === "undefined") return;
       var that = this;
       this.client.on('connect_failed', function(error) {
       });
 
       this.client.on('connect', function() {
          totalConnected++;
       });
 
       this.client.on('disconnect', function() {
          totalConnected--;
       });
 
       this.client.on('reconnect_failed', function() {
          //process.exit(1);
       });
 
       this.client.on('end', function() {
          //process.exit(1);
       });
    };
 
    ExecClient.prototype.setupReceiveListener = function() {
       if (typeof this.client === "undefined") return;
       var that = this;
       this.client.on('receive-execs', function(msg) {
          console.log('receive');
          var metric = metrics.update(msg.address, msg.request, msg.timestamp, JSON.stringify(msg).length);
        });
    };
 
    return {
       totalConnected : function() {
          return totalConnected;
       },
       newClient: function(id) {
          var client = new ExecClient(id);
          client.setupConnectListener();
          client.setupReceiveListener();
          return client;
       }
    };
 }

Subscription management

Following module defines APIs for managing subscriptions and as I mentioned when number of messages received by client reaches maxMessages, it unsubscribes from the server.

 var helper = require('helper');
 var emitVolatile = typeof process.env.EMIT_VOLATILE !== "undefined" && process.env.EMIT_VOLATILE === 'true';
 
 module.exports = function(numExecs) {
    var execSubscribers = {};
    var counter = 0; 
    var generatePayload = function(address) {
       var execs = [];
       var date = new Date();
       for (var i=0; i<numExecs; i++) {
          execs.push({sequence:i, activityDateStr:date.toString(), com: helper.random(5),price: helper.random(100), accountId:1772139, symbol:"QQQ", transaction:"STO", description:"QQQ Stock", qty: helper.random(200), key: "QQQ:::S", netAmount: helper.random(1000)});
       }
       var payload = {request: counter, rows:numExecs, timestamp : date.getTime(), address:address, execs: execs};
       return payload;
    }
 
    return {
       subscribe: function(socket) {
          execSubscribers[socket.key] = socket;
       },
       unsubscribe: function(socket) {
          delete execSubscribers[socket.key];
       },
       count: function() {
          return helper.size(execSubscribers);
       },
       pushUpdates: function() {
          var count = 0;
          for (var addr in execSubscribers) {
             count++;
             try {
                var socket = execSubscribers[addr];
                //
                if (emitVolatile) {
                   socket.volatile.emit('receive-execs', generatePayload(addr));
                } else {
                   socket.emit('receive-execs', generatePayload(addr));
                }
                //
                if (typeof socket.numMessages === "undefined") {
                   socket.numMessages = 1;
                } else {
                   ++socket.numMessages;
                   //delete execSubscribers[socket.key];
                }
             } catch (err) {
                console.log('failed to send execution report' + err);
                try {
                   socket.disconnect();
                } catch (ex) {
                   console.log('failed to close socket ' + ex);
                }
                delete execSubscribers[socket.key];
             }
          }
          return count;
       }
    };
 }

Hardware Measurement

Following module defines API for measuring CPU time, load average, memory, etc:

 var fs = require('fs');
 var os = require('os');
 var helper = require('helper');
 
 var HardwareUsage = function() {
    this.startCpuTime = 0;
    this.startLoadAvg = os.loadavg();
    this.startMemory = process.memoryUsage();
    this.endCpuTime = 0;
    this.endLoadAvg = 0;
    this.endMemory = 0;
    this.started = new Date().getTime();
    var that = this;
    this.cpuUsage (function(totalCpu) {
       that.startCpuTime = totalCpu;
    });
 }
 
 HardwareUsage.prototype.cpuUsage = function(callback) {
    if (!fs.existsSync("/proc/" + process.pid + "/stat")) {
       callback();
       return;
    }
    var that = this;
 
    fs.readFile("/proc/" + process.pid + "/stat", function(err, data){
       var elems = data.toString().split(' ');
       var utime = parseInt(elems[13]);
       var stime = parseInt(elems[14]);
       var totalCpu = utime + stime;
       callback(totalCpu);
    });
 };
 
 
 HardwareUsage.prototype.percCpuChange = function() {
    return helper.percentChange(this.startCpuTime, this.endCpuTime);
 }
 
 HardwareUsage.prototype.percLoadChange = function() {
    return helper.percentChange(this.startLoadAvg[0], this.endLoadAvg[0]);
 }
 
 HardwareUsage.prototype.percMemoryChange = function() {
    return helper.percentChange(this.startMemory.heapTotal, this.endMemory.heapUsed);
 }
 
 
 HardwareUsage.prototype.toString = function() {
    return helper.round(this.endCpuTime) + ',' + helper.round(this.endLoadAvg[0]) + ',' + helper.round(this.startMemory.heapTotal/1024.0/1024.0) + ',' + helper.round(this.endMemory.heapUsed/1024.0/1024.0);
 };
 
 HardwareUsage.prototype.stop = function(callback) {
    var that = this;
    this.cpuUsage (function(totalCpu) {
       that.endCpuTime = totalCpu;
       that.endMemory = process.memoryUsage();
       that.endLoadAvg = os.loadavg();
       that.elapsed = new Date().getTime() - that.started;
       callback(that);
    });
 };
 
 var usage = new HardwareUsage();
 
 exports.heading = function() {
    return 'cpu time, load avg, total memory (M), used memory(M)';
 };
 exports.start = function() {
    usage = new HardwareUsage();
 };
 
 exports.stop = function(callback) {
    usage.stop(callback);
 };

Response time Measurement

Following module defines API for measuring response time:

 var helper = require('helper');
    
 module.exports = function(store, outlierSize) {
    var MetricResultSummary = function(metric) {
       this.averageResponseTime = helper.round(metric.totalTime / metric.totalRequests);
       this.averageHighResponseTime = helper.round(helper.sum(metric.topTenHigh)/ metric.topTenHigh.length);
       this.averageLowResponseTime = helper.round(helper.sum(metric.topTenLow)/ metric.topTenLow.length);
       this.averageSize = helper.round(metric.totalPayloadSize / metric.totalRequests);
       this.messagesReceived = metric.messagesReceived;
       this.count = 0;
    }
    MetricResultSummary.prototype.toString = function() {
       return this.count + ',' + this.averageResponseTime + ',' + this.averageHighResponseTime + ',' + this.averageLowResponseTime + ',' + this.averageSize + ',' + this.messagesReceived;
    };
 
    var Metric = function() {
       this.totalTime = 0;
       this.totalRequests = 0;
       this.topTenHigh = [];
       this.topTenLow = [];
       this.totalPayloadSize = 0;
       this.messagesReceived = 0;
    }
 
    Metric.prototype.addTo = function(target) {
       target.totalTime += this.totalTime;
       target.totalRequests += this.totalRequests;
       for (var i=0; i<this.topTenHigh.length; i++) {
          target.topTenHigh.push(this.topTenHigh[i]);
       }
       for (var i=0; i<this.topTenLow.length; i++) {
          target.topTenLow.push(this.topTenLow[i]);
       }
       target.totalPayloadSize += this.totalPayloadSize;
       target.messagesReceived += this.messagesReceived;
    };
 
    Metric.prototype.update = function(id, sendTime, payloadSize) {
       var elapsed = new Date().getTime() - sendTime;
       for (var i=0; i<outlierSize; i++) {
          if (typeof this.topTenHigh[i] === "undefined" || elapsed > this.topTenHigh[i]) {
             this.topTenHigh[i] = elapsed;
             break;
          }
       }
       //
       for (var i=0; i<outlierSize; i++) {
          if (typeof this.topTenLow[i] === "undefined" || elapsed < this.topTenLow[i]) {
             this.topTenLow[i] = elapsed;
             break;
          }
       }
       this.messagesReceived++;
       this.totalTime += elapsed;
       this.totalPayloadSize += payloadSize;
       this.totalRequests++;
    };
    //
    Metric.prototype.averageResponse = function() {
       return helper.round(this.totalTime / this.totalRequests);
    };
    Metric.prototype.summary = function() {
       return new MetricResultSummary(this);
    };
    Metric.prototype.toString = function() {
       return new MetricResultSummary(this).toString();
    };
 
    var get = function(key) {
          var metric = store[key];
          if (typeof metric === "undefined") {
             metric = new Metric();
             store[key] = metric;
          }
          return metric;
    };
    return {
       get: function(key) {
          return get(key);
       },
       update: function(key, id, sendTime, payloadSize) {
          var metric = get(key);
          metric.update(id, sendTime, payloadSize);
          return metric;
       },
       summaryFor: function(key) {
          var metric = get(key);
          return metric.summary();
       },
       heading: function() {
          return 'count, average response (ms), average outlier high response (ms), average outlier low response (ms), average message size, messages received';
       },
       summary: function() {
          var result = new Metric();
          var count = 0;
          for (var key in store) {
             var metric = store[key];
             metric.addTo(result);
             count++;
          }
          var summary = result.summary();
          summary.count = count;
          return summary;
       }
    };
 }

Request/Reply Server

The request/reply server also uses cluster module and listens for Websocket port on each process. Upon receiving order-request API, it sends a reply to the client. Here is the implementation of server:

 var OUTLIER_SIZE = process.env.OUTLIER_SIZE || 2;
 var MAX_THREADS = process.env.MAX_THREADS || 20;
 var helper = require('helper')
    
 var cluster = require('cluster');
 var hwUsage = require('hardware_usage');
 var metrics = require('metrics')({}, OUTLIER_SIZE);
 hwUsage.start();
 var socketsByAddr = {};
 var totalClients = 0;
 
 
 if (cluster.isMaster) {
    for (var i = 0; i < MAX_THREADS; i++) {
       cluster.fork();
    }  
    cluster.on('exit', function(worker, code, signal) {
       console.log('worker ' + worker.process.pid + ' died');
    });
 } else {
    //
    var io = require('socket.io').listen(8124);
    io.set('log level', 1); 
    io.set('transports', ['websocket']);
    io.set('force new connection', true);
    io.enable('browser client gzip');
    
    io.sockets.on('connection', function(socket) {
       totalClients++;
       socket.address = socket.handshake.address.address + ':' + socket.handshake.address.port;
       socketsByAddr[socket.address] = true;
       socket.on('order-request', function(request) {
          var response = {request: request.counter, timestamp : request.timestamp, address:request.address};
          metrics.update(request.address, request.request, request.timestamp, JSON.stringify(request).length);
          socket.emit('order-response', response);
       });
       
       // disconnected
       socket.on('disconnect', function() {
          totalClients--;
          delete socketsByAddr[socket.address];
       });
       
       socket.on('end', function() {
       });
    });
    });
 }
 
 
 setInterval(function() {
    hwUsage.stop(function(usage) {
      console.log('sockets, clients, ' + hwUsage.heading() + ',' + metrics.heading());
      console.log(helper.size(socketsByAddr) + ',' + totalClients + ',' + usage.toString() + ',' + metrics.summary().toString());
    });
 }, 15000);
 
 
 process.on('uncaughtException', function(err) {
   console.log("GLOBAL " + err + "\n" + err.stack);
 });

Request/Reply Client

The request/reply client is similar to pub/sub model except it initiates order-request API every 50-250 milli-seconds. It also creates new connections every 15 seconds and logs response time continuously.

 var OUTLIER_SIZE = process.env.OUTLIER_SIZE || 10;
 var INTERVAL_BETWEEN_NEW_CLIENTS_SECS = (process.env.INTERVAL_BETWEEN_NEW_CLIENTS_SECS || 15) * 1000;
 
 var hwUsage = require('hardware_usage');
 var metrics = require('metrics')({}, OUTLIER_SIZE);
 hwUsage.start();
 
 var order_client = require('order_client')(metrics, hwUsage);
 // create connections for this client
 var totalConnections = 0;
 var buildClients = function() {
    for (var i=0; i<100; i++) {
       var client = order_client.newClient();
    }
 };
 
 
 setInterval(buildClients, INTERVAL_BETWEEN_NEW_CLIENTS_SECS);
 
 setInterval(function() {
     order_client.log();
 }, INTERVAL_BETWEEN_NEW_CLIENTS_SECS);
 
 process.on('uncaughtException', function(err) {
   console.log("GLOBAL " + err + "\n" + err.stack);
   console.trace('stack trace');
   //process.exit(1);
 });

The actual client is defined as module, e.g.

 var io = require('socket.io-client');
 var helper = require('helper');
 var MAX_ROWS = process.env.MAX_ROWS || 10;
 
 var nextClientId = 0;
 var nextRequestId = 0;
 var totalConnected = 0;
 module.exports = function(metrics, hwUsage) {
    var OrderClient = function() {
       this.id = ++nextClientId;
       this.connected = false;
       this.client = io.connect('localhost', { port: 8124,
                               transports: ['websocket'],
                               'force new connection': true,
                               'sync on disconnect' : true,
                               'connect timeout': 500,
                               'reconnect': true,
                               'reconnection delay': 500,
                               'reopen delay': 500,
                               'max reconnection attempts': 5});
    };
 
    //
    OrderClient.prototype.setupConnectListener = function() {
       var that = this;
       this.client.on('connect_failed', function(error) {
          this.connected = false;
       });
    
       this.client.on('connect', function() {
          this.connected = true;
          totalConnected++;
       });
 
       this.client.on('disconnect', function() {
          this.connected = false;
          totalConnected--;
       });
    
       this.client.on('reconnect_failed', function() {
          this.connected = false;
       });
       
       this.client.on('end', function() {
          this.connected = false; 
       });
    };    
          
       
    OrderClient.prototype.sendRequest = function(max) {
       var execs = [];
       var date = new Date();
       for (var i=0; i<max; i++) {
          execs.push({sequence:i, activityDateStr:date.toString(), com: helper.random(5),price: helper.random(100), accountId:1772139, symbol:"QQQ", transaction:"STO", description:"QQQ Stock", qty: helper.random(200), key: "QQQ:::S", netAmount: helper.random(1000)});
       }
       var request = {request: ++nextRequestId, rows:max, timestamp : date.getTime(), address:this.id, execs: execs};
       this.client.emit('order-request', request);
    }; 
    OrderClient.prototype.setupResponseListener = function() {
       var that = this;
       this.client.on('order-response', function(response) {
          var metric = metrics.update(response.address, response.request, response.timestamp, JSON.stringify(response).length);
        });
    };
 
    return {
       log: function() {
          hwUsage.stop(function(usage) {
             console.log('clients,connected,' + hwUsage.heading() + ',' + metrics.heading());
             console.log(nextClientId + ',' + totalConnected + ',' + usage.toString() + ',' + metrics.summary().toString());
          });
       },
       newClient: function() {
          var client = new OrderClient();
          client.setupConnectListener();
          client.setupResponseListener();
          setInterval(function() {
             client.sendRequest(helper.random(MAX_ROWS));
          }, helper.random(250) + 50);
          return client;
       }
    };
 }

First blocker

I started testing on my Linux machine and saw RangeError: Maximum call stack size exceeded after server reached about 2000 connections. It turned out JSON library on socket.io went into recursion when it received messages while it's parsing existing message. I found a patch at Fix infinite recursion in Websocket parsers and manually applied it as I didn't see it on the latest version socket.io library 0.8.9. After applying it, I was able to make some progress.

Second blocker

After reaching about 10,000 connections I started seeing Cannot call method 'packet' of null coming from the socket.js. At the time, I was using default configuration for Websockets and apparently it was falling back to xhr-polling under heavy load (See open bug for it). I changed configuration to explicitly defined websocket protocol without any fallback, e.g.

 io.set('transports', ['websocket']);

I also made some changes to network settings on my Linux machine as recommended by Node.js w/1M concurrent connections!.

 net.core.rmem_max = 33554432
 net.core.wmem_max = 33554432
 net.ipv4.tcp_rmem = 4096 16384 33554432
 net.ipv4.tcp_wmem = 4096 16384 33554432
 net.ipv4.tcp_mem = 786432 1048576 26777216
 net.ipv4.tcp_max_tw_buckets = 360000
 net.core.netdev_max_backlog = 2500
 vm.min_free_kbytes = 65536
 vm.swappiness = 0
 net.ipv4.ip_local_port_range = 1024 65535

I then applied above changes using:

 sudo sysctl -w net.core.somaxconn=10000
 sudo sysctl -w net.ipv4.tcp_max_syn_backlog=10000 
 sudo sysctl -p

Third blocker

Above changes got me to about 17,000 connections but then I started seeing connection timeout on the client side and warn: client not handshaken client should reconnect on the server side. are two open bugs: first and second for it. I added reconnect parameters to client connection, e.g.

                               'reconnect': true,
                               'reconnection delay': 500,
                               'reopen delay': 500,
                               'max reconnection attempts': 5});

Fourth blocker

Under heavy load, I also saw "Error: not opened" coming from WebSocket library, e.g.

     at WebSocket.send (/alldata/home/sbhatti/prototype-nodejs/node_modules/socket.io-client/node_modules/ws/lib/WebSocket.js:175:16)
     at Transport.WS.send (/alldata/home/sbhatti/prototype-nodejs/node_modules/socket.io-client/lib/transports/websocket.js:107:22)
     at Transport.packet (/alldata/home/sbhatti/prototype-nodejs/node_modules/socket.io-client/lib/transport.js:178:10)
     at Transport.WS.payload (/alldata/home/sbhatti/prototype-nodejs/node_modules/socket.io-client/lib/transports/websocket.js:120:12)
     at Socket.flushBuffer (/alldata/home/sbhatti/prototype-nodejs/node_modules/socket.io-client/lib/socket.js:327:20)
     at Socket.setBuffer (/alldata/home/sbhatti/prototype-nodejs/node_modules/socket.io-client/lib/socket.js:314:14)
     at Socket.onConnect (/alldata/home/sbhatti/prototype-nodejs/node_modules/socket.io-client/lib/socket.js:409:14)
     at Transport.onConnect (/alldata/home/sbhatti/prototype-nodejs/node_modules/socket.io-client/lib/transport.js:139:17)
     at Transport.onPacket (/alldata/home/sbhatti/prototype-nodejs/node_modules/socket.io-client/lib/transport.js:91:12)
     at Transport.onData (/alldata/home/sbhatti/prototype-nodejs/node_modules/socket.io-client/lib/transport.js:69:16)

I chose to ignore it as Websocket is attempting to send data when connection is closed and client will just reconnect.

Summary

I am still evaluating Node.js and I have not decided if I would recommend Node.js for streaming solution. I will be comparing these results against some Java solutions such as Atomosphere and Vertx.io and will post those results in future.

Shahzad Bhatti Welcome to my ramblings and rants!