Skip to content

Commit 8a534bb

Browse files
committed
fix(sdam): don't lose servers when they fail monitoring
For legacy reasons the unified topology forced the connection pool into auto reconnect mode by default. This caused failed server checks to continue to emit errors on the server, causing the server to lose track of its monitoring state and never return the node to the pool of selectable servers. This manifests client-side as an error about server selection timing out. NODE-2274
1 parent 818055a commit 8a534bb

File tree

3 files changed

+67
-44
lines changed

3 files changed

+67
-44
lines changed

lib/core/connection/pool.js

+2-9
Original file line numberDiff line numberDiff line change
@@ -636,6 +636,7 @@ function destroy(self, connections, options, callback) {
636636
*/
637637
Pool.prototype.destroy = function(force, callback) {
638638
var self = this;
639+
639640
// Do not try again if the pool is already dead
640641
if (this.state === DESTROYED || self.state === DESTROYING) {
641642
if (typeof callback === 'function') callback(null, null);
@@ -958,15 +959,6 @@ function createConnection(pool, callback) {
958959
pool.logger.debug(`connection attempt failed with error [${JSON.stringify(err)}]`);
959960
}
960961

961-
if (pool.options.legacyCompatMode === false) {
962-
// The unified topology uses the reported `error` from a pool to track what error
963-
// reason is returned to the user during selection timeout. We only want to emit
964-
// this if the pool is active because the listeners are removed on destruction.
965-
if (pool.state !== DESTROYED && pool.state !== DESTROYING) {
966-
pool.emit('error', err);
967-
}
968-
}
969-
970962
// check if reconnect is enabled, and attempt retry if so
971963
if (!pool.reconnectId && pool.options.reconnect) {
972964
if (pool.state === CONNECTING && pool.options.legacyCompatMode) {
@@ -1044,6 +1036,7 @@ function _execute(self) {
10441036
// operations
10451037
if (self.connectingConnections > 0) {
10461038
self.executing = false;
1039+
setTimeout(() => _execute(self)(), 10);
10471040
return;
10481041
}
10491042

lib/core/sdam/server.js

+10-24
Original file line numberDiff line numberDiff line change
@@ -148,16 +148,13 @@ class Server extends EventEmitter {
148148
{ bson: this.s.bson }
149149
);
150150

151-
// NOTE: this should only be the case if we are connecting to a single server
152-
poolOptions.reconnect = true;
151+
// NOTE: reconnect is explicitly false because of the server selection loop
152+
poolOptions.reconnect = false;
153153
poolOptions.legacyCompatMode = false;
154154

155155
this.s.pool = new Pool(this, poolOptions);
156156

157157
// setup listeners
158-
this.s.pool.on('connect', connectEventHandler(this));
159-
this.s.pool.on('close', errorEventHandler(this));
160-
this.s.pool.on('error', errorEventHandler(this));
161158
this.s.pool.on('parseError', parseErrorEventHandler(this));
162159

163160
// it is unclear whether consumers should even know about these events
@@ -169,14 +166,7 @@ class Server extends EventEmitter {
169166
relayEvents(this.s.pool, this, ['commandStarted', 'commandSucceeded', 'commandFailed']);
170167

171168
stateTransition(this, STATE_CONNECTING);
172-
173-
// If auth settings have been provided, use them
174-
if (options.auth) {
175-
this.s.pool.connect.apply(this.s.pool, options.auth);
176-
return;
177-
}
178-
179-
this.s.pool.connect();
169+
this.s.pool.connect(connectEventHandler(this));
180170
}
181171

182172
/**
@@ -474,7 +464,13 @@ function executeWriteOperation(args, options, callback) {
474464
}
475465

476466
function connectEventHandler(server) {
477-
return function(pool, conn) {
467+
return function(err, conn) {
468+
if (err) {
469+
server.emit('error', new MongoNetworkError(err));
470+
server.emit('close');
471+
return;
472+
}
473+
478474
const ismaster = conn.ismaster;
479475
server.s.lastIsMasterMS = conn.lastIsMasterMS;
480476
if (conn.agreedCompressor) {
@@ -506,16 +502,6 @@ function connectEventHandler(server) {
506502
};
507503
}
508504

509-
function errorEventHandler(server) {
510-
return function(err) {
511-
if (err) {
512-
server.emit('error', new MongoNetworkError(err));
513-
}
514-
515-
server.emit('close');
516-
};
517-
}
518-
519505
function parseErrorEventHandler(server) {
520506
return function(err) {
521507
stateTransition(this, STATE_CLOSED);

lib/core/sdam/topology.js

+55-11
Original file line numberDiff line numberDiff line change
@@ -894,15 +894,6 @@ function selectServers(topology, selector, timeout, start, callback) {
894894
topology.s.monitorTimers.push(timer);
895895
});
896896

897-
const descriptionChangedHandler = () => {
898-
// successful iteration, clear the check timer
899-
clearTimeout(iterationTimer);
900-
topology.s.iterationTimers.splice(timerIndex, 1);
901-
902-
// topology description has changed due to monitoring, reattempt server selection
903-
selectServers(topology, selector, timeout, start, callback);
904-
};
905-
906897
const iterationTimer = setTimeout(() => {
907898
topology.removeListener('topologyDescriptionChanged', descriptionChangedHandler);
908899
callback(
@@ -913,16 +904,25 @@ function selectServers(topology, selector, timeout, start, callback) {
913904
);
914905
}, timeout - duration);
915906

907+
const descriptionChangedHandler = () => {
908+
// successful iteration, clear the check timer
909+
removeTimerFrom(iterationTimer, topology.s.iterationTimers);
910+
clearTimeout(iterationTimer);
911+
912+
// topology description has changed due to monitoring, reattempt server selection
913+
selectServers(topology, selector, timeout, start, callback);
914+
};
915+
916916
// track this timer in case we need to clean it up outside this loop
917-
const timerIndex = topology.s.iterationTimers.push(iterationTimer);
917+
topology.s.iterationTimers.push(iterationTimer);
918918

919919
topology.once('topologyDescriptionChanged', descriptionChangedHandler);
920920
};
921921

922922
retrySelection();
923923
}
924924

925-
function createAndConnectServer(topology, serverDescription) {
925+
function createAndConnectServer(topology, serverDescription, connectDelay) {
926926
topology.emit(
927927
'serverOpening',
928928
new monitoring.ServerOpeningEvent(topology.s.id, serverDescription.address)
@@ -934,10 +934,45 @@ function createAndConnectServer(topology, serverDescription) {
934934
server.once('connect', serverConnectEventHandler(server, topology));
935935
server.on('descriptionReceived', topology.serverUpdateHandler.bind(topology));
936936
server.on('error', serverErrorEventHandler(server, topology));
937+
938+
if (connectDelay) {
939+
const connectTimer = setTimeout(() => {
940+
removeTimerFrom(connectTimer, topology.s.iterationTimers);
941+
server.connect();
942+
}, connectDelay);
943+
944+
topology.s.iterationTimers.push(connectTimer);
945+
return server;
946+
}
947+
937948
server.connect();
938949
return server;
939950
}
940951

952+
function resetServer(topology, serverDescription) {
953+
if (!topology.s.servers.has(serverDescription.address)) {
954+
return;
955+
}
956+
957+
// first remove the old server
958+
const server = topology.s.servers.get(serverDescription.address);
959+
destroyServer(server, topology);
960+
961+
// add the new server, and attempt connection after a delay
962+
const newServer = createAndConnectServer(
963+
topology,
964+
serverDescription,
965+
topology.s.heartbeatFrequencyMS
966+
);
967+
968+
topology.s.servers.set(serverDescription.address, newServer);
969+
}
970+
971+
function removeTimerFrom(timer, timers) {
972+
const idx = timers.findIndex(t => t === timer);
973+
timers.splice(idx, 1);
974+
}
975+
941976
/**
942977
* Create `Server` instances for all initially known servers, connect them, and assign
943978
* them to the passed in `Topology`.
@@ -954,6 +989,15 @@ function connectServers(topology, serverDescriptions) {
954989
}
955990

956991
function updateServers(topology, incomingServerDescription) {
992+
// if the server was reset internally because of an error, we need to replace the
993+
// `Server` instance for it so we can attempt reconnect.
994+
//
995+
// TODO: this logic can change once CMAP is put in place
996+
if (incomingServerDescription && incomingServerDescription.error) {
997+
resetServer(topology, incomingServerDescription);
998+
return;
999+
}
1000+
9571001
// update the internal server's description
9581002
if (incomingServerDescription && topology.s.servers.has(incomingServerDescription.address)) {
9591003
const server = topology.s.servers.get(incomingServerDescription.address);

0 commit comments

Comments (0)