MCP Server Logging and Observability
Complete guide to implementing logging and observability in MCP servers, covering structured logging, request tracing, metrics collection, error tracking, connection monitoring, performance profiling, and building observable MCP server infrastructure.
MCP Server Logging and Observability
Overview
An MCP server without observability is a black box. When a tool invocation takes 10 seconds instead of the usual 200 milliseconds, you need to know why — was it a slow database query, a downstream API timeout, or memory pressure on the host? I have debugged production MCP outages where the only evidence was a spike in client-side timeouts because the server had zero instrumentation. Structured logging, distributed tracing, and metrics collection turn that black box into a glass box.
Prerequisites
- Node.js 16 or later
@modelcontextprotocol/sdkpackage installedwinstonfor structured logging (npm install winston)prom-clientfor Prometheus metrics (npm install prom-client)- Basic understanding of MCP server architecture
- Familiarity with logging levels and structured data concepts
Structured Logging
Logger Configuration
Console.log in production is like shouting into a void. Structured logging gives every log entry a consistent format that machines can parse and humans can read.
// logger.js
var winston = require("winston");
var logFormat = process.env.NODE_ENV === "production"
? winston.format.combine(
winston.format.timestamp(),
winston.format.errors({ stack: true }),
winston.format.json()
)
: winston.format.combine(
winston.format.timestamp({ format: "HH:mm:ss" }),
winston.format.colorize(),
winston.format.printf(function(info) {
var msg = info.timestamp + " [" + info.level + "] " + info.message;
if (info.connectionId) msg += " conn=" + info.connectionId;
if (info.tool) msg += " tool=" + info.tool;
if (info.duration) msg += " duration=" + info.duration + "ms";
if (info.stack) msg += "\n" + info.stack;
return msg;
})
);
var logger = winston.createLogger({
level: process.env.LOG_LEVEL || "info",
format: logFormat,
defaultMeta: {
service: "mcp-server",
version: process.env.APP_VERSION || "1.0.0",
pid: process.pid
},
transports: [
new winston.transports.Console()
]
});
// Add file transport in production
if (process.env.NODE_ENV === "production") {
logger.add(new winston.transports.File({
filename: "/var/log/mcp-server/error.log",
level: "error",
maxsize: 50 * 1024 * 1024, // 50MB
maxFiles: 5,
tailable: true
}));
logger.add(new winston.transports.File({
filename: "/var/log/mcp-server/combined.log",
maxsize: 100 * 1024 * 1024, // 100MB
maxFiles: 10,
tailable: true
}));
}
module.exports = logger;
Request-Scoped Logging
Every log entry from a tool invocation should carry the connection ID and request ID so you can trace a single request through the entire server.
// request-logger.js
var crypto = require("crypto");
var logger = require("./logger");
function createRequestLogger(connectionId) {
var requestId = crypto.randomBytes(8).toString("hex");
return {
connectionId: connectionId,
requestId: requestId,
info: function(message, meta) {
logger.info(message, Object.assign({
connectionId: connectionId,
requestId: requestId
}, meta || {}));
},
warn: function(message, meta) {
logger.warn(message, Object.assign({
connectionId: connectionId,
requestId: requestId
}, meta || {}));
},
error: function(message, meta) {
logger.error(message, Object.assign({
connectionId: connectionId,
requestId: requestId
}, meta || {}));
},
debug: function(message, meta) {
logger.debug(message, Object.assign({
connectionId: connectionId,
requestId: requestId
}, meta || {}));
},
child: function(extraMeta) {
var self = this;
return {
connectionId: connectionId,
requestId: requestId,
info: function(msg, meta) {
self.info(msg, Object.assign({}, extraMeta, meta || {}));
},
warn: function(msg, meta) {
self.warn(msg, Object.assign({}, extraMeta, meta || {}));
},
error: function(msg, meta) {
self.error(msg, Object.assign({}, extraMeta, meta || {}));
},
debug: function(msg, meta) {
self.debug(msg, Object.assign({}, extraMeta, meta || {}));
}
};
}
};
}
module.exports = createRequestLogger;
Tool Invocation Logging
// tool-logger-middleware.js
var logger = require("./logger");
function createToolLogger(mcpServer) {
var originalTool = mcpServer.tool.bind(mcpServer);
mcpServer.tool = function(name, description, schema, handler) {
var wrappedHandler = function(params, context) {
var startTime = Date.now();
var connectionId = context && context.connectionId || "unknown";
logger.info("Tool invocation started", {
tool: name,
connectionId: connectionId,
params: sanitizeParams(params)
});
try {
var result = handler(params, context);
// Handle both sync and async results
if (result && typeof result.then === "function") {
return result.then(function(res) {
var duration = Date.now() - startTime;
logger.info("Tool invocation completed", {
tool: name,
connectionId: connectionId,
duration: duration,
resultType: res.content ? res.content[0].type : "unknown"
});
return res;
}).catch(function(err) {
var duration = Date.now() - startTime;
logger.error("Tool invocation failed", {
tool: name,
connectionId: connectionId,
duration: duration,
error: err.message,
stack: err.stack
});
throw err;
});
}
var duration = Date.now() - startTime;
logger.info("Tool invocation completed", {
tool: name,
connectionId: connectionId,
duration: duration
});
return result;
} catch (err) {
var duration = Date.now() - startTime;
logger.error("Tool invocation failed", {
tool: name,
connectionId: connectionId,
duration: duration,
error: err.message,
stack: err.stack
});
throw err;
}
};
return originalTool(name, description, schema, wrappedHandler);
};
return mcpServer;
}
function sanitizeParams(params) {
var sanitized = {};
var sensitiveKeys = ["password", "secret", "token", "key", "authorization"];
Object.keys(params || {}).forEach(function(key) {
if (sensitiveKeys.indexOf(key.toLowerCase()) > -1) {
sanitized[key] = "[REDACTED]";
} else if (typeof params[key] === "string" && params[key].length > 200) {
sanitized[key] = params[key].substring(0, 200) + "...[truncated]";
} else {
sanitized[key] = params[key];
}
});
return sanitized;
}
module.exports = createToolLogger;
Production JSON log output:
{"level":"info","message":"Tool invocation started","service":"mcp-server","version":"1.2.0","pid":1234,"timestamp":"2026-02-10T10:30:15.123Z","tool":"query_database","connectionId":"abc123","params":{"sql":"SELECT * FROM users WHERE active = true","password":"[REDACTED]"}}
{"level":"info","message":"Tool invocation completed","service":"mcp-server","version":"1.2.0","pid":1234,"timestamp":"2026-02-10T10:30:15.456Z","tool":"query_database","connectionId":"abc123","duration":333,"resultType":"text"}
Metrics Collection
Prometheus Metrics
// metrics.js
var promClient = require("prom-client");
// Create a registry
var registry = new promClient.Registry();
// Default Node.js metrics (memory, CPU, event loop)
promClient.collectDefaultMetrics({ register: registry });
// Custom MCP metrics
var toolInvocations = new promClient.Counter({
name: "mcp_tool_invocations_total",
help: "Total number of tool invocations",
labelNames: ["tool", "status"],
registers: [registry]
});
var toolDuration = new promClient.Histogram({
name: "mcp_tool_duration_seconds",
help: "Tool invocation duration in seconds",
labelNames: ["tool"],
buckets: [0.01, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10],
registers: [registry]
});
var activeConnections = new promClient.Gauge({
name: "mcp_active_connections",
help: "Number of active SSE connections",
registers: [registry]
});
var connectionDuration = new promClient.Histogram({
name: "mcp_connection_duration_seconds",
help: "Connection duration in seconds",
buckets: [1, 10, 60, 300, 900, 1800, 3600],
registers: [registry]
});
var toolErrors = new promClient.Counter({
name: "mcp_tool_errors_total",
help: "Total number of tool invocation errors",
labelNames: ["tool", "error_type"],
registers: [registry]
});
var messageSize = new promClient.Histogram({
name: "mcp_message_size_bytes",
help: "Size of MCP messages in bytes",
labelNames: ["direction", "type"],
buckets: [100, 500, 1000, 5000, 10000, 50000, 100000],
registers: [registry]
});
var resourceAccess = new promClient.Counter({
name: "mcp_resource_access_total",
help: "Total resource access count",
labelNames: ["resource_uri"],
registers: [registry]
});
module.exports = {
registry: registry,
toolInvocations: toolInvocations,
toolDuration: toolDuration,
activeConnections: activeConnections,
connectionDuration: connectionDuration,
toolErrors: toolErrors,
messageSize: messageSize,
resourceAccess: resourceAccess,
// Convenience method for tool timing
timeToolInvocation: function(toolName, fn) {
var end = toolDuration.startTimer({ tool: toolName });
toolInvocations.inc({ tool: toolName, status: "started" });
try {
var result = fn();
if (result && typeof result.then === "function") {
return result.then(function(res) {
end();
toolInvocations.inc({ tool: toolName, status: "success" });
return res;
}).catch(function(err) {
end();
toolInvocations.inc({ tool: toolName, status: "error" });
toolErrors.inc({ tool: toolName, error_type: err.constructor.name });
throw err;
});
}
end();
toolInvocations.inc({ tool: toolName, status: "success" });
return result;
} catch (err) {
end();
toolInvocations.inc({ tool: toolName, status: "error" });
toolErrors.inc({ tool: toolName, error_type: err.constructor.name });
throw err;
}
}
};
Metrics Endpoint
// metrics-endpoint.js
var metrics = require("./metrics");
function registerMetricsEndpoint(app) {
app.get("/metrics", function(req, res) {
metrics.registry.metrics().then(function(data) {
res.set("Content-Type", metrics.registry.contentType);
res.send(data);
}).catch(function(err) {
res.status(500).send("Error collecting metrics: " + err.message);
});
});
}
module.exports = registerMetricsEndpoint;
Prometheus scrape output:
# HELP mcp_tool_invocations_total Total number of tool invocations
# TYPE mcp_tool_invocations_total counter
mcp_tool_invocations_total{tool="query_database",status="success"} 1542
mcp_tool_invocations_total{tool="query_database",status="error"} 23
mcp_tool_invocations_total{tool="read_file",status="success"} 8921
# HELP mcp_tool_duration_seconds Tool invocation duration in seconds
# TYPE mcp_tool_duration_seconds histogram
mcp_tool_duration_seconds_bucket{tool="query_database",le="0.1"} 890
mcp_tool_duration_seconds_bucket{tool="query_database",le="0.25"} 1350
mcp_tool_duration_seconds_bucket{tool="query_database",le="0.5"} 1500
mcp_tool_duration_seconds_bucket{tool="query_database",le="1"} 1535
mcp_tool_duration_seconds_bucket{tool="query_database",le="+Inf"} 1542
mcp_tool_duration_seconds_sum{tool="query_database"} 287.45
mcp_tool_duration_seconds_count{tool="query_database"} 1542
# HELP mcp_active_connections Number of active SSE connections
# TYPE mcp_active_connections gauge
mcp_active_connections 12
Connection Monitoring
Connection Lifecycle Tracking
// connection-tracker.js
var EventEmitter = require("events");
var logger = require("./logger");
var metrics = require("./metrics");
function ConnectionTracker() {
this.connections = new Map();
this.events = new EventEmitter();
}
ConnectionTracker.prototype.add = function(connectionId, metadata) {
var entry = {
id: connectionId,
connectedAt: new Date(),
lastActivity: new Date(),
toolInvocations: 0,
bytesReceived: 0,
bytesSent: 0,
errors: 0,
remoteAddress: metadata.remoteAddress || "unknown",
userAgent: metadata.userAgent || "unknown"
};
this.connections.set(connectionId, entry);
metrics.activeConnections.inc();
logger.info("Connection opened", {
connectionId: connectionId,
remoteAddress: entry.remoteAddress,
totalConnections: this.connections.size
});
this.events.emit("connection:open", entry);
return entry;
};
ConnectionTracker.prototype.remove = function(connectionId) {
var entry = this.connections.get(connectionId);
if (!entry) return;
var duration = (Date.now() - entry.connectedAt.getTime()) / 1000;
metrics.activeConnections.dec();
metrics.connectionDuration.observe(duration);
logger.info("Connection closed", {
connectionId: connectionId,
duration: Math.round(duration) + "s",
toolInvocations: entry.toolInvocations,
bytesReceived: entry.bytesReceived,
bytesSent: entry.bytesSent,
errors: entry.errors
});
this.connections.delete(connectionId);
this.events.emit("connection:close", entry);
};
ConnectionTracker.prototype.recordActivity = function(connectionId, type, bytes) {
var entry = this.connections.get(connectionId);
if (!entry) return;
entry.lastActivity = new Date();
if (type === "tool_invocation") entry.toolInvocations++;
if (type === "received") entry.bytesReceived += (bytes || 0);
if (type === "sent") entry.bytesSent += (bytes || 0);
if (type === "error") entry.errors++;
};
ConnectionTracker.prototype.getSnapshot = function() {
var now = Date.now();
var snapshot = [];
this.connections.forEach(function(entry) {
snapshot.push({
id: entry.id,
connectedAt: entry.connectedAt.toISOString(),
durationSeconds: Math.round((now - entry.connectedAt.getTime()) / 1000),
lastActivityAgo: Math.round((now - entry.lastActivity.getTime()) / 1000) + "s",
toolInvocations: entry.toolInvocations,
remoteAddress: entry.remoteAddress,
errors: entry.errors
});
});
return {
total: snapshot.length,
connections: snapshot
};
};
ConnectionTracker.prototype.getStaleConnections = function(maxIdleSeconds) {
var now = Date.now();
var stale = [];
this.connections.forEach(function(entry) {
var idle = (now - entry.lastActivity.getTime()) / 1000;
if (idle > maxIdleSeconds) {
stale.push(entry);
}
});
return stale;
};
module.exports = ConnectionTracker;
Stale Connection Cleanup
// connection-reaper.js
var logger = require("./logger");
function startConnectionReaper(tracker, options) {
var maxIdleSeconds = options.maxIdleSeconds || 3600;
var checkInterval = options.checkInterval || 60000;
var interval = setInterval(function() {
var stale = tracker.getStaleConnections(maxIdleSeconds);
if (stale.length > 0) {
logger.warn("Found stale connections", {
count: stale.length,
maxIdleSeconds: maxIdleSeconds
});
stale.forEach(function(entry) {
var idle = Math.round((Date.now() - entry.lastActivity.getTime()) / 1000);
logger.info("Closing stale connection", {
connectionId: entry.id,
idleSeconds: idle,
remoteAddress: entry.remoteAddress
});
// Close via transport if available
if (entry.transport && typeof entry.transport.close === "function") {
entry.transport.close();
}
tracker.remove(entry.id);
});
}
}, checkInterval);
return {
stop: function() { clearInterval(interval); }
};
}
module.exports = startConnectionReaper;
Request Tracing
Trace Context Propagation
When an MCP tool calls an external API or database, you need to correlate the entire chain of operations back to the original tool invocation.
// tracing.js
var crypto = require("crypto");
function TraceContext(traceId, spanId, parentSpanId) {
this.traceId = traceId || crypto.randomBytes(16).toString("hex");
this.spanId = spanId || crypto.randomBytes(8).toString("hex");
this.parentSpanId = parentSpanId || null;
this.startTime = Date.now();
this.spans = [];
}
TraceContext.prototype.createChildSpan = function(operation) {
var child = {
spanId: crypto.randomBytes(8).toString("hex"),
parentSpanId: this.spanId,
operation: operation,
startTime: Date.now(),
endTime: null,
duration: null,
attributes: {},
status: "started"
};
this.spans.push(child);
return child;
};
TraceContext.prototype.endSpan = function(span, status) {
span.endTime = Date.now();
span.duration = span.endTime - span.startTime;
span.status = status || "ok";
};
TraceContext.prototype.toJSON = function() {
return {
traceId: this.traceId,
spanId: this.spanId,
parentSpanId: this.parentSpanId,
totalDuration: Date.now() - this.startTime,
spans: this.spans.map(function(s) {
return {
spanId: s.spanId,
operation: s.operation,
duration: s.duration,
status: s.status,
attributes: s.attributes
};
})
};
};
// Middleware factory for tracing tool invocations
function tracedTool(toolName, handler) {
return function(params, context) {
var trace = new TraceContext();
var mainSpan = trace.createChildSpan("tool:" + toolName);
// Inject trace into context for downstream use
var tracedContext = Object.assign({}, context, { trace: trace });
try {
var result = handler(params, tracedContext);
if (result && typeof result.then === "function") {
return result.then(function(res) {
trace.endSpan(mainSpan, "ok");
logTrace(toolName, trace);
return res;
}).catch(function(err) {
trace.endSpan(mainSpan, "error");
mainSpan.attributes.error = err.message;
logTrace(toolName, trace);
throw err;
});
}
trace.endSpan(mainSpan, "ok");
logTrace(toolName, trace);
return result;
} catch (err) {
trace.endSpan(mainSpan, "error");
mainSpan.attributes.error = err.message;
logTrace(toolName, trace);
throw err;
}
};
}
function logTrace(toolName, trace) {
var data = trace.toJSON();
var logger = require("./logger");
logger.debug("Trace completed", {
tool: toolName,
traceId: data.traceId,
totalDuration: data.totalDuration,
spanCount: data.spans.length,
spans: data.spans
});
}
// Traced database query helper
function tracedQuery(pool, sql, params, trace) {
var span = trace.createChildSpan("db:query");
span.attributes.sql = sql.substring(0, 100);
return pool.query(sql, params).then(function(result) {
trace.endSpan(span, "ok");
span.attributes.rowCount = result.rows.length;
return result;
}).catch(function(err) {
trace.endSpan(span, "error");
span.attributes.error = err.message;
throw err;
});
}
// Traced HTTP request helper
function tracedFetch(url, options, trace) {
var span = trace.createChildSpan("http:" + (options.method || "GET"));
span.attributes.url = url;
var http = require("http");
var https = require("https");
var mod = url.startsWith("https") ? https : http;
return new Promise(function(resolve, reject) {
var req = mod.request(url, options, function(res) {
var body = "";
res.on("data", function(chunk) { body += chunk; });
res.on("end", function() {
trace.endSpan(span, res.statusCode < 400 ? "ok" : "error");
span.attributes.statusCode = res.statusCode;
span.attributes.responseSize = body.length;
resolve({ statusCode: res.statusCode, body: body });
});
});
req.on("error", function(err) {
trace.endSpan(span, "error");
span.attributes.error = err.message;
reject(err);
});
if (options.body) req.write(options.body);
req.end();
});
}
module.exports = {
TraceContext: TraceContext,
tracedTool: tracedTool,
tracedQuery: tracedQuery,
tracedFetch: tracedFetch
};
Trace output:
{
"level": "debug",
"message": "Trace completed",
"tool": "query_database",
"traceId": "a1b2c3d4e5f6a7b8c9d0e1f2a3b4c5d6",
"totalDuration": 342,
"spanCount": 3,
"spans": [
{ "spanId": "sp1", "operation": "tool:query_database", "duration": 340, "status": "ok" },
{ "spanId": "sp2", "operation": "db:query", "duration": 125, "status": "ok", "attributes": { "sql": "SELECT * FROM users WHERE active = true", "rowCount": 42 } },
{ "spanId": "sp3", "operation": "db:query", "duration": 89, "status": "ok", "attributes": { "sql": "SELECT COUNT(*) FROM orders WHERE user_id = ANY($1)", "rowCount": 1 } }
]
}
Error Tracking
Error Classification and Aggregation
// error-tracker.js
var logger = require("./logger");
function ErrorTracker() {
this.errors = [];
this.errorCounts = {};
this.maxErrors = 1000;
}
ErrorTracker.prototype.track = function(error, context) {
var errorKey = error.constructor.name + ":" + (error.code || error.message.substring(0, 50));
if (!this.errorCounts[errorKey]) {
this.errorCounts[errorKey] = { count: 0, firstSeen: new Date(), lastSeen: null, sample: null };
}
this.errorCounts[errorKey].count++;
this.errorCounts[errorKey].lastSeen = new Date();
this.errorCounts[errorKey].sample = {
message: error.message,
stack: error.stack,
code: error.code,
context: context
};
// Keep a rolling window of recent errors
this.errors.push({
timestamp: new Date(),
type: error.constructor.name,
message: error.message,
code: error.code,
tool: context.tool || "unknown",
connectionId: context.connectionId || "unknown"
});
if (this.errors.length > this.maxErrors) {
this.errors = this.errors.slice(-this.maxErrors);
}
// Log at appropriate level based on error type
if (error.code === "ECONNREFUSED" || error.code === "ETIMEDOUT") {
logger.error("Dependency connection failure", {
errorType: error.constructor.name,
errorCode: error.code,
tool: context.tool,
connectionId: context.connectionId,
occurrences: this.errorCounts[errorKey].count
});
} else {
logger.error("Tool error", {
errorType: error.constructor.name,
message: error.message,
tool: context.tool,
connectionId: context.connectionId,
stack: error.stack
});
}
};
ErrorTracker.prototype.getReport = function() {
var counts = this.errorCounts;
var sorted = Object.keys(counts).sort(function(a, b) {
return counts[b].count - counts[a].count;
});
return {
totalUniqueErrors: sorted.length,
recentErrors: this.errors.slice(-20),
topErrors: sorted.slice(0, 10).map(function(key) {
return {
key: key,
count: counts[key].count,
firstSeen: counts[key].firstSeen,
lastSeen: counts[key].lastSeen,
sampleMessage: counts[key].sample.message
};
})
};
};
ErrorTracker.prototype.getErrorRate = function(windowSeconds) {
var cutoff = Date.now() - (windowSeconds * 1000);
var recent = this.errors.filter(function(e) {
return e.timestamp.getTime() > cutoff;
});
return {
count: recent.length,
windowSeconds: windowSeconds,
rate: (recent.length / windowSeconds).toFixed(3) + "/s"
};
};
module.exports = ErrorTracker;
Unhandled Error Boundaries
// error-boundaries.js
var logger = require("./logger");
function setupErrorBoundaries(errorTracker) {
// Catch unhandled promise rejections
process.on("unhandledRejection", function(reason, promise) {
var error = reason instanceof Error ? reason : new Error(String(reason));
logger.error("Unhandled promise rejection", {
message: error.message,
stack: error.stack
});
errorTracker.track(error, { tool: "unhandled_rejection" });
});
// Catch uncaught exceptions (log and exit)
process.on("uncaughtException", function(error) {
logger.error("Uncaught exception — shutting down", {
message: error.message,
stack: error.stack
});
errorTracker.track(error, { tool: "uncaught_exception" });
// Give logs time to flush
setTimeout(function() {
process.exit(1);
}, 1000);
});
// Memory warning
var memoryCheckInterval = setInterval(function() {
var usage = process.memoryUsage();
var heapPercent = (usage.heapUsed / usage.heapTotal) * 100;
if (heapPercent > 85) {
logger.warn("High memory usage", {
heapUsed: Math.round(usage.heapUsed / 1024 / 1024) + "MB",
heapTotal: Math.round(usage.heapTotal / 1024 / 1024) + "MB",
heapPercent: heapPercent.toFixed(1) + "%",
rss: Math.round(usage.rss / 1024 / 1024) + "MB"
});
}
}, 30000);
return {
stop: function() { clearInterval(memoryCheckInterval); }
};
}
module.exports = setupErrorBoundaries;
Performance Profiling
Slow Tool Detection
// slow-tool-detector.js
var logger = require("./logger");
function SlowToolDetector(options) {
this.thresholdMs = options.thresholdMs || 5000;
this.toolTimings = {};
}
SlowToolDetector.prototype.record = function(toolName, durationMs) {
if (!this.toolTimings[toolName]) {
this.toolTimings[toolName] = {
count: 0,
totalMs: 0,
minMs: Infinity,
maxMs: 0,
slowCount: 0,
p95: [],
recentSlow: []
};
}
var t = this.toolTimings[toolName];
t.count++;
t.totalMs += durationMs;
if (durationMs < t.minMs) t.minMs = durationMs;
if (durationMs > t.maxMs) t.maxMs = durationMs;
// Track for percentile calculation (keep last 1000)
t.p95.push(durationMs);
if (t.p95.length > 1000) t.p95.shift();
if (durationMs > this.thresholdMs) {
t.slowCount++;
t.recentSlow.push({
timestamp: new Date(),
durationMs: durationMs
});
if (t.recentSlow.length > 50) t.recentSlow.shift();
logger.warn("Slow tool invocation detected", {
tool: toolName,
duration: durationMs + "ms",
threshold: this.thresholdMs + "ms",
avgMs: Math.round(t.totalMs / t.count),
slowRate: ((t.slowCount / t.count) * 100).toFixed(1) + "%"
});
}
};
SlowToolDetector.prototype.getReport = function() {
var self = this;
var report = {};
Object.keys(this.toolTimings).forEach(function(tool) {
var t = self.toolTimings[tool];
var sorted = t.p95.slice().sort(function(a, b) { return a - b; });
var p95Index = Math.floor(sorted.length * 0.95);
report[tool] = {
count: t.count,
avgMs: Math.round(t.totalMs / t.count),
minMs: t.minMs === Infinity ? 0 : t.minMs,
maxMs: t.maxMs,
p95Ms: sorted[p95Index] || 0,
slowCount: t.slowCount,
slowRate: ((t.slowCount / t.count) * 100).toFixed(1) + "%"
};
});
return report;
};
module.exports = SlowToolDetector;
Complete Working Example
A fully observable MCP server with structured logging, metrics, tracing, and connection monitoring.
// server.js - Observable MCP Server
var express = require("express");
var McpServer = require("@modelcontextprotocol/sdk/server/mcp.js").McpServer;
var SSEServerTransport = require("@modelcontextprotocol/sdk/server/sse.js").SSEServerTransport;
var logger = require("./logger");
var metrics = require("./metrics");
var ConnectionTracker = require("./connection-tracker");
var ErrorTracker = require("./error-tracker");
var SlowToolDetector = require("./slow-tool-detector");
var setupErrorBoundaries = require("./error-boundaries");
var startConnectionReaper = require("./connection-reaper");
var app = express();
var mcpServer = new McpServer({ name: "observable-mcp", version: "1.0.0" });
var tracker = new ConnectionTracker();
var errorTracker = new ErrorTracker();
var slowDetector = new SlowToolDetector({ thresholdMs: 3000 });
// Setup error boundaries
var boundaries = setupErrorBoundaries(errorTracker);
// Start stale connection reaper
var reaper = startConnectionReaper(tracker, {
maxIdleSeconds: 1800,
checkInterval: 60000
});
// Instrumented tool registration
function registerTool(name, description, schema, handler) {
mcpServer.tool(name, description, schema, function(params, context) {
var startTime = Date.now();
metrics.toolInvocations.inc({ tool: name, status: "started" });
try {
var result = handler(params, context);
if (result && typeof result.then === "function") {
return result.then(function(res) {
var duration = Date.now() - startTime;
metrics.toolInvocations.inc({ tool: name, status: "success" });
metrics.toolDuration.observe({ tool: name }, duration / 1000);
slowDetector.record(name, duration);
return res;
}).catch(function(err) {
var duration = Date.now() - startTime;
metrics.toolInvocations.inc({ tool: name, status: "error" });
metrics.toolErrors.inc({ tool: name, error_type: err.constructor.name });
metrics.toolDuration.observe({ tool: name }, duration / 1000);
slowDetector.record(name, duration);
errorTracker.track(err, { tool: name });
throw err;
});
}
var duration = Date.now() - startTime;
metrics.toolInvocations.inc({ tool: name, status: "success" });
metrics.toolDuration.observe({ tool: name }, duration / 1000);
slowDetector.record(name, duration);
return result;
} catch (err) {
var duration = Date.now() - startTime;
metrics.toolInvocations.inc({ tool: name, status: "error" });
metrics.toolErrors.inc({ tool: name, error_type: err.constructor.name });
metrics.toolDuration.observe({ tool: name }, duration / 1000);
errorTracker.track(err, { tool: name });
throw err;
}
});
}
// Register tools with instrumentation
registerTool("echo", "Echo a message", { message: { type: "string" } }, function(params) {
return { content: [{ type: "text", text: "Echo: " + params.message }] };
});
registerTool("slow_query", "Simulate a slow database query", { delay: { type: "number" } }, function(params) {
var delay = params.delay || 1000;
return new Promise(function(resolve) {
setTimeout(function() {
resolve({ content: [{ type: "text", text: "Query completed after " + delay + "ms" }] });
}, delay);
});
});
// Health and observability endpoints
app.get("/healthz", function(req, res) {
res.json({ status: "alive" });
});
app.get("/metrics", function(req, res) {
metrics.registry.metrics().then(function(data) {
res.set("Content-Type", metrics.registry.contentType);
res.send(data);
});
});
app.get("/debug/connections", function(req, res) {
res.json(tracker.getSnapshot());
});
app.get("/debug/errors", function(req, res) {
res.json({
report: errorTracker.getReport(),
recentRate: errorTracker.getErrorRate(300)
});
});
app.get("/debug/performance", function(req, res) {
res.json(slowDetector.getReport());
});
// SSE endpoint with tracking
app.get("/sse", function(req, res) {
var connectionId = Date.now().toString(36) + Math.random().toString(36).substr(2);
var transport = new SSEServerTransport("/messages", res);
tracker.add(connectionId, {
remoteAddress: req.ip,
userAgent: req.get("user-agent")
});
transport.onclose = function() {
tracker.remove(connectionId);
};
mcpServer.connect(transport);
logger.info("SSE connection established", {
connectionId: connectionId,
remoteAddress: req.ip,
activeConnections: tracker.connections.size
});
});
app.post("/messages", express.json(), function(req, res) {
res.json({ ok: true });
});
// Start server
var port = process.env.MCP_PORT || 3100;
var server = app.listen(port, function() {
logger.info("Observable MCP server started", {
port: port,
env: process.env.NODE_ENV || "development",
pid: process.pid
});
});
// Graceful shutdown
process.on("SIGTERM", function() {
logger.info("SIGTERM received, shutting down");
reaper.stop();
boundaries.stop();
server.close(function() {
logger.info("Server closed");
process.exit(0);
});
});
# Start the server
NODE_ENV=production LOG_LEVEL=info node server.js
# Check connections
curl -s http://localhost:3100/debug/connections | jq .
# View error report
curl -s http://localhost:3100/debug/errors | jq .
# Check tool performance
curl -s http://localhost:3100/debug/performance | jq .
# Scrape metrics
curl -s http://localhost:3100/metrics
Output from /debug/performance:
{
"echo": {
"count": 1250,
"avgMs": 2,
"minMs": 0,
"maxMs": 45,
"p95Ms": 8,
"slowCount": 0,
"slowRate": "0.0%"
},
"slow_query": {
"count": 89,
"avgMs": 2340,
"minMs": 502,
"maxMs": 12500,
"p95Ms": 8200,
"slowCount": 23,
"slowRate": "25.8%"
}
}
Common Issues and Troubleshooting
Log Files Growing Unbounded
Error: ENOSPC: no space left on device, write
Without log rotation, a busy MCP server generates gigabytes of logs quickly. Winston's file transport supports built-in rotation.
// Fix: Configure log rotation
new winston.transports.File({
filename: "/var/log/mcp-server/combined.log",
maxsize: 50 * 1024 * 1024, // 50MB per file
maxFiles: 5, // Keep 5 rotated files
tailable: true
});
Metrics Cardinality Explosion
Error: Too many time series. Prometheus scrape exceeded memory limit.
Using dynamic labels (like user IDs or query text) creates unbounded cardinality. Only use labels with known, finite value sets.
// Bad: unbounded cardinality
toolDuration.observe({ tool: name, query: params.sql }, duration);
// Good: bounded labels only
toolDuration.observe({ tool: name }, duration);
Structured Logs Not Parsing in ELK
Error: Failed to parse JSON log entry at position 342
Multi-line error stacks break JSON parsing. Ensure error stacks are embedded as string values, not raw multi-line output.
// Fix: format errors correctly
winston.format.errors({ stack: true }) // Includes stack as a string field
Debug Endpoints Exposed in Production
GET /debug/errors → 200 (accessible without auth)
Debug endpoints expose internal state. Always gate them behind authentication or disable them in production.
// Fix: Protect debug endpoints
app.use("/debug", function(req, res, next) {
var token = req.get("X-Debug-Token");
if (token !== process.env.DEBUG_TOKEN) {
res.status(403).json({ error: "Forbidden" });
return;
}
next();
});
High Memory Usage from Error Tracking
Warning: High memory usage - heapPercent: 89.2%
Storing every error in memory without limits causes unbounded growth. Cap the error buffer and use sampling for high-volume errors.
// Fix: Limit error buffer size
if (this.errors.length > this.maxErrors) {
this.errors = this.errors.slice(-this.maxErrors);
}
Best Practices
- Always use structured JSON logging in production — text logs are convenient for development but impossible to query at scale. JSON integrates directly with log aggregation platforms.
- Include connection ID in every log entry — when debugging an issue reported by a specific client, filtering by connection ID lets you reconstruct the exact sequence of events.
- Sanitize sensitive data before logging — never log passwords, API keys, tokens, or personally identifiable information. Build sanitization into the logging layer, not individual log calls.
- Set up alerting on error rate, not error count — a steady trickle of errors is normal. A sudden spike in error rate indicates an incident. Alert on rate changes, not absolute numbers.
- Use histograms for latency, not averages — an average response time of 200ms hides the fact that 5% of requests take 10 seconds. Track p50, p95, and p99 percentiles.
- Implement slow query detection with adaptive thresholds — a 3-second threshold works for most tools, but some tools are inherently slow. Track per-tool baselines and alert on deviations.
- Protect debug and metrics endpoints — they expose internal server state including error stacks, connection details, and performance data. Gate them behind authentication tokens.
- Rotate logs and cap in-memory buffers — both disk-based logs and in-memory error trackers need size limits. Without them, a busy server will exhaust disk space or RAM.