Best Practice: Building an Observability Stack
Context
Observability is the ability to understand the internal state of a system from its outputs. It consists of three pillars: logs, metrics, and traces. A system missing any of these three pillars is blind during an incident.
Target State
A complete observability stack:
- All services emit structured JSON logs with trace ID and request ID
- Distributed tracing is implemented via OpenTelemetry and is visualizable
- RED metrics (Rate, Errors, Duration) are exported for every service
- Dashboards show service health at a glance
- Log retention policies control costs and compliance
Technical Implementation
Step 1: Set Up Structured Logging
// Node.js example with Winston and OpenTelemetry
const winston = require('winston');
const { trace } = require('@opentelemetry/api');

// Structured JSON logger: every entry gets a timestamp and is machine-parseable.
const logger = winston.createLogger({
  format: winston.format.combine(
    winston.format.timestamp(),
    winston.format.json()
  ),
  transports: [new winston.transports.Console()],
});

/**
 * Middleware helper: embed the active trace ID and span ID in every log entry
 * so log lines can be correlated with distributed traces.
 *
 * @returns {{trace_id: string, span_id: string}} IDs of the active span, or
 *   the sentinels 'no-trace' / 'no-span' when no span is active (e.g. code
 *   running outside a traced request).
 */
function createLogContext() {
  const span = trace.getActiveSpan();
  const spanContext = span?.spanContext();
  return {
    trace_id: spanContext?.traceId ?? 'no-trace',
    span_id: spanContext?.spanId ?? 'no-span',
  };
}

// Usage:
logger.info('Payment processed', {
  ...createLogContext(),
  service: 'payment-service',
  request_id: req.headers['x-request-id'],
  // Optional chaining: unauthenticated requests have no req.user, and logging
  // must never throw.
  user_id: req.user?.id,
  amount: payment.amount,
  currency: payment.currency,
});

// Expected log output format:
// {
//   "timestamp": "2025-03-18T10:30:00.000Z",
//   "level": "info",
//   "service": "payment-service",
//   "trace_id": "4bf92f3577b34da6a3ce929d0e0e4736",
//   "span_id": "00f067aa0ba902b7",
//   "request_id": "req-abc123",
//   "message": "Payment processed",
//   "amount": 99.99,
//   "currency": "EUR"
// }
Step 2: Configure OpenTelemetry (AWS X-Ray Backend)
// otel-setup.js – import before everything else so the auto-instrumentations
// can patch http/express before application code loads them.
const { NodeSDK } = require('@opentelemetry/sdk-node');
const { AWSXRayPropagator } = require('@opentelemetry/propagator-aws-xray');
const { AWSXRayIdGenerator } = require('@opentelemetry/id-generator-aws-xray');
const { OTLPTraceExporter } = require('@opentelemetry/exporter-trace-otlp-http');
const { HttpInstrumentation } = require('@opentelemetry/instrumentation-http');
const { ExpressInstrumentation } = require('@opentelemetry/instrumentation-express');

const sdk = new NodeSDK({
  // X-Ray-compatible trace-context propagation and trace ID format.
  textMapPropagator: new AWSXRayPropagator(),
  idGenerator: new AWSXRayIdGenerator(),
  traceExporter: new OTLPTraceExporter({
    url: process.env.OTEL_EXPORTER_OTLP_ENDPOINT || 'http://localhost:4318/v1/traces',
  }),
  instrumentations: [
    new HttpInstrumentation(),
    new ExpressInstrumentation(),
  ],
  serviceName: process.env.SERVICE_NAME || 'payment-service',
});

sdk.start();

// Flush buffered spans on shutdown; handle the returned Promise so a failed
// flush cannot surface as an unhandled rejection.
process.on('SIGTERM', () => {
  sdk
    .shutdown()
    .catch((err) => console.error('OTel SDK shutdown failed', err));
});
Step 3: Configure CloudWatch Log Groups with Terraform
# CloudWatch Log Group for application
# Retention is capped so storage cost stays bounded (see the "No log
# retention" anti-pattern below).
resource "aws_cloudwatch_log_group" "app" {
  name              = "/aws/ecs/${var.service_name}"
  retention_in_days = 90 # 90 days for application logs

  tags = {
    service     = var.service_name
    environment = var.environment
    managed-by  = "terraform"
  }
}
# CloudWatch Log Group for security/audit logs (longer retention)
resource "aws_cloudwatch_log_group" "audit" {
  name              = "/audit/${var.service_name}"
  retention_in_days = 365 # 1 year for audit logs

  tags = {
    service     = var.service_name
    environment = var.environment
    log-type    = "audit"
    # Consistent with the application log group above.
    managed-by = "terraform"
  }
}
# Metric filter: counts error-level entries in the structured JSON logs and
# publishes them as the CloudWatch metric PaymentService/ErrorCount.
# NOTE: Winston emits lowercase level names ("error", "info" — see the sample
# log output above) and metric filter patterns are case-sensitive, so the
# pattern must match "error", not "ERROR".
resource "aws_cloudwatch_log_metric_filter" "error_rate" {
  name           = "${var.service_name}-error-rate"
  pattern        = "{ $.level = \"error\" }"
  log_group_name = aws_cloudwatch_log_group.app.name

  metric_transformation {
    name      = "ErrorCount"
    namespace = "PaymentService"
    value     = "1"
    # Report 0 when no log events match, so the metric has data points in
    # quiet periods and downstream alarms don't treat silence as missing data.
    default_value = "0"
  }
}
# Configure X-Ray sampling rule
# The reservoir guarantees a baseline number of traces per second even when
# the percentage-based fixed_rate would sample almost nothing, so low-traffic
# periods remain observable.
resource "aws_xray_sampling_rule" "payment_service" {
  rule_name      = "payment-service-sampling"
  priority       = 1000
  reservoir_size = 5    # Minimum 5 traces/second
  fixed_rate     = 0.05 # 5% sampling for normal requests
  url_path       = "/api/payment/*"
  host           = "*"
  http_method    = "*"
  service_type   = "*"
  service_name   = "payment-service"
  resource_arn   = "*"
  version        = 1
}
Step 4: Create a Dashboard
# RED dashboard (Rate, Errors, Duration) for the service.
# NOTE(review): RequestCount and Latency are assumed to be emitted by the
# application into the PaymentService namespace — they are not defined in
# this file; verify against the service's metric publishing code.
resource "aws_cloudwatch_dashboard" "service_health" {
  dashboard_name = "${var.service_name}-health"

  dashboard_body = jsonencode({
    widgets = [
      {
        # Rate: requests per minute.
        type   = "metric"
        width  = 8
        height = 6
        properties = {
          title   = "Request Rate (RPM)"
          period  = 60
          stat    = "Sum"
          metrics = [["PaymentService", "RequestCount"]]
        }
      },
      {
        # Errors: ErrorCount is the metric actually published by the log
        # metric filter above (no "ErrorRate" metric exists in this config),
        # so sum it per period instead of averaging a nonexistent percentage.
        type   = "metric"
        width  = 8
        height = 6
        properties = {
          title   = "Errors (count/min)"
          period  = 60
          stat    = "Sum"
          metrics = [["PaymentService", "ErrorCount"]]
        }
      },
      {
        # Duration: tail latency.
        type   = "metric"
        width  = 8
        height = 6
        properties = {
          title   = "p99 Latency (ms)"
          period  = 60
          stat    = "p99"
          metrics = [["PaymentService", "Latency"]]
        }
      }
    ]
  })
}
Common Anti-Patterns
| Anti-Pattern | Problem |
|---|---|
| Unstructured logging (plain text) | Not machine-parseable; log searches are tedious; trace ID correlation impossible |
| Logs without trace ID | Requests cannot be tracked across services; root cause analysis takes hours |
| No log retention | Costs grow without bound; compliance violation (GDPR: no unlimited data storage) |
| 100% sampling rate in production | Tracing costs explode under high traffic; alternative: adaptive sampling |
| Sensitive data in logs | GDPR violation; PII, passwords, tokens in logs are a security incident |
| Dashboard without alerting | Dashboard is only checked when someone looks; symptoms go unnoticed until users complain |
Metrics
- Log availability: Are logs available in the aggregation platform within 60 seconds? (target: yes)
- Trace coverage: % of services with configured distributed tracing (target: 100%)
- MTTR correlation: Time to diagnosis with vs. without tracing (measure at postmortems)
Maturity Levels
| Level | Characteristics |
|---|---|
| Level 1 | Unstructured logs in files; no central aggregation; no tracing. |
| Level 2 | Logs centrally aggregated; basic dashboards; no distributed tracing; no structured format. |
| Level 3 | Structured JSON logs with trace ID; distributed tracing configured; RED metrics; log retention. |
| Level 4 | OpenTelemetry vendor-agnostic; SLO-based alerting on metrics; trace sampling optimized. |
| Level 5 | Full correlation of logs/traces/metrics; observability-as-a-product with internal SLAs. |