Best Practice: Safe Deployments (Feature Flags, Canary, Blue/Green)
Context
Big-bang deployments – all changes to all users simultaneously – belong to the risk profile of the 1990s. Progressive Delivery is the answer: new versions receive traffic incrementally, metrics are compared, and rollback is automatic.
The goal: every deployment is safe, measurable, and reversible.
Target State
- Every deployment starts at 5–10% traffic (canary) or in an isolated environment (blue/green)
- Metrics comparison: error rate and latency of the new version vs. the old version
- Automatic rollback when the error rate of the new version rises by more than a defined threshold (X%)
- Feature flags enable rollback without deployment (< 30 seconds)
Technical Implementation
Canary Deployment with AWS CodeDeploy (ECS)
# Terraform: CodeDeploy Canary for ECS Service
# CodeDeploy application that owns the deployment group(s) of the payment service.
resource "aws_codedeploy_app" "payment_service" {
  compute_platform = "ECS"
  name             = "payment-service"
}
# Production deployment group for the payment ECS service: canary traffic
# shifting between two target groups, with rollback driven by a CloudWatch alarm.
resource "aws_codedeploy_deployment_group" "production" {
  # --- Identity -------------------------------------------------------------
  app_name              = aws_codedeploy_app.payment_service.name
  deployment_group_name = "payment-service-production"
  service_role_arn      = aws_iam_role.codedeploy_role.arn

  # --- Traffic shifting strategy ---------------------------------------------
  # Meaning: 10% traffic to new version, observe for 5 minutes, then 100%
  deployment_config_name = "CodeDeployDefault.ECSCanary10Percent5Minutes"

  deployment_style {
    deployment_type   = "BLUE_GREEN"
    deployment_option = "WITH_TRAFFIC_CONTROL"
  }

  blue_green_deployment_config {
    # Start shifting traffic as soon as the replacement task set is ready.
    deployment_ready_option {
      action_on_timeout    = "CONTINUE_DEPLOYMENT"
      wait_time_in_minutes = 0
    }

    # Keep the old task set for 5 minutes after success, then terminate it.
    terminate_blue_instances_on_deployment_success {
      action                           = "TERMINATE"
      termination_wait_time_in_minutes = 5
    }
  }

  # --- Deployment target ------------------------------------------------------
  ecs_service {
    cluster_name = aws_ecs_cluster.production.name
    service_name = aws_ecs_service.payment_service.name
  }

  load_balancer_info {
    target_group_pair_info {
      prod_traffic_route {
        listener_arns = [aws_lb_listener.https.arn]
      }

      target_group {
        name = aws_lb_target_group.blue.name
      }

      target_group {
        name = aws_lb_target_group.green.name
      }
    }
  }

  # --- Safety net: alarm-driven automatic rollback ----------------------------
  alarm_configuration {
    enabled = true
    alarms  = [aws_cloudwatch_metric_alarm.canary_error_rate.name]
  }

  auto_rollback_configuration {
    enabled = true
    events = [
      "DEPLOYMENT_FAILURE",
      "DEPLOYMENT_STOP_ON_ALARM"
    ]
  }
}
# CloudWatch Alarm that triggers automatic rollback.
# The alarm evaluates a single 60-second period, so it fires as soon as one
# minute contains more than `threshold` target 5xx responses.
resource "aws_cloudwatch_metric_alarm" "canary_error_rate" {
  alarm_name          = "canary-error-rate-high"
  comparison_operator = "GreaterThanThreshold"
  evaluation_periods  = 1
  threshold           = 20 # Rollback when 5xx > 20 within one 60-second period

  # The ALB reports HTTPCode_Target_5XX_Count only when it is non-zero, so a
  # healthy canary produces no datapoints. Treat that as "OK" instead of
  # leaving the alarm in INSUFFICIENT_DATA.
  treat_missing_data = "notBreaching"

  metric_query {
    id = "error_count"
    # NOTE(review): without a WHERE clause this sums 5xx responses over every
    # LoadBalancer/TargetGroup combination visible to the query, not only the
    # payment service. Confirm this scope is intended or add a filter.
    expression  = "SELECT SUM(HTTPCode_Target_5XX_Count) FROM SCHEMA(\"AWS/ApplicationELB\", LoadBalancer, TargetGroup)"
    return_data = true
    period      = 60
  }

  alarm_actions = [] # CodeDeploy monitors this alarm directly
}
Blue/Green Deployment with AWS Lambda (Weighted Alias)
# Lambda Blue/Green with alias and weighting
#
# A Lambda alias can only split traffic between two *published versions of the
# same function*. The stable and the new code are therefore versions of ONE
# function resource (not two separate functions): the alias keeps pointing at
# the known-good version and sends a small share of traffic to the version that
# was just published.

# Version number that currently serves production traffic. Raise it to the new
# version once the canary has proven itself (promotion), or leave it untouched
# and remove the weight to roll back.
variable "payment_processor_stable_version" {
  description = "Published version of the payment processor that is considered stable (e.g. \"41\")"
  type        = string
}

resource "aws_lambda_function" "payment_processor" {
  function_name = "payment-processor-stable"

  # Publish a numbered version on every code/config change. Without this,
  # `version` resolves to $LATEST, which an alias with routing_config cannot use.
  publish = true

  # ... function configuration (runtime, handler, role, deployment package)
}

resource "aws_lambda_alias" "live" {
  name          = "live"
  function_name = aws_lambda_function.payment_processor.function_name

  # Primary target: the stable version receives the remaining 90% of traffic.
  function_version = var.payment_processor_stable_version

  routing_config {
    additional_version_weights = {
      # 10% traffic to new version (Canary)
      (aws_lambda_function.payment_processor.version) = 0.1
    }
  }
}
Feature Flags with AWS AppConfig
# AppConfig application that groups the payment service's environments and
# configuration profiles.
resource "aws_appconfig_application" "payment_service" {
  description = "Feature flags for payment service"
  name        = "payment-service"
}
# Production environment for the payment service's AppConfig application.
resource "aws_appconfig_environment" "production" {
  name           = "production"
  application_id = aws_appconfig_application.payment_service.id

  # No deployment without monitoring: AppConfig watches this alarm while a
  # configuration deployment is in progress and rolls the change back if the
  # alarm fires.
  # The nested block is called `monitor` (singular) in the AWS provider.
  monitor {
    alarm_arn      = aws_cloudwatch_metric_alarm.payment_error_rate.arn
    alarm_role_arn = aws_iam_role.appconfig_monitor_role.arn
  }
}
# Hosted feature-flag profile for the payment service, guarded by a JSON Schema
# validator.
resource "aws_appconfig_configuration_profile" "feature_flags" {
  application_id = aws_appconfig_application.payment_service.id
  name           = "payment-features"
  type           = "AWS.AppConfig.FeatureFlags"
  location_uri   = "hosted"

  validator {
    type = "JSON_SCHEMA"

    # NOTE(review): the schema describes a top-level `new_checkout_flow`
    # object, i.e. the shape the application reads. The hosted configuration
    # is stored as { flags, values, version } — confirm which shape the
    # validator is evaluated against.
    content = jsonencode({
      "$schema" = "http://json-schema.org/draft-07/schema#"
      type      = "object"
      properties = {
        new_checkout_flow = {
          type = "object"
          properties = {
            enabled    = { type = "boolean" }
            percentage = { type = "number", minimum = 0, maximum = 100 }
          }
        }
      }
    })
  }
}
# Feature Flag Values (Hosted Configuration)
# Content follows the AWS.AppConfig.FeatureFlags document layout:
#   flags   – flag definitions (display name, description, attribute constraints)
#   values  – current state of each flag (enabled + attribute values)
#   version – document format version
resource "aws_appconfig_hosted_configuration_version" "flags_v1" {
  application_id           = aws_appconfig_application.payment_service.id
  configuration_profile_id = aws_appconfig_configuration_profile.feature_flags.configuration_profile_id
  description              = "Initial feature flags"
  content_type             = "application/json"

  content = jsonencode({
    flags = {
      new_checkout_flow = {
        name        = "New Checkout Flow"
        description = "Gradual rollout of redesigned checkout"
        # The on/off state is not part of the definition; it lives under
        # `values` below.
        attributes = {
          percentage = {
            # Same bounds as the profile's JSON schema (0–100).
            constraints = {
              type    = "number"
              minimum = 0
              maximum = 100
            }
          }
        }
      }
    }
    values = {
      new_checkout_flow = {
        enabled    = false
        percentage = 0
      }
    }
    version = "1"
  })
}
Using Feature Flags in Application Code
// payment-service/src/checkout.js
const appConfig = require('./appconfig-client');
/**
 * Maps a user to a stable rollout bucket in the range [0, 100).
 *
 * The same (flagName, userId) pair always yields the same bucket, so a user
 * stays on one side of a percentage rollout across requests. Including the
 * flag name decorrelates buckets between different flags.
 *
 * @param {string} flagName - Key of the feature flag.
 * @param {string|number} userId - Stable identifier of the user.
 * @returns {number} Bucket value, 0 <= bucket < 100.
 */
function rolloutBucket(flagName, userId) {
  const { createHash } = require('crypto');
  const digest = createHash('sha256').update(`${flagName}:${userId}`).digest();
  // First 32 bits of the digest, scaled to [0, 100).
  return (digest.readUInt32BE(0) / 2 ** 32) * 100;
}

/**
 * Processes a checkout, routing the user to the new or the legacy flow
 * depending on the `new_checkout_flow` feature flag.
 *
 * The new flow is used only when the flag is enabled AND the user's rollout
 * bucket is below the configured percentage. A missing flag entry selects the
 * legacy flow.
 *
 * @param {object} user - Current user; `user.id` is used for bucketing and logging.
 * @param {object} cart - Cart to check out.
 * @returns {Promise<*>} Result of the selected checkout implementation.
 */
async function processCheckout(user, cart) {
  // Retrieve feature flag (cached, < 10ms latency)
  const flags = await appConfig.getConfiguration('payment-service', 'production', 'payment-features');
  const flag = flags && flags.new_checkout_flow;

  // Sticky bucketing: a per-request Math.random() would let the same user
  // bounce between both flows mid-session. Random assignment is kept only as
  // a fallback when no user id is available.
  const bucket = user && user.id != null
    ? rolloutBucket('new_checkout_flow', user.id)
    : Math.random() * 100;

  const useNewFlow = Boolean(flag && flag.enabled) && bucket < flag.percentage;

  if (useNewFlow) {
    logger.info('Using new checkout flow', {
      trace_id: getCurrentTraceId(),
      user_id: user.id,
      feature_flag: 'new_checkout_flow'
    });
    return processNewCheckout(user, cart);
  }

  return processLegacyCheckout(user, cart);
}
Canary Rollback Procedure
#!/bin/bash
# canary-rollback.sh – Immediate rollback for a canary deployment
#
# Usage: canary-rollback.sh <deployment-id> [cluster] [service]
#   deployment-id  CodeDeploy deployment to stop (required)
#   cluster        ECS cluster name   (default: payment-production)
#   service        ECS service name   (default: payment-service)
#
# Strategy:
#   1. Ask CodeDeploy to stop the deployment and roll back.
#   2. Only if that fails, point the ECS service at the previous task
#      definition revision.
set -euo pipefail

DEPLOYMENT_ID="${1:?Usage: $0 <deployment-id> [cluster] [service]}"
CLUSTER="${2:-payment-production}"
SERVICE="${3:-payment-service}"

echo "Rolling back deployment: ${DEPLOYMENT_ID}"

# Option 1: Stop CodeDeploy deployment with rollback
if aws deploy stop-deployment \
    --deployment-id "${DEPLOYMENT_ID}" \
    --auto-rollback-enabled; then
  echo "Rollback initiated via CodeDeploy. Monitor: aws deploy get-deployment --deployment-id ${DEPLOYMENT_ID}"
  exit 0
fi

echo "CodeDeploy stop-deployment failed; falling back to direct ECS rollback" >&2

# Option 2: Directly deploy previous ECS Task Definition
# NOTE: ECS rejects task definition changes through update-service for
# services that use the CODE_DEPLOY deployment controller; this path is only
# usable for services with the ECS rolling-update controller.
CURRENT_TASK_DEF=$(aws ecs describe-services \
  --cluster "${CLUSTER}" \
  --services "${SERVICE}" \
  --query 'services[0].taskDefinition' \
  --output text)
echo "Current task definition: ${CURRENT_TASK_DEF}"

# Determine Task Definition version -1
# ARN layout: arn:aws:ecs:<region>:<account>:task-definition/<family>:<revision>
# Strip everything up to the last "/", then split "<family>:<revision>" on the
# last ":".
TASK_DEF_NAME="${CURRENT_TASK_DEF##*/}"
TASK_DEF_FAMILY="${TASK_DEF_NAME%:*}"
CURRENT_VERSION="${TASK_DEF_NAME##*:}"

if ! [[ "${CURRENT_VERSION}" =~ ^[0-9]+$ ]]; then
  echo "Could not determine task definition revision from: ${CURRENT_TASK_DEF}" >&2
  exit 1
fi
if (( CURRENT_VERSION <= 1 )); then
  echo "No previous revision exists for ${TASK_DEF_FAMILY} (current: ${CURRENT_VERSION})" >&2
  exit 1
fi

# Assumes the revision directly before the current one is the last good one
# and is still ACTIVE.
PREV_VERSION=$((CURRENT_VERSION - 1))
echo "Rolling back to version: ${PREV_VERSION}"

aws ecs update-service \
  --cluster "${CLUSTER}" \
  --service "${SERVICE}" \
  --task-definition "${TASK_DEF_FAMILY}:${PREV_VERSION}"

echo "Rollback initiated. Monitor: aws ecs describe-services --cluster ${CLUSTER} --services ${SERVICE}"
Common Anti-Patterns
| Anti-Pattern | Problem |
|---|---|
| AllAtOnce deployment in production | 100% blast radius; no way to isolate a faulty version |
| Canary without health check alarms | Canary completes even with 50% error rate – no auto-rollback |
| Feature flags never removed (stale flags) | Code complexity grows; "what happens if I remove this flag?" – nobody knows anymore |
| Rollback requires a new deployment | During an incident: 10–20 minute wait for a deployment cycle instead of a 30-second flag toggle |
| Blue/Green without traffic-split monitoring | Both environments running, but no metrics comparison between Blue and Green |
Metrics
- Change Failure Rate: % of deployments that required rollback (target: < 5%)
- MTTR after a failed deployment: Time to full recovery (target: < 5 minutes with rollback)
- Blast radius reduction: % of failed deployments with < 10% user impact (target: > 80%)
- Feature flag lifecycle: Average lifespan of feature flags (warning signal: > 90 days)
Maturity Levels
| Level | Characteristics |
|---|---|
| Level 1 | Big-bang deployments. Rollback = new deployment = 15+ minutes. |
| Level 2 | Rolling updates or manual blue/green. No auto-rollback. |
| Level 3 | Canary or blue/green with health-check-based auto-rollback. Feature flags for new features. |
| Level 4 | Automated canary analysis via metrics comparison. Change Failure Rate tracked. |
| Level 5 | DORA Elite: Change Failure Rate < 5%. Automated deployment decision based on metrics. |