Soak Testing: Testing Under Sustained Load
A soak test (endurance test) runs the system under normal or moderate load for 4–24 hours. It reveals issues that don't surface in short, minutes-long tests: memory leaks, accumulating file descriptors, DB connection pool degradation, and slow query growth from data accumulation.
What Soak Tests Detect
Memory leaks: application grows by 100–200MB/hour and crashes with OOM after 12 hours.
Connection pool exhaustion: DB connections aren't returned to pool, after 6 hours pool exhausted—new requests wait until timeout.
Heap accumulation: the JVM/Node.js GC copes for the first 2 hours, then Full GC pauses begin to affect latency.
Table growth without autovacuum: PostgreSQL bloat—after a million UPDATE/DELETE operations, performance degrades without vacuum.
File descriptor leak: each request opens log file or socket without closing—after 8 hours ulimit exhausted.
k6 Soak Test Scenario
// tests/soak/endurance.js
import http from 'k6/http'
import { check, sleep } from 'k6'
import { Rate, Trend, Gauge } from 'k6/metrics'
// Custom k6 metrics, tracked across the whole soak run.
const errorRate = new Rate('errors') // fraction of failed requests (thresholded below)
const p95Latency = new Trend('p95_latency_trend', true) // health-endpoint latency time series (true = time metric)
const activeUsers = new Gauge('active_users') // incremented once per VU iteration
// Soak profile: short ramp-up, an 8-hour steady plateau at 50 VUs, short ramp-down.
const RAMP_UP = { duration: '5m', target: 50 };
const STEADY_STATE = { duration: '8h', target: 50 };
const RAMP_DOWN = { duration: '5m', target: 0 };

export const options = {
  stages: [RAMP_UP, STEADY_STATE, RAMP_DOWN],
  thresholds: {
    // Latency must stay flat for the whole run — degradation signals accumulation.
    http_req_duration: ['p(95)<600'],
    // Errors should be essentially absent (leaks tend to surface as errors first).
    errors: ['rate<0.001'],
    // Connection-setup time creeping up points at pool/socket exhaustion.
    http_req_connecting: ['p(95)<50'],
  },
};
const BASE_URL = __ENV.BASE_URL || 'https://staging.example.com';

// Runs once before the load starts; the returned token is handed to every VU iteration.
export function setup() {
  const credentials = JSON.stringify({
    email: '[email protected]',
    password: __ENV.TEST_PASSWORD,
  });
  const res = http.post(`${BASE_URL}/api/auth/login`, credentials, {
    headers: { 'Content-Type': 'application/json' },
  });
  return { token: res.json('token') };
}
// One VU iteration: a weighted mix of operations mirroring real production traffic.
export default function (data) {
  const headers = {
    'Authorization': `Bearer ${data.token}`,
    'Content-Type': 'application/json',
  };
  activeUsers.add(1);

  // Weighted scenario pick: 60% read / 20% write / 10% search / 10% profile.
  const roll = Math.random();
  if (roll < 0.6) {
    // 60%: read data
    const page = Math.ceil(Math.random() * 50);
    const res = http.get(`${BASE_URL}/api/products?page=${page}`, { headers });
    check(res, { 'read: 200': (resp) => resp.status === 200 });
    errorRate.add(res.status !== 200);
  } else if (roll < 0.8) {
    // 20%: write data (creating real records)
    const payload = JSON.stringify({
      productId: Math.ceil(Math.random() * 1000),
      quantity: 1,
    });
    const res = http.post(`${BASE_URL}/api/cart/items`, payload, { headers });
    check(res, { 'write: 2xx': (resp) => resp.status < 300 });
    errorRate.add(res.status >= 400);
  } else if (roll < 0.9) {
    // 10%: search
    const res = http.get(`${BASE_URL}/api/search?q=test&limit=20`, { headers });
    check(res, { 'search: 200': (resp) => resp.status === 200 });
    errorRate.add(res.status !== 200);
  } else {
    // 10%: user profile
    const res = http.get(`${BASE_URL}/api/me`, { headers });
    check(res, { 'profile: 200': (resp) => resp.status === 200 });
    errorRate.add(res.status !== 200);
  }

  // Unauthenticated health probe feeds the latency time series.
  p95Latency.add(http.get(`${BASE_URL}/api/health`).timings.duration);
  sleep(Math.random() * 2 + 0.5); // 0.5–2.5 s think time between iterations
}
Monitoring Memory Leaks
#!/bin/bash
# scripts/memory-soak-monitor.sh
# Run in parallel with the k6 soak test.
# Samples the Node.js app's memory (via its /metrics/memory endpoint) and its
# open file-descriptor count once per minute, appending rows to a CSV.
#
# Fixes over the previous version:
# - all variable expansions are quoted (unquoted $METRICS/$LOG_FILE were subject
#   to word splitting and globbing);
# - exits with an error when the target process is not found (previously the
#   loop ran anyway and logged garbage from /proc//fd).

APP_PID=$(pgrep -f "node server.js")
if [ -z "$APP_PID" ]; then
  echo "ERROR: no process matching 'node server.js' found" >&2
  exit 1
fi

LOG_FILE="soak-memory-$(date +%Y%m%d-%H%M).csv"
echo "timestamp,rss_mb,heap_used_mb,heap_total_mb,external_mb,fd_count" > "$LOG_FILE"

while true; do
  TS=$(date -u +%Y-%m-%dT%H:%M:%SZ)
  # Node.js memory via the /metrics/memory endpoint (if exposed)
  METRICS=$(curl -s http://localhost:3000/metrics/memory)
  RSS=$(echo "$METRICS" | jq -r '.rss')
  HEAP_USED=$(echo "$METRICS" | jq -r '.heapUsed')
  HEAP_TOTAL=$(echo "$METRICS" | jq -r '.heapTotal')
  EXTERNAL=$(echo "$METRICS" | jq -r '.external')
  # Open file descriptors — a steady climb here means an fd leak.
  FD_COUNT=$(ls "/proc/$APP_PID/fd" 2>/dev/null | wc -l)
  echo "$TS,$RSS,$HEAP_USED,$HEAP_TOTAL,$EXTERNAL,$FD_COUNT" >> "$LOG_FILE"
  echo "[$TS] RSS: ${RSS}MB | Heap: ${HEAP_USED}/${HEAP_TOTAL}MB | FDs: $FD_COUNT"
  sleep 60 # every minute
done
// Express/Fastify endpoint for exposing memory
// Reports process.memoryUsage() figures rounded to whole megabytes so the
// soak monitor script can log them as CSV columns.
app.get('/metrics/memory', (req, res) => {
  const toMb = (bytes) => Math.round(bytes / 1024 / 1024);
  const { rss, heapUsed, heapTotal, external } = process.memoryUsage();
  res.json({
    rss: toMb(rss),
    heapUsed: toMb(heapUsed),
    heapTotal: toMb(heapTotal),
    external: toMb(external),
  });
});
PostgreSQL Monitoring During Soak
-- Run every 15 minutes during the soak test and save the results,
-- so each metric can be plotted as a trend over the run.

-- 1. Table growth (bloat): dead_pct is the share of dead tuples per table.
--    A rising dead_pct combined with a stale last_autovacuum means autovacuum
--    is not keeping up with the UPDATE/DELETE churn.
SELECT relname, n_live_tup, n_dead_tup,
round(n_dead_tup::numeric / nullif(n_live_tup + n_dead_tup, 0) * 100, 1) AS dead_pct,
last_vacuum, last_autovacuum
FROM pg_stat_user_tables
ORDER BY n_dead_tup DESC LIMIT 10;

-- 2. Accumulating idle transactions (connection leak): a growing count of
--    'idle in transaction' sessions indicates connections are checked out of
--    the pool and never returned. Excludes this monitoring session itself.
SELECT count(*), state, wait_event_type
FROM pg_stat_activity
WHERE pid != pg_backend_pid()
GROUP BY state, wait_event_type
ORDER BY count DESC;

-- 3. Temporary file accumulation: cumulative counters since stats reset;
--    steady growth during the soak means queries keep spilling to disk
--    (work_mem too small or plans degrading as data accumulates).
SELECT temp_files, temp_bytes
FROM pg_stat_database
WHERE datname = current_database();
Analyzing Degradation Trend
# analyze_soak.py
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
def analyze_memory_trend(csv_file: str, oom_limit_mb: float = 4096):
    """Fit a linear trend to RSS samples from a soak-test memory log.

    Args:
        csv_file: path to a CSV with at least 'timestamp' (parseable dates)
            and 'rss_mb' columns, as produced by memory-soak-monitor.sh.
        oom_limit_mb: memory ceiling used to project time-to-OOM. Defaults
            to 4096 (a 4GB container limit), matching the previous
            hard-coded value, but can now be set per environment.

    Returns:
        dict with keys 'slope_mb_per_min', 'r_squared', 'hours_to_oom'
        (None when memory is not growing) and 'leak_detected'.
    """
    df = pd.read_csv(csv_file, parse_dates=['timestamp'])
    df['minutes'] = (df['timestamp'] - df['timestamp'].iloc[0]).dt.total_seconds() / 60

    # Linear regression of RSS over elapsed minutes.
    slope, intercept, r_value, p_value, std_err = stats.linregress(
        df['minutes'], df['rss_mb']
    )

    hours_to_oom = None
    if slope > 0:
        # Project when RSS reaches the OOM ceiling at the current growth rate.
        current_rss = df['rss_mb'].iloc[-1]
        hours_to_oom = (oom_limit_mb - current_rss) / (slope * 60)

    print(f"Memory growth rate: {slope:.2f} MB/min ({slope*60:.1f} MB/hour)")
    print(f"R²: {r_value**2:.3f} (1.0 = perfect linear growth = definite leak)")
    # `is not None` instead of truthiness: a projection of exactly 0.0 hours
    # is the most urgent case of all, and the old check silently dropped it.
    if hours_to_oom is not None:
        print(f"Estimated OOM in: {hours_to_oom:.1f} hours")

    # Growth counts as a leak only when statistically significant (p < 0.01)
    # AND practically meaningful (> 0.1 MB/min ≈ 6 MB/hour). Computed once so
    # the printed verdict and the returned flag can never disagree.
    leak_detected = p_value < 0.01 and slope > 0.1
    if leak_detected:
        print("⚠️ MEMORY LEAK DETECTED (statistically significant growth)")
    else:
        print("✓ No significant memory leak detected")

    return {
        'slope_mb_per_min': slope,
        'r_squared': r_value ** 2,
        'hours_to_oom': hours_to_oom,
        'leak_detected': leak_detected,
    }
# Run the analysis against a captured soak log.
# NOTE(review): memory-soak-monitor.sh names files soak-memory-%Y%m%d-%H%M.csv
# (4-digit time), while this example has a 6-digit suffix — confirm which
# pattern is actually produced before wiring this into automation.
result = analyze_memory_trend('soak-memory-20240315-100000.csv')
Typical Findings and Solutions
EventEmitter leak (Node.js): MaxListenersExceededWarning in logs. Add emitter.removeListener() or use once().
Unclosed DB connections: use pool.release() in finally block or ORM-level connection pooling.
Accumulating cron jobs: if a scheduled job starts while the previous run is still executing, runs pile up—add a mutex lock to skip overlapping runs.
Redis pub/sub leak: unsubscribe from channels when connection closes.
Timeline
Setting up and running soak test for 8–24 hours with memory trend and performance analysis—2–3 business days.







