Retries done wrong make things worse
Naive retries during an outage create a thundering herd — thousands of clients all retrying simultaneously, making the overload worse. Here are the patterns that actually work.
Pattern 1: Exponential backoff with full jitter
Python
import time
import random
def exponential_backoff(attempt: int, base: float = 1.0, max_delay: float = 60.0) -> float:
    """Return a full-jitter backoff delay in seconds.

    Full jitter: random(0, min(cap, base * 2^attempt)).

    Args:
        attempt: Zero-based retry attempt number.
        base: Delay ceiling for attempt 0, in seconds.
        max_delay: Cap applied to the delay ceiling, in seconds.

    Returns:
        A value drawn uniformly between 0 and the capped ceiling.
    """
    try:
        # Float exponentiation raises OverflowError right away for very large
        # attempts; the integer form first builds a huge int and then fails
        # when it is multiplied by a float base.
        ceiling = base * 2.0 ** attempt
    except OverflowError:
        # The uncapped ceiling is astronomically large, so the cap applies.
        ceiling = max_delay
    return random.uniform(0, min(max_delay, ceiling))
# Example: attempt 0=0-1s, 1=0-2s, 2=0-4s, 3=0-8s, 4=0-16s
Pattern 2: Decorrelated jitter (AWS recommended)
Python
def decorrelated_jitter(prev_delay: float, base: float = 1.0, max_delay: float = 60.0) -> float:
    """Return the next decorrelated-jitter delay in seconds.

    sleep = min(cap, random(base, prev_delay * 3))

    Args:
        prev_delay: The delay used for the previous attempt, in seconds.
        base: Lower bound of the sampled range, in seconds.
        max_delay: Cap applied to the sampled delay, in seconds.

    Returns:
        A delay sampled uniformly from [base, prev_delay * 3], capped at
        max_delay.
    """
    # random.uniform accepts a reversed range, so without this clamp a small
    # prev_delay (prev_delay * 3 < base) would sample values below base.
    upper = max(base, prev_delay * 3)
    return min(max_delay, random.uniform(base, upper))
# More aggressive spread, recommended by AWS
Pattern 3: Complete retry decorator
Python
import functools
from openai import RateLimitError, APIConnectionError, APITimeoutError
def retry_ai(max_retries=5, retryable_errors=(RateLimitError, APIConnectionError, APITimeoutError)):
    """Build a decorator that retries a call with decorrelated-jitter delays.

    The wrapped function is called up to ``max_retries + 1`` times. Only
    exceptions matching ``retryable_errors`` trigger a retry; any other
    exception propagates immediately.

    Args:
        max_retries: Number of retries after the first attempt (>= 0).
        retryable_errors: Exception class or tuple of classes to retry on.

    Returns:
        A decorator that wraps a function with the retry loop.

    Raises:
        ValueError: If ``max_retries`` is negative.
    """
    if max_retries < 0:
        # With a negative count the retry loop would never run, so the wrapped
        # function would never be called and None would be returned silently.
        raise ValueError("max_retries must be >= 0")

    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            prev_delay = 1.0
            for attempt in range(max_retries + 1):
                try:
                    return func(*args, **kwargs)
                except retryable_errors as e:
                    if attempt == max_retries:
                        # Out of retries: surface the last error unchanged.
                        raise
                    # Each delay is derived from the previous one.
                    delay = decorrelated_jitter(prev_delay)
                    prev_delay = delay
                    print(f"Retry {attempt+1}/{max_retries}: {type(e).__name__}. Waiting {delay:.1f}s")
                    time.sleep(delay)
        return wrapper
    return decorator
# Usage
@retry_ai(max_retries=3)
def call_ai(prompt: str) -> str:
return client.chat.completions.create(
model="claude-sonnet-4-20250514",
messages=[{"role": "user", "content": prompt}],
timeout=30
).choices[0].message.contentPattern 4: Circuit breaker (production-grade)
Python
from enum import Enum
from datetime import datetime, timedelta
from threading import Lock
class State(Enum):
    """States of the circuit breaker."""

    CLOSED = "closed"  # Normal operation
    OPEN = "open"  # Failing — reject all requests
    HALF_OPEN = "half_open"  # Testing recovery


class CircuitBreaker:
    """Decorator that stops calling a function after repeated failures.

    The breaker starts CLOSED. After ``failure_threshold`` consecutive
    failures it becomes OPEN and rejects calls without invoking the function.
    Once ``recovery_timeout`` seconds have passed since the last failure, the
    next call moves it to HALF_OPEN. From there, ``success_threshold``
    successful calls close it again, and any failure re-opens it.
    """

    def __init__(self, failure_threshold=5, recovery_timeout=30, success_threshold=3):
        """Create a closed breaker.

        Args:
            failure_threshold: Consecutive failures that open the circuit.
            recovery_timeout: Seconds to wait after the last failure before
                allowing trial calls.
            success_threshold: Successful trial calls needed to close the
                circuit again.
        """
        self.failure_threshold = failure_threshold
        self.recovery_timeout = recovery_timeout
        self.success_threshold = success_threshold
        self.state = State.CLOSED
        self.failures = 0
        self.successes = 0
        self.last_failure_time = None
        # Guards state and counters; the wrapped call itself runs outside it.
        self.lock = Lock()

    def __call__(self, func):
        """Wrap ``func`` so every call goes through this breaker.

        Raises (from the wrapper):
            Exception: If the circuit is OPEN and the recovery timeout has
                not elapsed. Exceptions raised by ``func`` are recorded as
                failures and re-raised unchanged.
        """
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            with self.lock:
                if self.state == State.OPEN:
                    if self._should_attempt_reset():
                        self.state = State.HALF_OPEN
                        # Every recovery probe starts counting from zero.
                        self.successes = 0
                    else:
                        raise Exception(f"Circuit OPEN. Next attempt in {self._time_until_reset()}s")
            try:
                result = func(*args, **kwargs)
            except Exception:
                self._record_failure()
                raise
            self._record_success()
            return result
        return wrapper

    def _should_attempt_reset(self) -> bool:
        """Return True once the recovery timeout has passed since the last failure."""
        if self.last_failure_time is None:
            # No failure was ever recorded (state was set from outside), so
            # there is nothing to wait for.
            return True
        return datetime.now() - self.last_failure_time > timedelta(seconds=self.recovery_timeout)

    def _time_until_reset(self) -> float:
        """Return the seconds left before a trial call is allowed (never negative)."""
        if self.last_failure_time is None:
            return 0.0
        elapsed = (datetime.now() - self.last_failure_time).total_seconds()
        return round(max(0.0, self.recovery_timeout - elapsed), 1)

    def _record_success(self):
        """Update counters and state after a successful call."""
        with self.lock:
            if self.state == State.HALF_OPEN:
                self.successes += 1
                if self.successes >= self.success_threshold:
                    self.state = State.CLOSED
                    self.failures = 0
                    self.successes = 0
            elif self.state == State.CLOSED:
                # Count consecutive failures only: a success clears the streak
                # so isolated errors spread over time never open the circuit.
                self.failures = 0

    def _record_failure(self):
        """Update counters and state after a failed call."""
        with self.lock:
            self.failures += 1
            self.last_failure_time = datetime.now()
            # A failed trial call re-opens the circuit immediately.
            if self.state == State.HALF_OPEN or self.failures >= self.failure_threshold:
                self.state = State.OPEN
                self.successes = 0