Development of an A/B testing system for AI agents
A/B testing of AI agents is a controlled comparison of two agent versions on real traffic. Because AI quality metrics (hallucination rate, task success rate, user satisfaction) are noisy, a statistically significant number of samples is required before drawing conclusions.
A/B Experiment Design for Agents
from dataclasses import dataclass
from datetime import datetime, timezone
from enum import Enum
class ExperimentStatus(str, Enum):
    """Lifecycle state of an A/B experiment.

    Inherits from ``str`` so members compare equal to their raw values
    (e.g. ``ExperimentStatus.RUNNING == "running"``) and serialize cleanly.
    """

    DRAFT = "draft"          # defined but not yet receiving traffic
    RUNNING = "running"      # actively splitting traffic
    COMPLETED = "completed"  # finished normally
    STOPPED = "stopped"      # halted early (e.g. by a guardrail)
@dataclass
class AgentExperiment:
    """Configuration for one controlled comparison of two agent versions."""

    experiment_id: str
    agent_name: str
    control_version: str        # current production version
    treatment_version: str      # candidate version under test
    traffic_split: float        # fraction routed to treatment, e.g. 0.1 = 10%
    hypothesis: str             # what we expect the treatment to improve
    primary_metric: str         # task_success_rate / quality_score / latency
    secondary_metrics: list[str]
    min_samples: int            # minimum per-arm sample size (typically 200-500)
    max_duration_days: int
    status: ExperimentStatus = ExperimentStatus.DRAFT
Routing by Experiments
import hashlib
import random
class ExperimentRouter:
def __init__(self, experiments: list[AgentExperiment]):
self.experiments = {e.experiment_id: e for e in experiments
if e.status == ExperimentStatus.RUNNING}
def get_variant(self, agent_name: str, user_id: str) -> tuple[str, str | None]:
"""
Returns: (version_to_use, experiment_id_if_any)
Использует consistent hashing: один пользователь всегда в одной группе.
"""
active = [e for e in self.experiments.values() if e.agent_name == agent_name]
if not active:
return "latest", None
experiment = active[0]
# Consistent хэшинг по user_id + experiment_id
hash_input = f"{user_id}:{experiment.experiment_id}"
hash_value = int(hashlib.md5(hash_input.encode()).hexdigest(), 16)
bucket = (hash_value % 1000) / 1000.0 # 0.0 - 1.0
if bucket < experiment.traffic_split:
return experiment.treatment_version, experiment.experiment_id
else:
return experiment.control_version, experiment.experiment_id
Recording Experiment Results
class ExperimentTracker:
    """Persists per-task metric observations for a running experiment."""

    def record_result(
        self,
        experiment_id: str,
        variant: str,            # "control" / "treatment"
        task_id: str,
        metrics: dict            # task_success, quality_score, latency_ms, etc.
    ) -> None:
        """Insert one result row for (experiment, variant, task).

        Metric keys are spread into the row as top-level columns; since
        ``**metrics`` comes last, a metric named like a reserved column
        (e.g. "variant") would overwrite it — callers must avoid reserved keys.
        """
        # Fix: `datetime` was never imported in this module (NameError at
        # runtime), and naive utcnow() is deprecated — record a
        # timezone-aware UTC timestamp instead.
        self.db.insert({
            "experiment_id": experiment_id,
            "variant": variant,
            "task_id": task_id,
            "recorded_at": datetime.now(timezone.utc),
            **metrics
        })
Statistical analysis
from scipy import stats
import numpy as np
class ExperimentAnalyzer:
    """Statistical comparison of control vs. treatment on the primary metric."""

    def analyze(self, experiment: AgentExperiment) -> "ExperimentResults":
        """Run a significance test appropriate to the primary metric.

        Proportion metrics (task_success_rate, completion_rate) get a pooled
        two-proportion z-test; continuous metrics (latency, quality_score)
        get an independent-samples t-test.

        Raises:
            ValueError: if either arm has no recorded samples.
        """
        control_data = self.db.get_results(experiment.experiment_id, "control")
        treatment_data = self.db.get_results(experiment.experiment_id, "treatment")
        primary = experiment.primary_metric
        control_values = [r[primary] for r in control_data]
        treatment_values = [r[primary] for r in treatment_data]
        # Guard: np.mean([]) is NaN and the z-test divides by the arm sizes.
        if not control_values or not treatment_values:
            raise ValueError(
                f"experiment {experiment.experiment_id}: both arms need at "
                f"least one sample to analyze {primary!r}"
            )
        if primary in ("task_success_rate", "completion_rate"):
            # Fix: scipy.stats has no `proportions_ztest` (that function lives
            # in statsmodels.stats.proportion) — the original raised
            # AttributeError. Compute the pooled two-proportion z-test
            # directly with scipy's normal distribution instead.
            # Assumes the per-task values are binary 0/1 outcomes.
            n_control = len(control_values)
            n_treatment = len(treatment_values)
            successes_control = sum(control_values)
            successes_treatment = sum(treatment_values)
            p_pooled = (successes_control + successes_treatment) / (n_control + n_treatment)
            std_err = np.sqrt(
                p_pooled * (1.0 - p_pooled) * (1.0 / n_control + 1.0 / n_treatment)
            )
            if std_err == 0.0:
                # All outcomes identical in both arms: no detectable difference.
                p_value = 1.0
            else:
                z_stat = (
                    successes_treatment / n_treatment - successes_control / n_control
                ) / std_err
                p_value = 2.0 * stats.norm.sf(abs(z_stat))  # two-sided
        else:
            # T-test for continuous metrics (latency, quality_score).
            _t_stat, p_value = stats.ttest_ind(control_values, treatment_values)
        control_mean = float(np.mean(control_values))
        treatment_mean = float(np.mean(treatment_values))
        # Guard against division by zero when the control mean is exactly 0.
        lift = (treatment_mean - control_mean) / control_mean if control_mean != 0 else 0.0
        return ExperimentResults(
            control_mean=control_mean,
            treatment_mean=treatment_mean,
            lift=lift,
            p_value=p_value,
            is_significant=p_value < 0.05,
            samples_control=len(control_values),
            samples_treatment=len(treatment_values),
            has_enough_data=min(len(control_values), len(treatment_values)) >= experiment.min_samples,
            # ship: significant improvement; rollback: significant regression;
            # no_change: not significant (keep collecting or stop).
            recommendation=(
                "ship" if p_value < 0.05 and lift > 0
                else "no_change" if p_value >= 0.05
                else "rollback"
            )
        )
Auto-stop rules
class ExperimentGuardrails:
    """Auto-stop rules evaluated against interim experiment results."""

    def check(self, experiment: AgentExperiment, results: ExperimentResults) -> Action:
        """Decide whether to continue, ship the treatment, or stop and roll back."""
        # Stop if treatment is significantly worse (more than 5% degradation).
        if results.is_significant and results.lift < -0.05:
            return Action.STOP_AND_ROLLBACK
        # Stop on a critical error spike: treatment errors more than doubled.
        # NOTE(review): `treatment_error_rate` / `control_error_rate` are read
        # here but not visibly produced by the analyzer — confirm the results
        # schema includes them.
        doubled_baseline = results.control_error_rate * 2
        if results.treatment_error_rate > doubled_baseline:
            return Action.STOP_AND_ROLLBACK
        # Ship once enough data confirms a significant improvement.
        enough_and_better = (
            results.has_enough_data and results.is_significant and results.lift > 0
        )
        return Action.SHIP_TREATMENT if enough_and_better else Action.CONTINUE







