AI Agent A/B Testing System Development

We design and deploy artificial intelligence systems: from prototype to production-ready solutions. Our team combines expertise in machine learning, data engineering and MLOps to make AI work not in the lab, but in real business.
Showing 1 of 1 services. All 1566 services.
AI Agent A/B Testing System Development
Medium
from 1 week to 3 months
FAQ
AI Development Areas
AI Solution Development Stages
Latest works
  • image_web-applications_feedme_466_0.webp
    Development of a web application for FEEDME
    1161
  • image_ecommerce_furnoro_435_0.webp
    Development of an online store for the company FURNORO
    1041
  • image_logo-advance_0.png
    B2B Advance company logo design
    561
  • image_crm_enviok_479_0.webp
    Development of a web application for Enviok
    823
  • image_logo-aider_0.jpg
    AIDER company logo development
    762
  • image_crm_chasseurs_493_0.webp
    CRM development for Chasseurs
    848

Development of an A/B testing system for AI agents

A/B testing of AI agents is a controlled comparison of two agent versions using real traffic. AI quality metrics (hallucination rate, task success, user satisfaction) require a statistically significant number of samples for inference.

A/B Experiment Design for Agents

from dataclasses import dataclass
from datetime import datetime, timezone
from enum import Enum

class ExperimentStatus(str, Enum):
    """Lifecycle state of an A/B experiment."""

    DRAFT = "draft"          # defined but not yet receiving traffic
    RUNNING = "running"      # actively splitting traffic between variants
    COMPLETED = "completed"  # finished normally
    STOPPED = "stopped"      # halted before completion

@dataclass
class AgentExperiment:
    """Configuration of one A/B experiment comparing two agent versions."""

    experiment_id: str
    agent_name: str
    control_version: str      # current prod version
    treatment_version: str    # new candidate version
    traffic_split: float      # fraction routed to treatment, e.g. 0.1 = 10%
    hypothesis: str           # what we expect to improve
    primary_metric: str       # task_success_rate / quality_score / latency
    secondary_metrics: list[str]
    min_samples: int          # minimum per-variant sample count for statistics (typically 200-500)
    max_duration_days: int
    status: ExperimentStatus = ExperimentStatus.DRAFT

Routing by Experiments

import hashlib
import random

class ExperimentRouter:
    """Routes requests to control or treatment versions of an agent.

    Only experiments in RUNNING status participate in routing. Bucketing
    is deterministic per (user_id, experiment_id), so a given user stays
    in the same group for the lifetime of an experiment.
    """

    def __init__(self, experiments: list[AgentExperiment]):
        # Index only the running experiments by their id.
        running = (e for e in experiments if e.status == ExperimentStatus.RUNNING)
        self.experiments = {exp.experiment_id: exp for exp in running}

    def get_variant(self, agent_name: str, user_id: str) -> tuple[str, str | None]:
        """
        Returns: (version_to_use, experiment_id_if_any)
        Uses consistent hashing: one user always lands in the same group.
        """
        matching = [
            exp for exp in self.experiments.values()
            if exp.agent_name == agent_name
        ]
        if not matching:
            return "latest", None

        exp = matching[0]

        # Deterministic bucket in [0.0, 1.0) from user_id + experiment_id.
        digest = hashlib.md5(f"{user_id}:{exp.experiment_id}".encode()).hexdigest()
        bucket = (int(digest, 16) % 1000) / 1000.0

        if bucket < exp.traffic_split:
            return exp.treatment_version, exp.experiment_id
        return exp.control_version, exp.experiment_id

Recording Experiment Results

class ExperimentTracker:
    """Persists per-task experiment results for later statistical analysis."""

    def record_result(
        self,
        experiment_id: str,
        variant: str,        # "control" / "treatment"
        task_id: str,
        metrics: dict        # task_success, quality_score, latency_ms, etc.
    ) -> None:
        """Insert one result row into the results store (self.db).

        Metric keys are flattened into the row next to the experiment
        metadata, so each metric becomes its own column/field.
        """
        self.db.insert({
            "experiment_id": experiment_id,
            "variant": variant,
            "task_id": task_id,
            # Timezone-aware UTC timestamp; datetime.utcnow() is deprecated
            # (since Python 3.12) and returns a naive datetime.
            "recorded_at": datetime.now(timezone.utc),
            **metrics
        })

Statistical Analysis

from scipy import stats
import numpy as np

class ExperimentAnalyzer:
    """Statistical comparison of control vs treatment on the primary metric.

    Uses a pooled two-proportion z-test for 0/1 success metrics and an
    independent-samples t-test for continuous metrics (latency,
    quality_score).
    """

    def analyze(self, experiment: "AgentExperiment") -> "ExperimentResults":
        """Analyze one experiment and return an ExperimentResults summary.

        Returns means, relative lift, two-sided p-value, a significance
        flag (alpha = 0.05), sample counts, and a ship/rollback
        recommendation.
        """
        control_data = self.db.get_results(experiment.experiment_id, "control")
        treatment_data = self.db.get_results(experiment.experiment_id, "treatment")

        primary = experiment.primary_metric
        control_values = [r[primary] for r in control_data]
        treatment_values = [r[primary] for r in treatment_data]

        if primary in ("task_success_rate", "completion_rate"):
            # Z-test for proportions. NOTE: the previous code called
            # stats.proportions_ztest, which does not exist in scipy.stats
            # (it lives in statsmodels.stats.proportion) and raised
            # AttributeError; compute the pooled two-proportion z-test
            # directly instead.
            p_value = self._proportions_ztest_pvalue(control_values, treatment_values)
        else:
            # T-test for continuous metrics (latency, quality_score).
            _t_stat, p_value = stats.ttest_ind(control_values, treatment_values)

        control_mean = float(np.mean(control_values)) if control_values else 0.0
        treatment_mean = float(np.mean(treatment_values)) if treatment_values else 0.0
        # Relative lift is undefined when the control mean is 0; report 0.0
        # rather than dividing by zero.
        lift = (treatment_mean - control_mean) / control_mean if control_mean else 0.0

        return ExperimentResults(
            control_mean=control_mean,
            treatment_mean=treatment_mean,
            lift=lift,
            p_value=p_value,
            is_significant=p_value < 0.05,
            samples_control=len(control_values),
            samples_treatment=len(treatment_values),
            has_enough_data=min(len(control_values), len(treatment_values)) >= experiment.min_samples,
            recommendation="ship" if p_value < 0.05 and lift > 0 else "no_change" if p_value >= 0.05 else "rollback"
        )

    @staticmethod
    def _proportions_ztest_pvalue(control_values: list, treatment_values: list) -> float:
        """Two-sided p-value of a pooled two-proportion z-test on 0/1 samples."""
        n_control = len(control_values)
        n_treatment = len(treatment_values)
        successes_control = sum(control_values)
        successes_treatment = sum(treatment_values)

        p_pooled = (successes_control + successes_treatment) / (n_control + n_treatment)
        se = np.sqrt(p_pooled * (1 - p_pooled) * (1 / n_control + 1 / n_treatment))
        if se == 0:
            # Degenerate case: all-zero or all-one in both groups — no
            # evidence of any difference.
            return 1.0
        z = (successes_treatment / n_treatment - successes_control / n_control) / se
        # Two-sided p-value via the standard normal survival function.
        return float(2 * stats.norm.sf(abs(z)))

Auto-Stop Rules

class ExperimentGuardrails:
    """Auto-stop rules evaluated against the latest experiment results."""

    def check(self, experiment: AgentExperiment, results: ExperimentResults) -> Action:
        """Decide whether to continue, ship the treatment, or roll back.

        Rules are evaluated in order; the first one that fires wins.
        """
        rules = (
            # Stop if treatment is significantly worse (> 5% degradation).
            (lambda: results.is_significant and results.lift < -0.05,
             Action.STOP_AND_ROLLBACK),
            # Stop on a critical error spike: errors at least doubled vs control.
            (lambda: results.treatment_error_rate > results.control_error_rate * 2,
             Action.STOP_AND_ROLLBACK),
            # Finish once enough data has accumulated and the lift is positive.
            (lambda: results.has_enough_data and results.is_significant and results.lift > 0,
             Action.SHIP_TREATMENT),
        )
        for triggered, action in rules:
            if triggered():
                return action
        return Action.CONTINUE