AI-based video interview analysis system
The goal is to automatically analyze recorded or live video interviews with candidates: recognizing emotions, analyzing verbal responses, and identifying signs of stress or uncertainty. The system does not replace an HR specialist; it helps prioritize candidates, processing up to 200 interviews per hour.
System components
import cv2
import numpy as np
import torch
import whisper
from transformers import pipeline
import mediapipe as mp
class VideoInterviewAnalyzer:
    """Analyzes recorded interview videos: facial emotions, speech-to-text
    transcription, speech-delivery metrics, and text sentiment."""

    def __init__(self, config: dict):
        """Load every model used by the pipeline.

        Args:
            config: must contain 'emotion_model' — identifier/path of the
                facial-emotion classifier loaded by _load_emotion_model.
        """
        # Speech -> text: Whisper large-v3
        self.asr = whisper.load_model('large-v3')
        # Text sentiment (Russian BERT); use GPU 0 when CUDA is available
        sentiment_device = 0 if torch.cuda.is_available() else -1
        self.sentiment_analyzer = pipeline(
            'text-classification',
            model='blanchefort/rubert-base-cased-sentiment',
            device=sentiment_device
        )
        # Facial emotions: MediaPipe landmarks + a classifier on top
        self.face_mesh = mp.solutions.face_mesh.FaceMesh(
            max_num_faces=1, refine_landmarks=True
        )
        self.emotion_model = self._load_emotion_model(config['emotion_model'])
        # Speech analysis: tempo, pauses, Russian filler words ("um", "like", ...)
        self.filler_words_ru = ['э', 'ну', 'это', 'короче', 'типа', 'как бы']
def analyze_video(self, video_path: str) -> dict:
cap = cv2.VideoCapture(video_path)
fps = cap.get(cv2.CAP_PROP_FPS)
frames = []
audio_data = []
# Извлечение кадров (1 в секунду для эмоций)
frame_idx = 0
while cap.isOpened():
ret, frame = cap.read()
if not ret:
break
if frame_idx % int(fps) == 0: # 1 fps
frames.append(frame)
frame_idx += 1
cap.release()
# Параллельный анализ
emotion_timeline = self._analyze_emotions(frames)
transcript = self._transcribe_audio(video_path)
speech_features = self._analyze_speech(transcript, fps * frame_idx)
text_analysis = self._analyze_text(transcript['text'])
return self._compile_report(emotion_timeline, transcript,
speech_features, text_analysis)
def _analyze_emotions(self, frames: list) -> list:
timeline = []
for t, frame in enumerate(frames):
rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
results = self.face_mesh.process(rgb)
if results.multi_face_landmarks:
emotion = self.emotion_model.predict(frame)
timeline.append({'second': t, 'emotion': emotion})
else:
timeline.append({'second': t, 'emotion': 'no_face'})
return timeline
def _transcribe_audio(self, video_path: str) -> dict:
result = self.asr.transcribe(video_path, language='ru',
word_timestamps=True)
return result
def _analyze_speech(self, transcript: dict,
total_frames: int) -> dict:
words = [w for seg in transcript.get('segments', [])
for w in seg.get('words', [])]
if not words:
return {}
total_duration = words[-1]['end'] if words else 1.0
words_per_minute = len(words) / (total_duration / 60)
# Паузы > 2 сек
long_pauses = []
for i in range(1, len(words)):
pause = words[i]['start'] - words[i-1]['end']
if pause > 2.0:
long_pauses.append({'start': words[i-1]['end'],
'duration': pause})
# Слова-паразиты
fillers = [w for w in words
if w['word'].strip().lower() in self.filler_words_ru]
filler_rate = len(fillers) / max(len(words), 1)
return {
'words_per_minute': words_per_minute,
'long_pauses': long_pauses,
'filler_rate': filler_rate,
'total_words': len(words)
}
def _compile_report(self, emotions: list, transcript: dict,
speech: dict, text: dict) -> dict:
# Распределение эмоций
emotion_counts = {}
for e in emotions:
em = e.get('emotion', 'unknown')
emotion_counts[em] = emotion_counts.get(em, 0) + 1
dominant_emotion = max(emotion_counts,
key=emotion_counts.get) if emotion_counts else None
# Скоринг (упрощённый)
score = 50.0
if speech.get('words_per_minute', 0) > 100:
score += 10 # хороший темп речи
if speech.get('filler_rate', 1) < 0.05:
score += 10 # мало слов-паразитов
if dominant_emotion in ['happy', 'neutral']:
score += 10
if dominant_emotion in ['fear', 'disgust']:
score -= 10
if len(speech.get('long_pauses', [])) > 5:
score -= 10
return {
'overall_score': min(100, max(0, score)),
'emotion_distribution': emotion_counts,
'dominant_emotion': dominant_emotion,
'speech_metrics': speech,
'transcript': transcript.get('text', ''),
'text_sentiment': text,
'recommendations': self._generate_recommendations(emotions,
speech, text)
}
Ethical limitations and accuracy
Important to understand: emotion analysis systems achieve 65–75% accuracy in real-world conditions (up to 92% on the JAFFE dataset, but under laboratory conditions). Nervousness during an interview does not equal incompetence. The system provides signals, not decisions.
| Parameter | Accuracy |
|---|---|
| Recognizing the 7 Basic Emotions | 68–78% |
| ASR (Whisper large-v3, Russian) | WER 8–12% |
| Sentiment analysis of text | F1 0.81–0.87 |
| Detection of filler words | > 95% |
Integration with ATS/HRM
The system outputs structured JSON: score, transcript, emotional profile, and speech metrics. It integrates via a REST API with any ATS, such as Huntflow, Potok, or SAP SuccessFactors.
| Scope | Timeline |
|---|---|
| Basic Analysis (ASR + Emotions) | 4–6 weeks |
| A complete system with scoring and ATS integration | 8–14 weeks |







