AI Video Interview Analysis System Soft Skills Non-Verbal Signals

We design and deploy artificial intelligence systems: from prototype to production-ready solutions. Our team combines expertise in machine learning, data engineering and MLOps to make AI work not in the lab, but in real business.
Showing 1 of 1 services. All 1566 services.
AI Video Interview Analysis System Soft Skills Non-Verbal Signals
Complex
~2-4 weeks
FAQ
AI Development Areas
AI Solution Development Stages
Latest works
  • image_web-applications_feedme_466_0.webp
    Development of a web application for FEEDME
    1161
  • image_ecommerce_furnoro_435_0.webp
    Development of an online store for the company FURNORO
    1041
  • image_logo-advance_0.png
    B2B Advance company logo design
    561
  • image_crm_enviok_479_0.webp
    Development of a web application for Enviok
    823
  • image_logo-aider_0.jpg
    AIDER company logo development
    762
  • image_crm_chasseurs_493_0.webp
    CRM development for Chasseurs
    848

AI-based video interview analysis system

The goal is to automatically analyze recorded or online video interviews with candidates: recognizing emotions, analyzing verbal responses, and identifying signs of stress or uncertainty. The system doesn't replace an HR specialist, but it helps prioritize candidates at a rate of up to 200 per hour.

System components

import cv2
import numpy as np
import torch
import whisper
from transformers import pipeline
import mediapipe as mp

class VideoInterviewAnalyzer:
    """Analyze a recorded candidate interview video end to end.

    Pipeline: transcribe speech (Whisper), classify text sentiment,
    detect facial emotions on ~1 frame per second (MediaPipe + an
    external classifier), derive speech metrics (pace, long pauses,
    filler words), and compile everything into one report dict with a
    simplified 0-100 score.

    The system emits signals for a human reviewer; it does not make
    hiring decisions.
    """

    def __init__(self, config: dict):
        """
        Args:
            config: must contain 'emotion_model' — an object exposing
                ``predict(frame) -> str`` (see :meth:`_load_emotion_model`).
        """
        # Speech -> text: Whisper large-v3.
        self.asr = whisper.load_model('large-v3')

        # Text sentiment analysis (Russian BERT); GPU when available.
        self.sentiment_analyzer = pipeline(
            'text-classification',
            model='blanchefort/rubert-base-cased-sentiment',
            device=0 if torch.cuda.is_available() else -1
        )

        # Facial emotions: MediaPipe landmarks gate the classifier —
        # frames without a detected face are labeled 'no_face'.
        self.face_mesh = mp.solutions.face_mesh.FaceMesh(
            max_num_faces=1, refine_landmarks=True
        )
        self.emotion_model = self._load_emotion_model(config['emotion_model'])

        # Speech analysis: pace, pauses, Russian filler ("parasite") words.
        self.filler_words_ru = ['э', 'ну', 'это', 'короче', 'типа', 'как бы']

    def _load_emotion_model(self, model_spec):
        """Resolve the facial-emotion classifier from config.

        Was referenced but never defined in the original class, so
        instantiation raised AttributeError. The pipeline only requires a
        ``predict(frame)`` method (see :meth:`_analyze_emotions`), so any
        object exposing it is accepted as-is.

        Raises:
            TypeError: if the spec does not expose ``predict``.
        """
        if hasattr(model_spec, 'predict'):
            return model_spec
        raise TypeError(
            "config['emotion_model'] must provide a predict(frame) method"
        )

    def analyze_video(self, video_path: str) -> dict:
        """Run the full analysis pipeline on one video file.

        Returns:
            The report dict produced by :meth:`_compile_report`.
        """
        cap = cv2.VideoCapture(video_path)
        try:
            fps = cap.get(cv2.CAP_PROP_FPS)
            # Broken/missing FPS metadata yields 0.0, which would crash
            # the modulo below; fall back to sampling every frame.
            step = int(fps) if fps and fps >= 1 else 1

            # Extract ~1 frame per second for emotion analysis.
            frames = []
            frame_idx = 0
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break
                if frame_idx % step == 0:
                    frames.append(frame)
                frame_idx += 1
        finally:
            # Release the capture even if decoding raises.
            cap.release()

        emotion_timeline = self._analyze_emotions(frames)
        transcript = self._transcribe_audio(video_path)
        # frame_idx already IS the total decoded-frame count; the
        # original passed fps * frame_idx, which is not a frame count.
        speech_features = self._analyze_speech(transcript, frame_idx)
        text_analysis = self._analyze_text(transcript.get('text', ''))

        return self._compile_report(emotion_timeline, transcript,
                                    speech_features, text_analysis)

    def _analyze_emotions(self, frames: list) -> list:
        """Classify the facial emotion on each sampled frame.

        Returns:
            Per-second timeline: [{'second': t, 'emotion': label}], with
            'no_face' when MediaPipe finds no face landmarks.
        """
        timeline = []
        for t, frame in enumerate(frames):
            # MediaPipe expects RGB; OpenCV decodes BGR.
            rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            results = self.face_mesh.process(rgb)

            if results.multi_face_landmarks:
                emotion = self.emotion_model.predict(frame)
                timeline.append({'second': t, 'emotion': emotion})
            else:
                timeline.append({'second': t, 'emotion': 'no_face'})

        return timeline

    def _transcribe_audio(self, video_path: str) -> dict:
        """Transcribe the audio track (Russian) with word-level timestamps.

        Returns:
            The raw Whisper result dict ('text', 'segments', ...).
        """
        return self.asr.transcribe(video_path, language='ru',
                                   word_timestamps=True)

    def _analyze_text(self, text: str) -> dict:
        """Sentiment-classify the full transcript text.

        Was referenced but never defined in the original class.

        Returns:
            The top pipeline prediction ({'label': ..., 'score': ...}),
            or {} for empty/blank input.
        """
        if not text or not text.strip():
            return {}
        # NOTE(review): transformers pipelines have a model-dependent
        # max input length; very long transcripts may need chunking —
        # confirm against the deployed model config.
        predictions = self.sentiment_analyzer(text)
        return predictions[0] if predictions else {}

    def _analyze_speech(self, transcript: dict,
                        total_frames: int) -> dict:
        """Derive pace / pause / filler-word metrics from word timestamps.

        Args:
            transcript: Whisper result with 'segments' -> 'words', each
                word a dict with 'word', 'start', 'end' (seconds).
            total_frames: total decoded frame count (currently unused;
                kept for interface compatibility).

        Returns:
            Metrics dict, or {} when the transcript has no words.
        """
        words = [w for seg in transcript.get('segments', [])
                 for w in seg.get('words', [])]

        if not words:
            return {}

        # Clamp to avoid division by zero on a degenerate 0.0 end stamp
        # (the original divided by words[-1]['end'] unguarded).
        total_duration = max(words[-1]['end'], 1e-6)
        words_per_minute = len(words) / (total_duration / 60)

        # Pauses longer than 2 seconds between consecutive words.
        long_pauses = []
        for i in range(1, len(words)):
            pause = words[i]['start'] - words[i - 1]['end']
            if pause > 2.0:
                long_pauses.append({'start': words[i - 1]['end'],
                                    'duration': pause})

        # Russian filler-word rate; len(words) >= 1 is guaranteed above,
        # so the original max(len(words), 1) guard was dead code.
        fillers = [w for w in words
                   if w['word'].strip().lower() in self.filler_words_ru]
        filler_rate = len(fillers) / len(words)

        return {
            'words_per_minute': words_per_minute,
            'long_pauses': long_pauses,
            'filler_rate': filler_rate,
            'total_words': len(words)
        }

    def _compile_report(self, emotions: list, transcript: dict,
                        speech: dict, text: dict) -> dict:
        """Aggregate all analyses into the final report dict."""
        # Emotion distribution over the sampled timeline.
        emotion_counts = {}
        for e in emotions:
            em = e.get('emotion', 'unknown')
            emotion_counts[em] = emotion_counts.get(em, 0) + 1

        dominant_emotion = (max(emotion_counts, key=emotion_counts.get)
                            if emotion_counts else None)

        # Simplified heuristic scoring, starting from a neutral 50.
        score = 50.0
        if speech.get('words_per_minute', 0) > 100:
            score += 10  # good speech pace
        if speech.get('filler_rate', 1) < 0.05:
            score += 10  # few filler words
        if dominant_emotion in ['happy', 'neutral']:
            score += 10
        if dominant_emotion in ['fear', 'disgust']:
            score -= 10
        if len(speech.get('long_pauses', [])) > 5:
            score -= 10

        return {
            'overall_score': min(100, max(0, score)),
            'emotion_distribution': emotion_counts,
            'dominant_emotion': dominant_emotion,
            'speech_metrics': speech,
            'transcript': transcript.get('text', ''),
            'text_sentiment': text,
            'recommendations': self._generate_recommendations(emotions,
                                                              speech, text)
        }

    def _generate_recommendations(self, emotions: list, speech: dict,
                                  text: dict) -> list:
        """Produce human-readable review hints from the computed metrics.

        Was referenced but never defined in the original class, so every
        call to _compile_report raised AttributeError. Thresholds mirror
        the scoring heuristics above.
        """
        recommendations = []
        wpm = speech.get('words_per_minute')
        if wpm is not None and wpm < 90:
            recommendations.append(
                'Slow speech pace detected - consider checking engagement.')
        if speech.get('filler_rate', 0) > 0.1:
            recommendations.append(
                'High filler-word rate - possible sign of nervousness.')
        if len(speech.get('long_pauses', [])) > 5:
            recommendations.append(
                'Many long pauses - answers may need a follow-up interview.')
        no_face = sum(1 for e in emotions if e.get('emotion') == 'no_face')
        if emotions and no_face / len(emotions) > 0.3:
            recommendations.append(
                'Face not visible for much of the video - check recording quality.')
        return recommendations

Ethical limitations and accuracy

Important to understand: emotion analysis systems have an accuracy rate of 65–75% in real-world conditions (JAFFE dataset: up to 92%, but these are laboratory conditions). Nervousness during an interview ≠ incompetence. The system provides signals, not solutions.

Parameter Accuracy
Recognizing the 7 Basic Emotions 68–78%
ASR (Whisper large-v3, Russian) WER 8–12%
Sentiment analysis of text F1 0.81–0.87
Detection of filler words > 95%

Integration with ATS/HRM

The system outputs structured JSON: score, transcript, emotional profile, and speech metrics. It integrates via a REST API into any ATS: Huntflow, Potok, and SAP SuccessFactors.

Scale — Estimated term
Basic Analysis (ASR + Emotions) 4–6 weeks
A complete system with scoring and ATS integration 8–14 weeks