AI Video Processing and Understanding Implementation

We design and deploy artificial intelligence systems: from prototype to production-ready solutions. Our team combines expertise in machine learning, data engineering and MLOps to make AI work not in the lab, but in real business.
Showing 1 of 1 services (1566 services total)
AI Video Processing and Understanding Implementation
Complex
~1-2 weeks
FAQ
AI Development Areas
AI Solution Development Stages
Latest works
  • image_web-applications_feedme_466_0.webp
    Development of a web application for FEEDME
    1161
  • image_ecommerce_furnoro_435_0.webp
    Development of an online store for the company FURNORO
    1041
  • image_logo-advance_0.png
    B2B Advance company logo design
    561
  • image_crm_enviok_479_0.webp
    Development of a web application for Enviok
    823
  • image_logo-aider_0.jpg
    AIDER company logo development
    762
  • image_crm_chasseurs_493_0.webp
    CRM development for Chasseurs
    848

AI system for understanding and processing video

Understanding video is a more complex task than processing individual frames. Video is a spatiotemporal volume of data: objects move, scenes change, events unfold over time. Answering the question "What's going on in this video?" requires temporal reasoning—understanding the sequence of events.

Video understanding system architecture

import torch
import numpy as np
import cv2
from transformers import AutoProcessor, AutoModelForVideoClassification

class VideoUnderstandingPipeline:
    """Clip extraction + per-clip action recognition over a video file.

    Combines a VideoMAE action-recognition model (fp16 on GPU) with an
    externally loaded video LLM for long-form understanding.
    """

    def __init__(self, config: dict):
        # Video action recognition: VideoMAE (or TimeSformer) fine-tuned
        # on Kinetics.
        self.action_model = AutoModelForVideoClassification.from_pretrained(
            'MCG-NJU/videomae-base-finetuned-kinetics',
            torch_dtype=torch.float16
        ).cuda()
        self.action_processor = AutoProcessor.from_pretrained(
            'MCG-NJU/videomae-base-finetuned-kinetics'
        )

        # For long videos: LLaVA-Video or Video-LLaMA.
        # NOTE(review): _load_video_llm is not defined in this snippet;
        # it must be provided elsewhere in the class.
        self.vlm_model = self._load_video_llm(config.get('vlm_model'))

        self.clip_duration = config.get('clip_duration', 16)  # frames per clip
        self.fps_sample = config.get('fps_sample', 8)  # sampling fps for analysis

    def extract_clips(self, video_path: str) -> list[np.ndarray]:
        """Split a video into fixed-length, 50%-overlapping frame clips.

        Frames are sampled at roughly ``self.fps_sample`` fps, converted
        BGR->RGB, and grouped into clips of ``self.clip_duration`` frames.
        A trailing partial clip (fewer than clip_duration frames) is
        discarded, matching the original behavior.
        """
        cap = cv2.VideoCapture(video_path)
        try:
            original_fps = cap.get(cv2.CAP_PROP_FPS)
            # Take every k-th frame so the effective rate is ~fps_sample.
            sample_interval = max(1, int(original_fps / self.fps_sample))

            clips: list[np.ndarray] = []
            current_clip: list[np.ndarray] = []

            frame_idx = 0
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break

                if frame_idx % sample_interval == 0:
                    current_clip.append(
                        cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    )

                    if len(current_clip) == self.clip_duration:
                        clips.append(np.array(current_clip))
                        # Keep the second half so consecutive clips
                        # overlap by 50%.
                        current_clip = current_clip[self.clip_duration // 2:]

                frame_idx += 1
        finally:
            # BUGFIX: release the capture even if decoding raises
            # mid-stream (original leaked the VideoCapture on error).
            cap.release()
        return clips

    @torch.no_grad()
    def classify_actions(self, clips: list[np.ndarray]) -> list[dict]:
        """Run top-5 action classification on each clip.

        Returns one dict per clip with the clip index, its start offset
        expressed in *sampled* frames, and the five most likely action
        labels with probabilities.
        """
        results = []
        for i, clip in enumerate(clips):
            inputs = self.action_processor(
                list(clip), return_tensors='pt'
            ).to('cuda')

            outputs = self.action_model(**inputs)
            probs = torch.softmax(outputs.logits, dim=-1)[0]
            top5_probs, top5_ids = probs.topk(5)

            results.append({
                'clip_idx': i,
                # Clips overlap by 50%, so clip i starts clip_duration//2
                # sampled frames after clip i-1.
                'start_frame': i * self.clip_duration // 2,
                'actions': [
                    {
                        'label': self.action_model.config.id2label[idx.item()],
                        'probability': prob.item()
                    }
                    for prob, idx in zip(top5_probs, top5_ids)
                ]
            })

        return results

Video Search: Semantic Search in Video Archives

import faiss
from transformers import CLIPProcessor, CLIPModel

class VideoSemanticSearch:
    """
    CLIP frame embeddings -> FAISS index -> text queries.

    A fast way to find "the moment where a person falls" in a
    1000-hour archive.
    """
    def __init__(self):
        self.clip_model = CLIPModel.from_pretrained(
            'openai/clip-vit-large-patch14'
        ).cuda()
        self.clip_processor = CLIPProcessor.from_pretrained(
            'openai/clip-vit-large-patch14'
        )

        # Inner product over unit-norm vectors == cosine similarity.
        self.index = faiss.IndexFlatIP(768)  # CLIP ViT-L/14 dim = 768
        self.frame_metadata = []  # parallel list of (video_id, timestamp_sec)

    @torch.no_grad()
    def index_video(self, video_path: str, video_id: str,
                     sample_every_n: int = 30):
        """Embed every n-th frame of a video and add it to the index.

        Frames are batched (32 at a time) through CLIP; each indexed
        vector records (video_id, timestamp in seconds).
        """
        cap = cv2.VideoCapture(video_path)
        try:
            fps = cap.get(cv2.CAP_PROP_FPS)
            frame_idx = 0
            batch_frames = []
            batch_meta = []

            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break

                if frame_idx % sample_every_n == 0:
                    # BUGFIX: original called Image.fromarray without ever
                    # importing PIL (NameError at runtime). CLIPProcessor
                    # accepts HxWxC RGB numpy arrays directly.
                    batch_frames.append(
                        cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    )
                    batch_meta.append((video_id, frame_idx / fps))

                    if len(batch_frames) >= 32:
                        self._process_batch(batch_frames, batch_meta)
                        batch_frames, batch_meta = [], []

                frame_idx += 1

            # Flush the trailing partial batch.
            if batch_frames:
                self._process_batch(batch_frames, batch_meta)
        finally:
            # Release the capture even if embedding raises mid-stream.
            cap.release()

    def _process_batch(self, frames: list, meta: list):
        """Embed a batch of RGB frames and append them to the index."""
        inputs = self.clip_processor(
            images=frames, return_tensors='pt', padding=True
        ).to('cuda')
        embs = self.clip_model.get_image_features(**inputs)
        # L2-normalize once; the original also ran faiss.normalize_L2 on
        # the already-unit-norm vectors, which was a numeric no-op.
        embs = embs / embs.norm(dim=-1, keepdim=True)
        self.index.add(embs.cpu().float().numpy())
        self.frame_metadata.extend(meta)

    @torch.no_grad()
    def search(self, query: str, top_k: int = 10) -> list[dict]:
        """Return the top_k indexed frames most similar to a text query.

        Each hit is {'video_id', 'timestamp_sec', 'score'} where score is
        the cosine similarity between the text and frame embeddings.
        """
        inputs = self.clip_processor(
            text=[query], return_tensors='pt', padding=True
        ).to('cuda')
        text_emb = self.clip_model.get_text_features(**inputs)
        text_emb = text_emb / text_emb.norm(dim=-1, keepdim=True)
        text_np = text_emb.cpu().float().numpy()

        scores, indices = self.index.search(text_np, top_k)

        results = []
        for score, idx in zip(scores[0], indices[0]):
            # BUGFIX: FAISS pads with -1 when the index holds fewer than
            # top_k vectors; the original silently returned the *last*
            # metadata entry for those slots.
            if idx < 0:
                continue
            video_id, timestamp = self.frame_metadata[idx]
            results.append({
                'video_id': video_id,
                'timestamp_sec': timestamp,
                'score': float(score)
            })
        return results

Temporal Reasoning: Video-LLM for Complex Queries

class VideoLLMAnalyzer:
    """
    Video-LLaVA, LLaVA-Video, or Qwen2-VL with video input.

    For questions such as "what happens at the end of the video?" or
    "how many times did the person look at the camera?"
    """
    def __init__(self):
        # Qwen2-VL supports video input of up to 256 frames.
        # Local import keeps the heavy transformers class off the module
        # import path until an analyzer is actually constructed.
        from transformers import Qwen2VLForConditionalGeneration
        self.model = Qwen2VLForConditionalGeneration.from_pretrained(
            'Qwen/Qwen2-VL-7B-Instruct',
            torch_dtype=torch.bfloat16,
            device_map='auto'  # shard across available devices
        )

    def query_video(self, video_path: str, question: str) -> str:
        """Answer a free-form question about the video at video_path.

        NOTE(review): _sample_frames and _generate are not defined in this
        snippet — they must be implemented elsewhere in the class.
        """
        # Sample at most 32 frames, uniformly spaced over the video.
        frames = self._sample_frames(video_path, n=32)
        # Build the prompt with video tokens and run generation.
        response = self._generate(frames, question)
        return response

Performance and optimization

Video generates a huge amount of data. 24 hours of recording at 30 fps = 2.6M frames. Processing every frame is inefficient:

  • Motion-based sampling: we process only frames with motion (background subtraction as a filter)
  • Adaptive sampling: 8 fps is enough for action recognition; 15 fps for object detection
  • Hierarchical indexing: first scene-level (what is happening in the scene), then frame-level
| Task | Model | Latency/frame |
|---|---|---|
| Action recognition (16 frames) | VideoMAE-Base | 45 ms |
| Semantic search (CLIP indexing) | CLIP ViT-L/14 | 8 ms |
| Video QA | Qwen2-VL-7B | 1.2 sec/clip |
| Object tracking (entire video stream) | YOLOv8 + ByteTrack | 20 ms |
| Project type | Timeline |
|---|---|
| Action recognition system | 4–7 weeks |
| Semantic search in video archives | 5–8 weeks |
| A complete video understanding platform | 10–18 weeks |