AI system for understanding and processing video
Understanding video is a more complex task than processing individual frames. Video is a spatiotemporal volume of data: objects move, scenes change, events unfold over time. "What's going on in this video?" requires temporal reasoning—understanding the sequence of events.
Video understanding system architecture
import torch
import numpy as np
import cv2
from transformers import AutoProcessor, AutoModelForVideoClassification
class VideoUnderstandingPipeline:
    """Clip-level action recognition plus a video LLM for long-form analysis.

    Splits a video into overlapping fixed-length clips, classifies each clip
    with a VideoMAE model fine-tuned on Kinetics, and holds a video LLM for
    free-form questions about longer footage.
    """

    # Action-recognition checkpoint (VideoMAE; TimeSformer also works here).
    ACTION_MODEL_ID = 'MCG-NJU/videomae-base-finetuned-kinetics'

    def __init__(self, config: dict):
        """Load models and read sampling parameters from ``config``.

        Recognized config keys:
            vlm_model:     identifier forwarded to ``_load_video_llm``.
            clip_duration: frames per clip (default 16).
            fps_sample:    sampling rate for analysis, in fps (default 8).
        """
        # Fall back to CPU so construction does not crash on GPU-less hosts
        # (the original unconditionally called .cuda()).
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.action_model = AutoModelForVideoClassification.from_pretrained(
            self.ACTION_MODEL_ID,
            # fp16 is only safe on GPU; use fp32 on CPU.
            torch_dtype=torch.float16 if self.device == 'cuda' else torch.float32,
        ).to(self.device)
        self.action_processor = AutoProcessor.from_pretrained(self.ACTION_MODEL_ID)
        # For long videos: LLaVA-Video or Video-LLaMA.
        # NOTE(review): _load_video_llm is not defined anywhere in this class —
        # it must be supplied (subclass or added method) before __init__ can run.
        self.vlm_model = self._load_video_llm(config.get('vlm_model'))
        self.clip_duration = config.get('clip_duration', 16)  # frames per clip
        self.fps_sample = config.get('fps_sample', 8)  # analysis rate, fps

    def extract_clips(self, video_path: str) -> list[np.ndarray]:
        """Cut the video into 50%-overlapping clips for action recognition.

        Frames are subsampled to roughly ``fps_sample`` fps and converted to
        RGB.  A trailing partial clip (fewer than ``clip_duration`` sampled
        frames) is discarded.
        """
        cap = cv2.VideoCapture(video_path)
        try:
            original_fps = cap.get(cv2.CAP_PROP_FPS)
            sample_interval = max(1, int(original_fps / self.fps_sample))
            clips: list[np.ndarray] = []
            current_clip: list[np.ndarray] = []
            frame_idx = 0
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break
                if frame_idx % sample_interval == 0:
                    # OpenCV decodes BGR; the models expect RGB.
                    current_clip.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                if len(current_clip) == self.clip_duration:
                    clips.append(np.array(current_clip))
                    # Keep the second half so consecutive clips overlap 50%.
                    current_clip = current_clip[self.clip_duration // 2:]
                frame_idx += 1
        finally:
            # Release the capture even if decoding raises mid-stream.
            cap.release()
        return clips

    @torch.no_grad()
    def classify_actions(self, clips: list[np.ndarray]) -> list[dict]:
        """Run top-5 action classification on each clip.

        Returns one dict per clip with the clip index, its starting offset
        (counted in *sampled* frames — clips stride by clip_duration // 2,
        not in original video frames), and the five most probable labels.
        """
        results = []
        for i, clip in enumerate(clips):
            inputs = self.action_processor(
                list(clip), return_tensors='pt'
            ).to(self.device)
            outputs = self.action_model(**inputs)
            probs = torch.softmax(outputs.logits, dim=-1)[0]
            top5_probs, top5_ids = probs.topk(5)
            results.append({
                'clip_idx': i,
                # Offset in sampled-frame units (50% overlap stride).
                'start_frame': i * self.clip_duration // 2,
                'actions': [
                    {
                        'label': self.action_model.config.id2label[idx.item()],
                        'probability': prob.item(),
                    }
                    for prob, idx in zip(top5_probs, top5_ids)
                ],
            })
        return results
Video Search: Semantic Search in Video Archives
import faiss
from transformers import CLIPProcessor, CLIPModel
class VideoSemanticSearch:
    """Text-to-frame semantic search over a video archive.

    CLIP frame embeddings -> FAISS inner-product index -> text queries.
    A fast way to find "the moment where a person falls" in a
    1000-hour archive.
    """

    CLIP_MODEL_ID = 'openai/clip-vit-large-patch14'

    def __init__(self):
        # CPU fallback so the class is usable on GPU-less hosts.
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.clip_model = CLIPModel.from_pretrained(self.CLIP_MODEL_ID).to(self.device)
        self.clip_processor = CLIPProcessor.from_pretrained(self.CLIP_MODEL_ID)
        # Inner product over L2-normalized vectors == cosine similarity.
        self.index = faiss.IndexFlatIP(768)  # CLIP ViT-L/14 embedding dim
        # Parallel to index rows: (video_id, timestamp_sec) per indexed frame.
        self.frame_metadata = []

    @torch.no_grad()
    def index_video(self, video_path: str, video_id: str,
                    sample_every_n: int = 30):
        """Sample every ``sample_every_n``-th frame and add it to the index."""
        cap = cv2.VideoCapture(video_path)
        try:
            # Guard against containers with missing fps metadata (cv2 returns 0,
            # which would divide by zero below).
            fps = cap.get(cv2.CAP_PROP_FPS) or 1.0
            frame_idx = 0
            batch_frames, batch_meta = [], []
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break
                if frame_idx % sample_every_n == 0:
                    # CLIPProcessor accepts RGB numpy arrays directly — the
                    # original's PIL round-trip used ``Image`` which is never
                    # imported in this file (NameError at runtime).
                    batch_frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                    batch_meta.append((video_id, frame_idx / fps))
                if len(batch_frames) >= 32:
                    self._process_batch(batch_frames, batch_meta)
                    batch_frames, batch_meta = [], []
                frame_idx += 1
            if batch_frames:  # flush the trailing partial batch
                self._process_batch(batch_frames, batch_meta)
        finally:
            cap.release()

    def _process_batch(self, frames: list, meta: list):
        """Embed a batch of RGB frames and append them to the FAISS index."""
        inputs = self.clip_processor(
            images=frames, return_tensors='pt', padding=True
        ).to(self.device)
        embs = self.clip_model.get_image_features(**inputs)
        # Normalize once so inner product == cosine similarity; the original
        # additionally ran faiss.normalize_L2 on already-unit vectors (a no-op).
        embs = embs / embs.norm(dim=-1, keepdim=True)
        self.index.add(embs.cpu().float().numpy())
        self.frame_metadata.extend(meta)

    @torch.no_grad()
    def search(self, query: str, top_k: int = 10) -> list[dict]:
        """Return up to ``top_k`` {video_id, timestamp_sec, score} hits for a text query."""
        inputs = self.clip_processor(
            text=[query], return_tensors='pt', padding=True
        ).to(self.device)
        text_emb = self.clip_model.get_text_features(**inputs)
        text_emb = text_emb / text_emb.norm(dim=-1, keepdim=True)
        scores, indices = self.index.search(text_emb.cpu().float().numpy(), top_k)
        results = []
        for score, idx in zip(scores[0], indices[0]):
            # FAISS pads with -1 when the index holds fewer than top_k vectors;
            # the original would then read frame_metadata[-1] (wrong entry).
            if idx < 0:
                continue
            video_id, timestamp = self.frame_metadata[idx]
            results.append({
                'video_id': video_id,
                'timestamp_sec': timestamp,
                'score': float(score),
            })
        return results
Temporal Reasoning: Video-LLM for Complex Queries
class VideoLLMAnalyzer:
    """Video-LLM front end for temporal-reasoning queries.

    Backed by Video-LLaVA, LLaVA-Video, or Qwen2-VL with video input.
    Handles questions such as "what happens at the end of the video?"
    or "how many times did the person look at the camera?"
    """

    def __init__(self):
        # Qwen2-VL accepts video inputs of up to 256 frames.
        from transformers import Qwen2VLForConditionalGeneration

        self.model = Qwen2VLForConditionalGeneration.from_pretrained(
            'Qwen/Qwen2-VL-7B-Instruct',
            torch_dtype=torch.bfloat16,
            device_map='auto',
        )

    def query_video(self, video_path: str, question: str) -> str:
        """Answer a free-form question about the video at ``video_path``."""
        # Uniformly sample at most 32 frames across the whole video,
        # then build a prompt with video tokens and run generation.
        # NOTE(review): _sample_frames and _generate are not defined in this
        # class as shown — they must be supplied elsewhere before use.
        sampled = self._sample_frames(video_path, n=32)
        return self._generate(sampled, question)
Performance and optimization
Video generates a huge amount of data. 24 hours of recording at 30 fps = 2.6M frames. Processing every frame is inefficient:
- Motion-based sampling: we process only frames with motion (background subtraction as a filter)
- Adaptive sampling: 8 fps is enough for action recognition; 15 fps for object detection
- Hierarchical indexing: first scene-level (what is happening in the scene), then frame-level
| Task | Model | Latency/frame |
|---|---|---|
| Action recognition (16 frames) | VideoMAE-Base | 45ms |
| Semantic search (CLIP indexing) | CLIP ViT-L/14 | 8ms |
| Video QA | Qwen2-VL-7B | 1.2 sec/clip |
| Object tracking (entire video stream) | YOLOv8 + ByteTrack | 20ms |
| Project type | Typical timeline |
|---|---|
| Action recognition system | 4–7 weeks |
| Semantic search in video archives | 5–8 weeks |
| A complete video understanding platform | 10–18 weeks |







