AI system for understanding and processing video
Understanding video is a more complex task than processing individual frames. Video is a spatiotemporal volume of data: objects move, scenes change, events unfold over time. "What's going on in this video?" requires temporal reasoning—understanding the sequence of events.
Video understanding system architecture
import torch
import numpy as np
import cv2
from transformers import AutoProcessor, AutoModelForVideoClassification
class VideoUnderstandingPipeline:
    """Clip-level action recognition plus a video LLM for long-form analysis.

    Splits a video into overlapping fixed-length clips, classifies each clip
    with a VideoMAE model fine-tuned on Kinetics, and holds a video LLM for
    free-form questions about longer footage.
    """

    # Action-recognition checkpoint (VideoMAE; TimeSformer also works here).
    ACTION_MODEL_ID = 'MCG-NJU/videomae-base-finetuned-kinetics'

    def __init__(self, config: dict):
        """Load models and read sampling parameters from ``config``.

        Recognized config keys:
            vlm_model:     identifier forwarded to ``_load_video_llm``.
            clip_duration: frames per clip (default 16).
            fps_sample:    sampling rate for analysis, in fps (default 8).
        """
        # Fall back to CPU so construction does not crash on GPU-less hosts
        # (the original unconditionally called .cuda()).
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.action_model = AutoModelForVideoClassification.from_pretrained(
            self.ACTION_MODEL_ID,
            # fp16 is only safe on GPU; use fp32 on CPU.
            torch_dtype=torch.float16 if self.device == 'cuda' else torch.float32,
        ).to(self.device)
        self.action_processor = AutoProcessor.from_pretrained(self.ACTION_MODEL_ID)
        # For long videos: LLaVA-Video or Video-LLaMA.
        # NOTE(review): _load_video_llm is not defined anywhere in this class —
        # it must be supplied (subclass or added method) before __init__ can run.
        self.vlm_model = self._load_video_llm(config.get('vlm_model'))
        self.clip_duration = config.get('clip_duration', 16)  # frames per clip
        self.fps_sample = config.get('fps_sample', 8)  # analysis rate, fps

    def extract_clips(self, video_path: str) -> list[np.ndarray]:
        """Cut the video into 50%-overlapping clips for action recognition.

        Frames are subsampled to roughly ``fps_sample`` fps and converted to
        RGB.  A trailing partial clip (fewer than ``clip_duration`` sampled
        frames) is discarded.
        """
        cap = cv2.VideoCapture(video_path)
        try:
            original_fps = cap.get(cv2.CAP_PROP_FPS)
            sample_interval = max(1, int(original_fps / self.fps_sample))
            clips: list[np.ndarray] = []
            current_clip: list[np.ndarray] = []
            frame_idx = 0
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break
                if frame_idx % sample_interval == 0:
                    # OpenCV decodes BGR; the models expect RGB.
                    current_clip.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                if len(current_clip) == self.clip_duration:
                    clips.append(np.array(current_clip))
                    # Keep the second half so consecutive clips overlap 50%.
                    current_clip = current_clip[self.clip_duration // 2:]
                frame_idx += 1
        finally:
            # Release the capture even if decoding raises mid-stream.
            cap.release()
        return clips

    @torch.no_grad()
    def classify_actions(self, clips: list[np.ndarray]) -> list[dict]:
        """Run top-5 action classification on each clip.

        Returns one dict per clip with the clip index, its starting offset
        (counted in *sampled* frames — clips stride by clip_duration // 2,
        not in original video frames), and the five most probable labels.
        """
        results = []
        for i, clip in enumerate(clips):
            inputs = self.action_processor(
                list(clip), return_tensors='pt'
            ).to(self.device)
            outputs = self.action_model(**inputs)
            probs = torch.softmax(outputs.logits, dim=-1)[0]
            top5_probs, top5_ids = probs.topk(5)
            results.append({
                'clip_idx': i,
                # Offset in sampled-frame units (50% overlap stride).
                'start_frame': i * self.clip_duration // 2,
                'actions': [
                    {
                        'label': self.action_model.config.id2label[idx.item()],
                        'probability': prob.item(),
                    }
                    for prob, idx in zip(top5_probs, top5_ids)
                ],
            })
        return results
Video Search: Semantic Search in Video Archives
import faiss
from transformers import CLIPProcessor, CLIPModel
class VideoSemanticSearch:
    """Text-to-frame semantic search over a video archive.

    CLIP frame embeddings -> FAISS inner-product index -> text queries.
    A fast way to find "the moment where a person falls" in a
    1000-hour archive.
    """

    CLIP_MODEL_ID = 'openai/clip-vit-large-patch14'

    def __init__(self):
        # CPU fallback so the class is usable on GPU-less hosts.
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.clip_model = CLIPModel.from_pretrained(self.CLIP_MODEL_ID).to(self.device)
        self.clip_processor = CLIPProcessor.from_pretrained(self.CLIP_MODEL_ID)
        # Inner product over L2-normalized vectors == cosine similarity.
        self.index = faiss.IndexFlatIP(768)  # CLIP ViT-L/14 embedding dim
        # Parallel to index rows: (video_id, timestamp_sec) per indexed frame.
        self.frame_metadata = []

    @torch.no_grad()
    def index_video(self, video_path: str, video_id: str,
                    sample_every_n: int = 30):
        """Sample every ``sample_every_n``-th frame and add it to the index."""
        cap = cv2.VideoCapture(video_path)
        try:
            # Guard against containers with missing fps metadata (cv2 returns 0,
            # which would divide by zero below).
            fps = cap.get(cv2.CAP_PROP_FPS) or 1.0
            frame_idx = 0
            batch_frames, batch_meta = [], []
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break
                if frame_idx % sample_every_n == 0:
                    # CLIPProcessor accepts RGB numpy arrays directly — the
                    # original's PIL round-trip used ``Image`` which is never
                    # imported in this file (NameError at runtime).
                    batch_frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                    batch_meta.append((video_id, frame_idx / fps))
                if len(batch_frames) >= 32:
                    self._process_batch(batch_frames, batch_meta)
                    batch_frames, batch_meta = [], []
                frame_idx += 1
            if batch_frames:  # flush the trailing partial batch
                self._process_batch(batch_frames, batch_meta)
        finally:
            cap.release()

    def _process_batch(self, frames: list, meta: list):
        """Embed a batch of RGB frames and append them to the FAISS index."""
        inputs = self.clip_processor(
            images=frames, return_tensors='pt', padding=True
        ).to(self.device)
        embs = self.clip_model.get_image_features(**inputs)
        # Normalize once so inner product == cosine similarity; the original
        # additionally ran faiss.normalize_L2 on already-unit vectors (a no-op).
        embs = embs / embs.norm(dim=-1, keepdim=True)
        self.index.add(embs.cpu().float().numpy())
        self.frame_metadata.extend(meta)

    @torch.no_grad()
    def search(self, query: str, top_k: int = 10) -> list[dict]:
        """Return up to ``top_k`` {video_id, timestamp_sec, score} hits for a text query."""
        inputs = self.clip_processor(
            text=[query], return_tensors='pt', padding=True
        ).to(self.device)
        text_emb = self.clip_model.get_text_features(**inputs)
        text_emb = text_emb / text_emb.norm(dim=-1, keepdim=True)
        scores, indices = self.index.search(text_emb.cpu().float().numpy(), top_k)
        results = []
        for score, idx in zip(scores[0], indices[0]):
            # FAISS pads with -1 when the index holds fewer than top_k vectors;
            # the original would then read frame_metadata[-1] (wrong entry).
            if idx < 0:
                continue
            video_id, timestamp = self.frame_metadata[idx]
            results.append({
                'video_id': video_id,
                'timestamp_sec': timestamp,
                'score': float(score),
            })
        return results
Temporal Reasoning: Video-LLM for Complex Queries
class VideoLLMAnalyzer:
    """Video-LLM front end for temporal-reasoning queries.

    Backed by Video-LLaVA, LLaVA-Video, or Qwen2-VL with video input.
    Handles questions such as "what happens at the end of the video?"
    or "how many times did the person look at the camera?"
    """

    def __init__(self):
        # Qwen2-VL accepts video inputs of up to 256 frames.
        from transformers import Qwen2VLForConditionalGeneration

        self.model = Qwen2VLForConditionalGeneration.from_pretrained(
            'Qwen/Qwen2-VL-7B-Instruct',
            torch_dtype=torch.bfloat16,
            device_map='auto',
        )

    def query_video(self, video_path: str, question: str) -> str:
        """Answer a free-form question about the video at ``video_path``."""
        # Uniformly sample at most 32 frames across the whole video,
        # then build a prompt with video tokens and run generation.
        # NOTE(review): _sample_frames and _generate are not defined in this
        # class as shown — they must be supplied elsewhere before use.
        sampled = self._sample_frames(video_path, n=32)
        return self._generate(sampled, question)
Performance and optimization
Video generates a huge amount of data. 24 hours of recording at 30 fps = 2.6M frames. Processing every frame is inefficient:
- Motion-based sampling: we process only frames with motion (background subtraction as a filter)
- Adaptive sampling: 8 fps is enough for action recognition; 15 fps for object detection
- Hierarchical indexing: first scene-level (what is happening in the scene), then frame-level
| Task | Model | Latency/frame |
|---|---|---|
| Action recognition (16 frames) | VideoMAE-Base | 45ms |
| Semantic search (CLIP indexing) | CLIP ViT-L/14 | 8ms |
| Video QA | Qwen2-VL-7B | 1.2 sec/clip |
| Object tracking (entire video stream) | YOLOv8 + ByteTrack | 20ms |
| Project type | Typical timeline |
|---|---|
| Action recognition system | 4–7 weeks |
| Semantic search in video archives | 5–8 weeks |
| A complete video understanding platform | 10–18 weeks |







