AI Receipt Data Extraction Implementation

We design and deploy artificial intelligence systems, from prototype to production-ready solution. Our team combines expertise in machine learning, data engineering and MLOps so that AI works in real business settings, not just in the lab.
Showing 1 of 1 services · All 1566 services
AI Receipt Data Extraction Implementation
Medium
~3-5 business days
FAQ
AI Development Areas
AI Solution Development Stages
Latest works
  • image_web-applications_feedme_466_0.webp
    Development of a web application for FEEDME
    1161
  • image_ecommerce_furnoro_435_0.webp
    Development of an online store for the company FURNORO
    1041
  • image_logo-advance_0.png
    B2B Advance company logo design
    561
  • image_crm_enviok_479_0.webp
    Development of a web application for Enviok
    823
  • image_logo-aider_0.jpg
    AIDER company logo development
    762
  • image_crm_chasseurs_493_0.webp
    CRM development for Chasseurs
    848

AI Data Extraction from Cash Receipts

Receipts are the most challenging document type for OCR: thermal paper fades over time, print quality is low, formats are unique to each retailer, and the receipt structure is non-linear (items, discounts and totals can appear in any order). On the SROIE dataset, LayoutLMv3 reaches an F1 of 0.974, but that figure is for clean, scanned documents; on real mobile photos it drops to 0.87–0.91.

Preprocessing: Main Source of Errors

Mobile receipt photos suffer from perspective distortion, blur, finger shadows and overexposure. Without correction, OCR misreads 15–25% of characters.

import cv2
import numpy as np
from PIL import Image

def preprocess_receipt_photo(
    image: np.ndarray,
    target_width: int = 768   # normalized receipt width
) -> np.ndarray:
    """
    Clean up a receipt photo and return a thresholded image for OCR.

    Steps: denoising → brightness equalization → deskew → binarization
    → width normalization.

    Args:
        image: Colour image. It is treated as 3-channel BGR: it is passed to
            ``cv2.fastNlMeansDenoisingColored`` and converted with
            ``cv2.COLOR_BGR2LAB``.
        target_width: Width in pixels of the returned image; the height is
            scaled by the same factor.

    Returns:
        A single-channel image ``target_width`` pixels wide. Pixels are
        0/255 after thresholding; the final Lanczos resize may introduce
        intermediate grey levels.
    """
    img_h, img_w = image.shape[:2]

    # 1. Denoising
    denoised = cv2.fastNlMeansDenoisingColored(
        image, h=10, hColor=10,
        templateWindowSize=7, searchWindowSize=21
    )

    # 2. CLAHE for lighting equalization (applied to the L channel only,
    #    so colours are not shifted)
    lab = cv2.cvtColor(denoised, cv2.COLOR_BGR2LAB)
    clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
    lab[:, :, 0] = clahe.apply(lab[:, :, 0])
    enhanced = cv2.cvtColor(lab, cv2.COLOR_LAB2BGR)

    # 3. Automatic detection of receipt contour and deskew
    gray = cv2.cvtColor(enhanced, cv2.COLOR_BGR2GRAY)
    edges = cv2.Canny(gray, 50, 150)
    contours, _ = cv2.findContours(
        edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )

    if contours:
        # Find largest contour — presumably the receipt
        largest = max(contours, key=cv2.contourArea)
        hull = cv2.convexHull(largest)

        # Only trust the contour if it covers more than 10% of the frame.
        if cv2.contourArea(hull) > 0.1 * img_h * img_w:
            angle = cv2.minAreaRect(hull)[2]

            # Depending on the OpenCV version, minAreaRect reports the angle
            # in [-90, 0) or (0, 90], so an upright rectangle can come back
            # as ±90. Fold it into [-45, 45] so we apply the smallest
            # correcting rotation instead of turning the receipt sideways.
            if angle > 45:
                angle -= 90
            elif angle < -45:
                angle += 90

            if abs(angle) > 5:
                M = cv2.getRotationMatrix2D(
                    (img_w // 2, img_h // 2), angle, 1
                )
                enhanced = cv2.warpAffine(
                    enhanced, M, (img_w, img_h)
                )

    # 4. Adaptive binarization for thermal printing
    gray = cv2.cvtColor(enhanced, cv2.COLOR_BGR2GRAY)
    binary = cv2.adaptiveThreshold(
        gray, 255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        blockSize=11, C=2
    )

    # 5. Width normalization
    h, w = binary.shape
    scale = target_width / w
    # Clamp to 1 so an extremely wide, short input cannot request a
    # zero-height output, which cv2.resize rejects.
    new_height = max(1, int(h * scale))
    resized = cv2.resize(
        binary, (target_width, new_height),
        interpolation=cv2.INTER_LANCZOS4
    )

    return resized

Receipt Structure Parsing

After OCR, the receipt structure must be parsed: product lines, discounts, totals, cashier, and INN (the seller's taxpayer identification number).

import re
from dataclasses import dataclass, field
from typing import Optional

@dataclass
class ReceiptLineItem:
    """One purchased position parsed from a single receipt line."""
    name: str               # item description text
    quantity: float         # number of units purchased
    unit_price: float       # price of one unit (currency is not recorded)
    total_price: float      # total for this line, as read from the receipt
    discount: float = 0.0   # discount for this line; 0.0 unless the caller supplies one

@dataclass
class ParsedReceipt:
    """Structured result of parsing one receipt.

    The first three fields are required (pass ``None`` when unknown); the
    remaining fields default to "not found".
    """
    store_name: Optional[str]   # store name, or None if unknown
    inn: Optional[str]          # seller INN
    datetime_str: Optional[str]  # purchase date and time kept as text, not parsed into a datetime
    items: list[ReceiptLineItem] = field(default_factory=list)  # fresh list per instance
    subtotal: Optional[float] = None        # subtotal amount, if known
    tax_amount: Optional[float] = None      # tax/VAT amount, if known
    total: Optional[float] = None           # receipt total, if known
    payment_method: Optional[str] = None    # payment method, if known
    fiscal_sign: Optional[str] = None       # fiscal sign digits kept as text

class ReceiptParser:
    """Regex-based parser that turns receipt OCR text into a ParsedReceipt.

    Header fields (INN, date/time, subtotal, total, tax, fiscal sign) are
    searched for anywhere in the text and the first match wins. Line items
    are matched one text line at a time.
    """

    # Amount fragment shared by the money patterns: an optional leading
    # separator, a digit, then digits / separators / same-line blanks.
    # Only spaces and tabs are allowed inside the number so the capture
    # cannot spill over a newline into the next line's digits.
    _AMOUNT = r'([,\.]?\d[\d \t,\.]*)'

    # Patterns for receipts
    PATTERNS = {
        'inn': r'INN\s*[:№]?\s*(\d{10,12})',
        'subtotal': r'\bSUB[ \-]?TOTAL\s*[:=]?\s*' + _AMOUNT,
        # \b plus the lookbehind keep "SUBTOTAL", "SUB TOTAL" and
        # "SUB-TOTAL" from being reported as the receipt total.
        'total': r'(?<!SUB[ \-])\bTOTAL\s*[:=]?\s*' + _AMOUNT,
        'tax': r'(?:TAX|VAT)\s+\d+%\s*[:=]?\s*' + _AMOUNT,
        # Accepts an ASCII or a full-width colon after the label.
        'fiscal_sign': r'FP\s*[::]?\s*(\d+)',
        # name, quantity, multiplication sign, unit price, then the line
        # total. The total must be separated from the unit price by "=" or
        # whitespace; otherwise "2 x 45.50" would be split into a unit
        # price of 45.5 and a bogus total of 0.
        'line_item': (
            r'^(.+?)\s+([\d,\.]+)\s*[xXхХ×*]\s*([\d,\.]+)'
            r'(?:\s*=\s*|\s+)([\d,\.]+)'
        ),
        'datetime': r'(\d{2}[\.\/]\d{2}[\.\/]\d{4})\s+(\d{2}:\d{2}(?::\d{2})?)',
    }

    def parse(self, ocr_text: str) -> "ParsedReceipt":
        """Parse the full OCR text of one receipt.

        Args:
            ocr_text: Recognized text, one receipt line per text line.

        Returns:
            A ParsedReceipt. ``store_name`` is the first non-blank line
            (None if the text has no such line); fields whose pattern does
            not match are None; ``items`` holds every line that matched
            the line-item pattern, in order.
        """
        lines = ocr_text.splitlines()
        stripped = (line.strip() for line in lines)
        store_name = next((text for text in stripped if text), None)

        items = [
            item
            for line in lines
            if (item := self._parse_line_item(line)) is not None
        ]

        return ParsedReceipt(
            store_name=store_name,
            inn=self._extract(ocr_text, 'inn'),
            datetime_str=self._extract_datetime(ocr_text),
            items=items,
            subtotal=self._parse_amount(self._extract(ocr_text, 'subtotal')),
            total=self._parse_amount(self._extract(ocr_text, 'total')),
            tax_amount=self._parse_amount(self._extract(ocr_text, 'tax')),
            fiscal_sign=self._extract(ocr_text, 'fiscal_sign')
        )

    def _extract(self, text: str, key: str) -> Optional[str]:
        """Return the stripped first capture group of PATTERNS[key], or None.

        The search is case-insensitive and returns the first match in *text*.
        """
        m = re.search(self.PATTERNS[key], text, re.IGNORECASE)
        return m.group(1).strip() if m else None

    def _extract_datetime(self, text: str) -> Optional[str]:
        """Return the first "<date> <time>" match joined by one space, or None."""
        m = re.search(self.PATTERNS['datetime'], text)
        if m:
            return f'{m.group(1)} {m.group(2)}'
        return None

    def _parse_amount(self, text: Optional[str]) -> Optional[float]:
        """Convert an amount string to float, tolerating common separators.

        Rules, applied after removing all whitespace and any trailing
        ``.``/``,``:
          * both ``,`` and ``.`` present: the right-most one is the decimal
            mark and the other is a thousands separator
            ("1,234.56" and "1.234,56" both give 1234.56);
          * one kind present more than once: all are thousands separators
            ("1.234.567" gives 1234567.0);
          * otherwise a single ``,`` or ``.`` is the decimal mark
            ("12,50" gives 12.5).

        Returns None for None, empty or unparseable input.
        """
        if not text:
            return None
        cleaned = re.sub(r'\s', '', text).rstrip('.,')
        if not cleaned:
            return None

        commas = cleaned.count(',')
        dots = cleaned.count('.')
        if commas and dots:
            if cleaned.rfind(',') > cleaned.rfind('.'):
                cleaned = cleaned.replace('.', '').replace(',', '.')
            else:
                cleaned = cleaned.replace(',', '')
        elif commas > 1 or dots > 1:
            cleaned = cleaned.replace(',', '').replace('.', '')
        else:
            cleaned = cleaned.replace(',', '.')

        try:
            return float(cleaned)
        except ValueError:
            return None

    def _parse_line_item(self, line: str) -> "Optional[ReceiptLineItem]":
        """Parse one text line of the form "name qty x unit_price = total".

        Returns None when the line does not match the line-item pattern or
        when any of the three numbers cannot be converted.
        """
        m = re.match(self.PATTERNS['line_item'], line.strip())
        if not m:
            return None

        # Reuse the shared amount parser so line items accept the same
        # number formats as the header fields.
        quantity = self._parse_amount(m.group(2))
        unit_price = self._parse_amount(m.group(3))
        total_price = self._parse_amount(m.group(4))
        if quantity is None or unit_price is None or total_price is None:
            return None

        return ReceiptLineItem(
            name=m.group(1).strip(),
            quantity=quantity,
            unit_price=unit_price,
            total_price=total_price
        )

Comparison of Approaches

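| Approach | CER (good photo) | CER (bad photo) | Speed | Cost |
|---|---|---|---|---|
| Tesseract 5 (no preprocessing) | 4.2% | 18.7% | 200ms | Free |
| Tesseract 5 + preprocessing | 1.8% | 8.3% | 350ms | Free |
| PaddleOCR v4 | 0.9% | 4.1% | 280ms | Free |
| Azure Read API | 0.6% | 2.8% | 1.2s | $1.5/1000 |
| GPT-4V | 0.4% | 1.9% | 3–5s | $10+/1000 |

CER = character error rate (lower is better).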

Timeline

| Task | Timeline |
|---|---|
| Parser for specific store chain | 1–2 weeks |
| Universal parser (100+ formats) | 4–6 weeks |
| Mobile app with real-time OCR of receipts | 6–10 weeks |