AI Data Extraction from Cash Receipts
Receipts are the most challenging document type for OCR: thermal paper fades over time, print quality is low, every retailer uses its own format, and the receipt structure is non-linear (items, discounts, and totals can appear in any order). On the SROIE dataset, LayoutLMv3 reaches an F1 score of 0.974 — but that figure is for clean, scanned documents. On real mobile photos, F1 drops to 0.87–0.91.
Preprocessing: Addressing the Main Source of Errors
Mobile receipt photos suffer from: perspective distortion, blur, shadows from fingers, overexposure. Without correction OCR makes errors on 15–25% of characters.
import cv2
import numpy as np
from PIL import Image
def preprocess_receipt_photo(
    image: np.ndarray,
    target_width: int = 768,  # normalized receipt width
) -> np.ndarray:
    """Clean up a receipt photo and return a binarized image ready for OCR.

    Steps: denoising → brightness equalization → deskew → binarization
    → width normalization.

    Args:
        image: 3-channel image in OpenCV's BGR channel order (it is fed to
            the colour denoiser and to BGR colour-space conversions).
        target_width: Width in pixels of the returned image; the height is
            scaled by the same factor so the aspect ratio is kept.

    Returns:
        Single-channel image that is ``target_width`` pixels wide.
        NOTE: the final Lanczos resampling interpolates between pixels, so
        the result may contain gray levels other than pure 0 and 255.
    """
    height, width = image.shape[:2]

    # 1. Denoising
    denoised = cv2.fastNlMeansDenoisingColored(
        image, h=10, hColor=10,
        templateWindowSize=7, searchWindowSize=21
    )

    # 2. CLAHE on the lightness channel only, so colours are not shifted
    lab = cv2.cvtColor(denoised, cv2.COLOR_BGR2LAB)
    clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
    lab[:, :, 0] = clahe.apply(lab[:, :, 0])
    enhanced = cv2.cvtColor(lab, cv2.COLOR_LAB2BGR)

    # 3. Automatic detection of the receipt contour and deskew
    gray = cv2.cvtColor(enhanced, cv2.COLOR_BGR2GRAY)
    edges = cv2.Canny(gray, 50, 150)
    contours, _ = cv2.findContours(
        edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )
    if contours:
        # Largest contour — presumably the receipt
        largest = max(contours, key=cv2.contourArea)
        hull = cv2.convexHull(largest)
        # Ignore small contours: they are unlikely to be the receipt outline
        if cv2.contourArea(hull) > 0.1 * height * width:
            angle = cv2.minAreaRect(hull)[2]
            # minAreaRect reports the angle in [-90, 0) or (0, 90] depending
            # on the OpenCV version, so an upright rectangle can come back
            # as ±90. Fold the value into [-45, 45] to get the actual skew
            # instead of rotating an upright receipt by a quarter turn.
            if angle > 45:
                angle -= 90
            elif angle < -45:
                angle += 90
            if abs(angle) > 5:
                M = cv2.getRotationMatrix2D(
                    (width // 2, height // 2), angle, 1
                )
                # Replicate edge pixels instead of filling the corners with
                # black, which would show up as hard edges after
                # thresholding.
                enhanced = cv2.warpAffine(
                    enhanced, M, (width, height),
                    borderMode=cv2.BORDER_REPLICATE
                )

    # 4. Adaptive binarization for thermal printing
    gray = cv2.cvtColor(enhanced, cv2.COLOR_BGR2GRAY)
    binary = cv2.adaptiveThreshold(
        gray, 255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        blockSize=11, C=2
    )

    # 5. Width normalization
    h, w = binary.shape
    scale = target_width / w
    # Never request a zero-pixel height for extremely wide inputs
    new_height = max(1, int(h * scale))
    resized = cv2.resize(
        binary, (target_width, new_height),
        interpolation=cv2.INTER_LANCZOS4
    )
    return resized
Receipt Structure Parsing
After OCR, the text must be parsed into a structure: product lines, discounts, totals, the cashier, and the seller's INN (taxpayer identification number).
import re
from dataclasses import dataclass, field
from typing import Optional
@dataclass
class ReceiptLineItem:
    """One purchased position on a receipt.

    Attributes:
        name: Product name as printed on the receipt.
        quantity: Number of units (float, so fractional quantities fit).
        unit_price: Price of a single unit.
        total_price: Line total printed for this position.
        discount: Discount applied to this line; 0.0 when none is known.
    """

    name: str
    quantity: float
    unit_price: float
    total_price: float
    discount: float = 0.0
@dataclass
class ParsedReceipt:
    """Structured result of parsing one receipt.

    Every field is optional because any part of a receipt may be missing or
    unreadable; ``None`` means "not found". All fields have defaults, so an
    empty result can be created with ``ParsedReceipt()``.

    Attributes:
        store_name: Name of the store.
        inn: Seller INN.
        datetime_str: Purchase date and time as text.
        items: Parsed line items; a fresh empty list per instance.
        subtotal: Amount labelled as subtotal.
        tax_amount: Tax amount.
        total: Receipt total.
        payment_method: Payment method.
        fiscal_sign: Fiscal sign of the receipt.
    """

    store_name: Optional[str] = None
    inn: Optional[str] = None  # seller INN
    datetime_str: Optional[str] = None
    items: list[ReceiptLineItem] = field(default_factory=list)
    subtotal: Optional[float] = None
    tax_amount: Optional[float] = None
    total: Optional[float] = None
    payment_method: Optional[str] = None
    fiscal_sign: Optional[str] = None
class ReceiptParser:
    """Regex-based parser that turns raw OCR text into a ``ParsedReceipt``.

    Header fields (INN, totals, tax, fiscal sign, date/time) are searched
    for in the whole text; line items are matched line by line.
    """

    # A monetary amount confined to a single line: either digit groups
    # separated by a space/tab/NBSP ("1 250,00") or a plain run of digits,
    # dots and commas ("1,250.00"). Newlines are deliberately excluded so a
    # capture cannot swallow digits from the following line.
    _AMOUNT = r'\d{1,3}(?:[ \t\u00a0]\d{3})+(?!\d)(?:[.,]\d+)?|\d[\d.,]*'
    # A number inside a line item (quantity, unit price, line total).
    _NUMBER = r'\d[\d.,]*'

    # Patterns for receipts. All are applied case-insensitively except
    # 'datetime', which contains no letters.
    PATTERNS = {
        # An INN is exactly 10 or 12 digits; reject 11 digits and prefixes
        # of longer numbers.
        'inn': r'\bINN\s*[:№]?\s*(\d{12}|\d{10})(?!\d)',
        'subtotal': r'\bSUB[ \t-]?TOTAL\b\s*[:=]?\s*(' + _AMOUNT + r')',
        # The lookbehinds keep "TOTAL" from matching inside "SUBTOTAL",
        # "SUB TOTAL" or "SUB-TOTAL".
        'total': (
            r'(?<![A-Z])(?<!SUB[ \-])(?:GRAND\s+)?TOTAL\b'
            r'\s*[:=]?\s*(' + _AMOUNT + r')'
        ),
        'tax': (
            r'\b(?:TAX|VAT)\s+\d+(?:[.,]\d+)?\s*%'
            r'\s*[:=]?\s*(' + _AMOUNT + r')'
        ),
        # Accepts an ASCII or a full-width colon after the label.
        'fiscal_sign': r'\bFP\s*[:\uff1a]?\s*(\d+)',
        # "<name> <qty> x <unit price> = <total>". The multiplier may be a
        # Latin x, Cyrillic х, × or *. A separator ('=' or whitespace) is
        # required before the total, otherwise "2 x 3.50" would be split
        # into unit price 3.5 and total 0.
        'line_item': (
            r'^(.+?)\s+(' + _NUMBER + r')\s*[xх×*]\s*(' + _NUMBER + r')'
            r'(?:\s*=\s*|\s+)(' + _NUMBER + r')'
        ),
        'datetime': r'(\d{2}[\.\/]\d{2}[\.\/]\d{4})\s+(\d{2}:\d{2}(?::\d{2})?)',
    }

    # Return annotations that name the receipt dataclasses are quoted so the
    # class can be defined regardless of definition order.
    def parse(self, ocr_text: str) -> "ParsedReceipt":
        """Parse the OCR text of one receipt.

        Args:
            ocr_text: Recognized text, one printed line per text line.

        Returns:
            A ``ParsedReceipt``; fields that were not found stay ``None``
            and ``items`` holds every line that matched the item pattern.
            ``subtotal`` and ``total`` are extracted independently, so a
            receipt that prints only a subtotal yields ``total=None``.
        """
        lines = ocr_text.splitlines()
        # The store name is taken to be the first non-blank line.
        store_name = next(
            (stripped for stripped in (ln.strip() for ln in lines) if stripped),
            None,
        )
        receipt = ParsedReceipt(
            store_name=store_name,
            inn=self._extract(ocr_text, 'inn'),
            datetime_str=self._extract_datetime(ocr_text),
            subtotal=self._parse_amount(self._extract(ocr_text, 'subtotal')),
            total=self._parse_amount(self._extract(ocr_text, 'total')),
            tax_amount=self._parse_amount(self._extract(ocr_text, 'tax')),
            fiscal_sign=self._extract(ocr_text, 'fiscal_sign'),
        )
        for line in lines:
            item = self._parse_line_item(line)
            if item:
                receipt.items.append(item)
        return receipt

    def _extract(self, text: str, key: str) -> Optional[str]:
        """Return the first capture group of ``PATTERNS[key]`` or ``None``.

        The search is case-insensitive. Raises ``KeyError`` for an unknown
        ``key``.
        """
        m = re.search(self.PATTERNS[key], text, re.IGNORECASE)
        return m.group(1).strip() if m else None

    def _extract_datetime(self, text: str) -> Optional[str]:
        """Return the first "DD.MM.YYYY HH:MM[:SS]" stamp, or ``None``.

        Date and time are joined with a single space whatever whitespace
        separated them in the text.
        """
        m = re.search(self.PATTERNS['datetime'], text)
        if m:
            return f'{m.group(1)} {m.group(2)}'
        return None

    def _parse_amount(self, text: Optional[str]) -> Optional[float]:
        """Convert a printed amount to ``float``; ``None`` if not parseable.

        Handles whitespace thousands separators ("1 250,00"), a comma or a
        dot as decimal mark, and mixed separators ("1,250.00", "1.250,00"),
        where the last separator is the decimal mark. When one separator is
        repeated ("1.250.000") it is accepted as a thousands separator only
        if every group has three digits, so date-like strings such as
        "15.03.2024" are rejected.
        """
        if not text:
            return None
        # Drop all whitespace and any trailing punctuation picked up by OCR.
        cleaned = re.sub(r'\s', '', text).rstrip('.,')
        commas = cleaned.count(',')
        dots = cleaned.count('.')
        if commas and dots:
            if cleaned.rfind(',') > cleaned.rfind('.'):
                decimal_sep, thousands_sep = ',', '.'
            else:
                decimal_sep, thousands_sep = '.', ','
            cleaned = cleaned.replace(thousands_sep, '')
            cleaned = cleaned.replace(decimal_sep, '.')
        elif commas > 1 or dots > 1:
            if not re.fullmatch(r'\d{1,3}(?:[.,]\d{3})+', cleaned):
                return None
            cleaned = cleaned.replace(',', '').replace('.', '')
        else:
            cleaned = cleaned.replace(',', '.')
        try:
            return float(cleaned)
        except ValueError:
            return None

    def _parse_line_item(self, line: str) -> "Optional[ReceiptLineItem]":
        """Parse one text line as "<name> <qty> x <price> = <total>".

        Returns ``None`` when the line does not match the item pattern or
        any of its three numbers cannot be converted.
        """
        m = re.match(self.PATTERNS['line_item'], line.strip(), re.IGNORECASE)
        if not m:
            return None
        quantity = self._parse_amount(m.group(2))
        unit_price = self._parse_amount(m.group(3))
        total_price = self._parse_amount(m.group(4))
        if quantity is None or unit_price is None or total_price is None:
            return None
        return ReceiptLineItem(
            name=m.group(1).strip(),
            quantity=quantity,
            unit_price=unit_price,
            total_price=total_price,
        )
Comparison of Approaches
| Approach | CER (good photo) | CER (bad photo) | Speed | Cost |
|---|---|---|---|---|
| Tesseract 5 (no preprocessing) | 4.2% | 18.7% | 200ms | Free |
| Tesseract 5 + preprocessing | 1.8% | 8.3% | 350ms | Free |
| PaddleOCR v4 | 0.9% | 4.1% | 280ms | Free |
| Azure Read API | 0.6% | 2.8% | 1.2s | $1.5/1000 |
| GPT-4V | 0.4% | 1.9% | 3–5s | $10+/1000 |
Timeline
| Task | Timeline |
|---|---|
| Parser for specific store chain | 1–2 weeks |
| Universal parser (100+ formats) | 4–6 weeks |
| Mobile app with real-time OCR of receipts | 6–10 weeks |







