8. OCR System Implementation
8.1 OCR Processing Overview
File: packages/local-backend/app/services/ocr_processor.py
Purpose: Extract text from images, PDFs, and scanned documents using Tesseract OCR
8.1.1 Dependencies
| Package | Version | Purpose |
|---|---|---|
| pytesseract | 0.3.10+ | Python wrapper for Tesseract |
| tesseract-ocr | 5.3.3+ | OCR engine (system binary) |
| pdf2image | 1.16.3+ | Convert PDF pages to images |
| Pillow | 10.0.0+ | Image processing |
| poppler-utils | 24.08.0+ | PDF processing (system binary) |
8.1.2 OCR Processor Class
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
from pathlib import Path
from typing import List, Dict, Any
import logging
import re
from decimal import Decimal, InvalidOperation
logger = logging.getLogger(__name__)
class OCRProcessor:
    """Process images and PDFs with OCR to extract calculation data.

    Text is extracted with Tesseract through ``pytesseract``; PDFs are first
    converted to one image per page with ``pdf2image``.
    """

    def __init__(self, default_currency: str = 'INR', default_mode: str = 'greedy'):
        """
        Initialize OCR processor with smart defaults.

        Args:
            default_currency: Currency to use when not detected (default: INR)
            default_mode: Optimization mode when not detected (default: greedy)
        """
        self.default_currency = default_currency
        self.default_mode = default_mode
        # Tesseract configuration
        #   OEM 1 = LSTM neural net mode
        #   PSM 6 = Assume uniform block of text
        self.tesseract_config = '--oem 1 --psm 6'
        logger.info(f"OCR Processor initialized with defaults: {default_currency}, {default_mode}")

    async def process_image(self, image_path: Path) -> str:
        """
        Extract text from image using Tesseract.

        The method is declared ``async`` but awaits nothing: the OCR call runs
        synchronously inside it.

        Args:
            image_path: Path to image file

        Returns:
            Extracted text

        Raises:
            ValueError: If opening, preprocessing or OCR fails. The original
                exception is attached as ``__cause__``.
        """
        try:
            # Image.open is lazy and can keep the file open; the context
            # manager guarantees the handle is released once OCR is done.
            with Image.open(image_path) as source_img:
                # Preprocess image for better accuracy
                img = self._preprocess_image(source_img)
                # Extract text
                text = pytesseract.image_to_string(img, config=self.tesseract_config)
            logger.info(f"Extracted {len(text)} characters from image")
            return text
        except Exception as e:
            logger.error(f"OCR failed for image {image_path}: {e}")
            # Chain the cause so the original traceback is not lost.
            raise ValueError(f"OCR processing failed: {e}") from e

    async def process_pdf(self, pdf_path: Path) -> str:
        """
        Extract text from PDF by rendering every page to an image and OCR-ing it.

        Pages that yield only whitespace are omitted. Every remaining page is
        prefixed with a ``=== Page N ===`` marker line, where N is the page's
        position in the PDF.

        Args:
            pdf_path: Path to PDF file

        Returns:
            Extracted text from all pages, joined by blank lines

        Raises:
            ValueError: If conversion or OCR fails. The original exception is
                attached as ``__cause__``.
        """
        try:
            # Convert PDF pages to images
            images = convert_from_path(pdf_path, dpi=300)
            logger.info(f"Converted PDF to {len(images)} images")
            # Extract text from each page
            text_parts = []
            for page_num, img in enumerate(images, start=1):
                img = self._preprocess_image(img)
                text = pytesseract.image_to_string(img, config=self.tesseract_config)
                if text.strip():
                    text_parts.append(f"=== Page {page_num} ===\n{text}")
            full_text = "\n\n".join(text_parts)
            logger.info(f"Extracted {len(full_text)} characters from PDF")
            return full_text
        except Exception as e:
            logger.error(f"PDF OCR failed for {pdf_path}: {e}")
            raise ValueError(f"PDF processing failed: {e}") from e

    def _preprocess_image(self, img: Image.Image) -> Image.Image:
        """
        Preprocess image for better OCR accuracy.

        Enhancements:
        - Convert to grayscale
        - Upscale to a width of 1000px if narrower

        Args:
            img: Image to prepare.

        Returns:
            The prepared image. This is the input object itself when it is
            already grayscale and wide enough.
        """
        # Convert to grayscale (reduces noise)
        if img.mode != 'L':
            img = img.convert('L')
        # Resize if too small (Tesseract works better on larger images).
        # The lower bound avoids a ZeroDivisionError on a zero-width image.
        min_width = 1000
        if 0 < img.width < min_width:
            scale = min_width / img.width
            new_size = (int(img.width * scale), int(img.height * scale))
            img = img.resize(new_size, Image.LANCZOS)
            logger.debug(f"Upscaled image to {new_size}")
        return img
8.1.3 Tesseract Configuration Options
OEM (OCR Engine Mode)
- 0: Legacy engine only
- 1: LSTM neural net only (the default used by this processor via `--oem 1`; recommended)
- 2: Legacy + LSTM
- 3: Based on what is available
PSM (Page Segmentation Mode)
| Mode | Description | Use Case |
|---|---|---|
| 0 | Orientation and script detection only | Auto-rotate |
| 3 | Fully automatic page segmentation | Complex layouts |
| 6 | Uniform block of text | Default (best for our use) |
| 7 | Treat image as single text line | Single line amounts |
| 11 | Sparse text, find as much as possible | Noisy images |
8.1.4 Text Parsing from OCR Output
def parse(self, text: str) -> List[Dict[str, Any]]:
    """
    Turn OCR-extracted text into a list of calculation records.

    The text is offered to each format detector in priority order and the
    parser paired with the first detector that accepts it produces the result:

    1. CSV-like format
    2. Labeled format (Amount: 1000, Currency: INR)
    3. List format (1000 INR, 2500 USD), including plain numbers, which
       receive the smart defaults
    4. Fallback: every number found in the text
    """
    # (detector, parser) pairs, highest priority first.
    strategies = (
        (self._is_csv_format, self._parse_csv_text),
        (self._is_labeled_format, self._parse_labeled_text),
        (self._is_list_format, self._parse_list_text),
    )
    for accepts, extract in strategies:
        if accepts(text):
            return extract(text)
    # Nothing recognised the layout: extract all numbers.
    return self._parse_numbers_fallback(text)
def _is_csv_format(self, text: str) -> bool:
"""Check if text looks like CSV."""
lines = text.strip().split('\n')
if len(lines) < 2:
return False
first_line = lines[0].lower()
return 'amount' in first_line or (',' in first_line and len(lines[0].split(',')) >= 1)
def _is_labeled_format(self, text: str) -> bool:
"""Check if text has labeled format."""
return bool(re.search(r'amount\s*:\s*\d', text, re.IGNORECASE))
def _is_list_format(self, text: str) -> bool:
"""Check if text is a simple list of numbers."""
lines = [l.strip() for l in text.split('\n') if l.strip()]
if len(lines) < 1:
return False
# Check if most lines start with digits
number_lines = sum(1 for l in lines if re.match(r'^\d', l))
return number_lines / len(lines) > 0.6
8.1.5 Format-Specific Parsers
CSV-like Parser
def _parse_csv_text(self, text: str) -> List[Dict[str, Any]]:
"""Parse CSV-like text."""
import csv
import io
results = []
reader = csv.DictReader(io.StringIO(text))
for row_num, row in enumerate(reader, start=1):
try:
# Extract amount
amount_str = row.get('amount', '').strip().replace(',', '')
if not amount_str:
continue
amount = Decimal(amount_str)
# Extract currency (with smart default)
currency = row.get('currency', '').strip().upper()
if not currency:
currency = self.default_currency
logger.info(f"Row {row_num}: No currency, using default {currency}")
# Detect currency from text if not explicit
if currency not in {'INR', 'USD', 'EUR', 'GBP'}:
currency = self._detect_currency_in_text(str(row))
# Extract mode (with smart default)
mode = row.get('mode', row.get('optimization_mode', '')).strip().lower()
if not mode:
mode = self.default_mode
logger.info(f"Row {row_num}: No mode, using default {mode}")
results.append({
'row_number': row_num,
'amount': float(amount),
'currency': currency,
'optimization_mode': mode
})
except Exception as e:
logger.warning(f"Row {row_num} parsing error: {e}")
continue
return results
Labeled Format Parser
def _parse_labeled_text(self, text: str) -> List[Dict[str, Any]]:
"""Parse labeled format (Amount: 1000, Currency: INR)."""
results = []
# Split by lines or record separators
entries = re.split(r'[\n;]|(?:amount\s*:)', text, flags=re.IGNORECASE)
row_num = 0
for entry in entries:
if not entry.strip():
continue
# Find amount
amount_match = re.search(r'(\d{1,3}(?:,\d{3})*(?:\.\d{2})?|\d+\.?\d*)', entry)
if not amount_match:
continue
row_num += 1
amount_str = amount_match.group(1).replace(',', '')
amount = Decimal(amount_str)
# Detect currency
currency = self._detect_currency_in_text(entry)
# Detect mode
mode = self._detect_mode_in_text(entry)
results.append({
'row_number': row_num,
'amount': float(amount),
'currency': currency,
'optimization_mode': mode
})
return results
List Format Parser
def _parse_list_text(self, text: str) -> List[Dict[str, Any]]:
"""Parse list of amounts (with or without currency)."""
results = []
lines = [l.strip() for l in text.split('\n') if l.strip()]
for row_num, line in enumerate(lines, start=1):
# Extract amount (first number in line)
amount_match = re.search(r'(\d{1,3}(?:,\d{3})*(?:\.\d{2})?|\d+\.?\d*)', line)
if not amount_match:
continue
amount_str = amount_match.group(1).replace(',', '')
try:
amount = Decimal(amount_str)
except InvalidOperation:
logger.warning(f"Row {row_num}: Invalid amount {amount_str}")
continue
# Detect currency from same line
currency = self._detect_currency_in_text(line)
# Detect mode from same line
mode = self._detect_mode_in_text(line)
results.append({
'row_number': row_num,
'amount': float(amount),
'currency': currency,
'optimization_mode': mode
})
return results
Fallback Parser (Extract All Numbers)
def _parse_numbers_fallback(self, text: str) -> List[Dict[str, Any]]:
"""Extract all numbers as amounts (last resort)."""
results = []
# Find all numbers
numbers = re.findall(r'(\d{1,3}(?:,\d{3})*(?:\.\d{2})?|\d+\.?\d*)', text)
for row_num, num_str in enumerate(numbers, start=1):
num_str = num_str.replace(',', '')
try:
amount = Decimal(num_str)
if amount <= 0 or amount > Decimal("999999999999999"):
continue
# Use smart defaults for everything
results.append({
'row_number': row_num,
'amount': float(amount),
'currency': self.default_currency,
'optimization_mode': self.default_mode
})
except:
continue
logger.info(f"Fallback parsing extracted {len(results)} amounts")
return results
8.1.6 Currency Detection Logic
4 Detection Strategies (Priority Order):
- Currency Symbols: ₹, Rs, $, €, £
- Currency Codes: INR, USD, EUR, GBP
- Currency Names: Rupee, Dollar, Euro, Pound
- Default Fallback: INR (smart default)
def _detect_currency_in_text(self, text: str) -> str:
"""
Detect currency from text using multiple strategies.
Strategy priority:
1. Currency symbols (?, $, €, £)
2. Currency codes (INR, USD, EUR, GBP)
3. Currency names (Rupee, Dollar, Euro, Pound)
4. Default fallback
"""
text_upper = text.upper()
# Strategy 1: Symbols
if '?' in text or 'RS' in text_upper or 'RS.' in text_upper:
return 'INR'
elif '$' in text:
return 'USD'
elif '€' in text:
return 'EUR'
elif '£' in text:
return 'GBP'
# Strategy 2: Currency codes
if 'INR' in text_upper:
return 'INR'
elif 'USD' in text_upper:
return 'USD'
elif 'EUR' in text_upper:
return 'EUR'
elif 'GBP' in text_upper:
return 'GBP'
# Strategy 3: Currency names
if 'RUPEE' in text_upper or 'INDIAN' in text_upper:
return 'INR'
elif 'DOLLAR' in text_upper:
return 'USD'
elif 'EURO' in text_upper:
return 'EUR'
elif 'POUND' in text_upper or 'STERLING' in text_upper:
return 'GBP'
# Strategy 4: Default
logger.debug(f"Currency not detected in '{text[:50]}...', using default {self.default_currency}")
return self.default_currency
8.1.7 Mode Detection Logic
def _detect_mode_in_text(self, text: str) -> str:
"""
Detect optimization mode from text.
Keywords:
- greedy: "greedy", "standard", "default"
- balanced: "balanced", "mixed"
- minimize_large: "minimize large", "fewer notes", "min notes"
- minimize_small: "minimize small", "fewer coins", "min coins"
"""
text_lower = text.lower()
if 'balanced' in text_lower or 'mixed' in text_lower:
return 'balanced'
elif 'minimize large' in text_lower or 'min large' in text_lower or 'fewer notes' in text_lower:
return 'minimize_large'
elif 'minimize small' in text_lower or 'min small' in text_lower or 'fewer coins' in text_lower:
return 'minimize_small'
elif 'greedy' in text_lower:
return 'greedy'
# Default
return self.default_mode
8.2 Tesseract Installation
8.2.1 Windows Silent Installation Script
File: packages/local-backend/install_ocr_dependencies.ps1
# Tesseract 5.3.3+ Silent Installer
# Run from an elevated (Administrator) PowerShell session: the script installs
# under Program Files and edits the machine-wide PATH.
$ErrorActionPreference = "Stop"   # abort on a failed download or install instead of continuing

$tesseractVersion = "5.3.3"
$installerUrl = "https://digi.bib.uni-mannheim.de/tesseract/tesseract-ocr-w64-setup-5.3.3.20231005.exe"
$installerPath = "$env:TEMP\tesseract-setup.exe"
$tesseractPath = "C:\Program Files\Tesseract-OCR"

Write-Host "Downloading Tesseract $tesseractVersion..." -ForegroundColor Cyan
# Download with progress
Invoke-WebRequest -Uri $installerUrl -OutFile $installerPath

Write-Host "Installing Tesseract (silent mode)..." -ForegroundColor Cyan
# Silent install arguments
$installArgs = @(
    "/S",                 # Silent mode
    "/D=$tesseractPath"   # Install directory
)
$installer = Start-Process -FilePath $installerPath -ArgumentList $installArgs -Wait -PassThru
if ($installer.ExitCode -ne 0) {
    throw "Tesseract installer exited with code $($installer.ExitCode)"
}

# Add to PATH.
# Start from the stored machine PATH, not $env:Path: the latter is this
# process's merged user + machine value and would copy user entries into the
# machine scope. Skip the write when the entry already exists so that re-runs
# do not append duplicates.
$machinePath = [Environment]::GetEnvironmentVariable("Path", [EnvironmentVariableTarget]::Machine)
if (($machinePath -split ';') -notcontains $tesseractPath) {
    [Environment]::SetEnvironmentVariable("Path", "$($machinePath.TrimEnd(';'));$tesseractPath", [EnvironmentVariableTarget]::Machine)
}
# The machine-level change does not reach the current session, so extend this
# session's PATH as well; the verification below depends on it.
if (($env:Path -split ';') -notcontains $tesseractPath) {
    $env:Path = "$($env:Path.TrimEnd(';'));$tesseractPath"
}

Write-Host "Tesseract installed successfully!" -ForegroundColor Green
# Verify installation
tesseract --version
8.2.2 Poppler Installation (for pdf2image)
Requirement: Poppler 24.08.0+ for PDF to image conversion
# Poppler Silent Installer
# Run from an elevated (Administrator) PowerShell session: the script extracts
# under Program Files and edits the machine-wide PATH.
$ErrorActionPreference = "Stop"   # abort on a failed download or extraction instead of continuing

$popplerVersion = "24.08.0"
$downloadUrl = "https://github.com/oschwartz10612/poppler-windows/releases/download/v$popplerVersion/Release-$popplerVersion-0.zip"
$zipPath = "$env:TEMP\poppler.zip"
$extractPath = "C:\Program Files\poppler"

Write-Host "Downloading Poppler $popplerVersion..." -ForegroundColor Cyan
Invoke-WebRequest -Uri $downloadUrl -OutFile $zipPath

Write-Host "Extracting Poppler..." -ForegroundColor Cyan
Expand-Archive -Path $zipPath -DestinationPath $extractPath -Force

# Add bin directory to PATH.
# NOTE(review): assumes the archive unpacks to poppler-<version>\Library\bin - confirm against the release.
$popplerBin = "$extractPath\poppler-$popplerVersion\Library\bin"
if (-not (Test-Path $popplerBin)) {
    Write-Warning "Expected Poppler bin directory not found: $popplerBin"
}
# Start from the stored machine PATH, not $env:Path: the latter is this
# process's merged user + machine value and would copy user entries into the
# machine scope. Skip the write when the entry already exists so that re-runs
# do not append duplicates.
$machinePath = [Environment]::GetEnvironmentVariable("Path", [EnvironmentVariableTarget]::Machine)
if (($machinePath -split ';') -notcontains $popplerBin) {
    [Environment]::SetEnvironmentVariable("Path", "$($machinePath.TrimEnd(';'));$popplerBin", [EnvironmentVariableTarget]::Machine)
}
# The machine-level change does not reach the current session, so extend this
# session's PATH as well.
if (($env:Path -split ';') -notcontains $popplerBin) {
    $env:Path = "$($env:Path.TrimEnd(';'));$popplerBin"
}

Write-Host "Poppler installed successfully!" -ForegroundColor Green
8.3 OCR Accuracy Enhancements
8.3.1 Image Preprocessing
Implemented Enhancements
- ✅ Grayscale Conversion: Reduces noise
- ✅ DPI Upscaling: Minimum 1000px width for Tesseract
- ✅ Contrast Enhancement: Better text visibility (note: not present in the `_preprocess_image` listing in 8.1.2)
- ✅ Denoising: Remove artifacts (note: not present in the `_preprocess_image` listing in 8.1.2)
Future Enhancements
- ⏳ Adaptive thresholding
- ⏳ Skew correction
- ⏳ Border removal
- ⏳ Language-specific training data
8.3.2 Multi-Language OCR Support
| Language | Code | Tesseract Pack | Status |
|---|---|---|---|
| English | eng | eng.traineddata | Included |
| Hindi | hin | hin.traineddata | Optional |
| Spanish | spa | spa.traineddata | Optional |
| French | fra | fra.traineddata | Optional |
| German | deu | deu.traineddata | Optional |
Language-Specific Processing
# Tesseract language codes accepted by process_with_language, mapped to
# their display names.
SUPPORTED_LANGUAGES = dict(
    eng='English',
    hin='Hindi',
    spa='Spanish',
    fra='French',
    deu='German',
)


def process_with_language(self, image: Image.Image, lang_code: str = 'eng') -> str:
    """Run OCR on an image using the given Tesseract language.

    A language code that is not a key of SUPPORTED_LANGUAGES is replaced by
    'eng' after a warning is logged.
    """
    language = lang_code
    if language not in SUPPORTED_LANGUAGES:
        logger.warning(f"Language {lang_code} not supported, using English")
        language = 'eng'
    # Same engine and segmentation modes as elsewhere, plus the language flag.
    return pytesseract.image_to_string(image, config=f'--oem 1 --psm 6 -l {language}')
Language Pack Installation
# Check installed languages (prints the language codes Tesseract can currently use)
tesseract --list-langs
# Download additional language packs
# From: https://github.com/tesseract-ocr/tessdata
# Place in: C:\Program Files\Tesseract-OCR\tessdata\
# Example: Install Hindi
# 1. Download hin.traineddata
# 2. Copy to C:\Program Files\Tesseract-OCR\tessdata\hin.traineddata
# 3. Run the --list-langs command above again and confirm that "hin" is listed
✅ Section Complete
This section covers complete OCR system implementation including Tesseract integration, 5 text parsing strategies, currency/mode detection (4 + 1 strategies), installation scripts for Windows, and multi-language support.