7. Bulk Upload System Specification
7.1 CSV Processing
File: packages/local-backend/app/services/csv_processor.py
7.1.1 Supported CSV Formats
Standard Format (All columns)
amount,currency,mode
1000,INR,greedy
2500,USD,balanced
500.50,EUR,minimize_large
Minimal Format (Defaults applied)
amount,currency
1000,INR
2500,USD
500,EUR
Ultra-Minimal Format (Smart defaults)
amount
1000
2500
500
Defaults: currency=INR, mode=greedy
7.1.2 CSV Parser Implementation
import csv
import logging
from typing import List, Dict
from decimal import Decimal
from pathlib import Path
logger = logging.getLogger(__name__)
class CSVProcessor:
"""Process CSV files for bulk calculations."""
REQUIRED_COLUMNS = {'amount'}
OPTIONAL_COLUMNS = {'currency', 'mode'}
VALID_COLUMNS = REQUIRED_COLUMNS | OPTIONAL_COLUMNS
DEFAULT_CURRENCY = 'INR'
DEFAULT_MODE = 'greedy'
def process_csv(self, file_path: Path) -> List[Dict]:
"""
Parse CSV file and extract calculation data.
Args:
file_path: Path to CSV file
Returns:
List of calculation dictionaries
Raises:
ValueError: If CSV format is invalid
"""
results = []
try:
with open(file_path, 'r', encoding='utf-8-sig') as f:
# Detect delimiter (support comma and semicolon)
sample = f.read(1024)
f.seek(0)
delimiter = ',' if ',' in sample else ';'
reader = csv.DictReader(f, delimiter=delimiter)
# Validate headers
if not reader.fieldnames:
raise ValueError("CSV file is empty")
headers = {h.strip().lower() for h in reader.fieldnames}
# Check required columns
if 'amount' not in headers:
raise ValueError("CSV must contain 'amount' column")
# Process rows
for row_num, row in enumerate(reader, start=2):
try:
calc_data = self._parse_row(row, row_num)
results.append(calc_data)
except Exception as e:
logger.error(f"Row {row_num} error: {e}")
results.append({
'row_number': row_num,
'error': str(e),
'status': 'failed'
})
except Exception as e:
raise ValueError(f"Failed to process CSV: {e}")
return results
def _parse_row(self, row: Dict, row_number: int) -> Dict:
"""Parse a single CSV row."""
# Extract and clean amount
amount_str = row.get('amount', '').strip()
if not amount_str:
raise ValueError("Amount is required")
# Remove common formatting
amount_str = amount_str.replace(',', '').replace('$', '').replace('?', '')
try:
amount = Decimal(amount_str)
except:
raise ValueError(f"Invalid amount: {amount_str}")
# Validate amount
if amount <= 0:
raise ValueError("Amount must be greater than 0")
# Extract currency (with default)
currency = row.get('currency', '').strip().upper()
if not currency:
currency = self.DEFAULT_CURRENCY
logger.info(f"Row {row_number}: Using default currency {currency}")
# Extract mode (with default)
mode = row.get('mode', '').strip().lower()
if not mode:
mode = self.DEFAULT_MODE
logger.info(f"Row {row_number}: Using default mode {mode}")
return {
'row_number': row_number,
'amount': amount,
'currency': currency,
'mode': mode,
'status': 'pending'
}
7.1.3 Edge Cases Handling
| Edge Case | Handling | Code |
|---|---|---|
| Empty Rows | Skip silently | if not any(row.values()): continue |
| BOM (Byte Order Mark) | Handle UTF-8 with BOM | encoding='utf-8-sig' |
| Different Delimiters | Auto-detect comma or semicolon | delimiter = ',' if ',' in sample else ';' |
| Currency Symbols in Amount | Strip before parsing | amount_str.replace('$', '').replace('?', '') |
| Case Sensitivity | Normalize headers and values | headers = {h.strip().lower()} |
7.2 PDF Processing
File: packages/local-backend/app/services/pdf_processor.py
7.2.1 PDF Text Extraction
import fitz # PyMuPDF
from typing import Dict, List
import logging
logger = logging.getLogger(__name__)
class PDFProcessor:
"""Process PDF files with text extraction and OCR."""
def __init__(self):
self.ocr_processor = None # Lazy load
async def process_pdf(self, file_path: Path) -> List[Dict]:
"""
Process PDF file (text or scanned).
Strategy:
1. Try extracting embedded text
2. If no text found, use OCR on pages
3. Parse extracted content
"""
try:
doc = fitz.open(file_path)
# Check if PDF has text
has_text = self._has_text_content(doc)
if has_text:
logger.info(f"PDF has text content, extracting...")
text = self._extract_text(doc)
else:
logger.info(f"PDF is scanned, using OCR...")
text = await self._extract_with_ocr(doc, file_path)
doc.close()
# Parse extracted text
return self._parse_text_content(text)
except Exception as e:
raise ValueError(f"Failed to process PDF: {e}")
def _has_text_content(self, doc: fitz.Document) -> bool:
"""Check if PDF contains extractable text."""
for page in doc:
text = page.get_text().strip()
if len(text) > 50: # Arbitrary threshold
return True
return False
def _extract_text(self, doc: fitz.Document) -> str:
"""Extract text from all pages."""
text_parts = []
for page_num, page in enumerate(doc, start=1):
text = page.get_text()
if text.strip():
text_parts.append(f"=== Page {page_num} ===\n{text}")
return "\n\n".join(text_parts)
7.2.2 Hybrid PDF Processing
Detection Logic
def detect_pdf_type(doc: fitz.Document) -> str:
"""
Detect PDF type.
Returns:
'text': Has extractable text
'scanned': Image-based (needs OCR)
'hybrid': Mix of text and images
"""
text_pages = 0
image_pages = 0
for page in doc:
text = page.get_text().strip()
images = page.get_images()
if len(text) > 100:
text_pages += 1
elif images:
image_pages += 1
if text_pages > 0 and image_pages == 0:
return 'text'
elif image_pages > 0 and text_pages == 0:
return 'scanned'
else:
return 'hybrid'
Hybrid Processing Strategy:
- Iterate through each page
- Attempt text extraction first
- If text insufficient (< 50 chars), convert page to image
- Apply OCR to image
- Combine results from all pages
7.3 Word Document Processing
File: packages/local-backend/app/services/word_processor.py
7.3.1 DOCX Extraction
from docx import Document
from pathlib import Path
from typing import List, Dict
import logging
logger = logging.getLogger(__name__)
class WordProcessor:
"""Process Word documents (.docx)."""
async def process_word(self, file_path: Path) -> List[Dict]:
"""Extract text from Word document and parse."""
try:
doc = Document(file_path)
# Extract all text
text_parts = []
# Extract from paragraphs
for para in doc.paragraphs:
if para.text.strip():
text_parts.append(para.text)
# Extract from tables
for table in doc.tables:
table_text = self._extract_table(table)
if table_text:
text_parts.append(table_text)
full_text = "\n".join(text_parts)
logger.info(f"Extracted {len(full_text)} characters from Word document")
# Parse content
from app.services.text_parser import TextParser
parser = TextParser()
return parser.parse(full_text)
except Exception as e:
raise ValueError(f"Failed to process Word document: {e}")
def _extract_table(self, table) -> str:
"""Extract text from Word table."""
rows = []
for row in table.rows:
cells = [cell.text.strip() for cell in row.cells]
if any(cells): # Skip empty rows
rows.append("\t".join(cells))
return "\n".join(rows)
7.3.2 Table Detection & Parsing
def parse_table_as_csv(table) -> List[Dict]:
"""
Parse Word table as CSV-like data.
Assumes first row is header.
"""
if len(table.rows) < 2:
return []
# Extract headers
headers = [cell.text.strip().lower() for cell in table.rows[0].cells]
# Extract data rows
results = []
for row_num, row in enumerate(table.rows[1:], start=2):
row_data = {}
for header, cell in zip(headers, row.cells):
row_data[header] = cell.text.strip()
results.append({
'row_number': row_num,
**row_data
})
return results
7.4 Image Processing
File: packages/local-backend/app/services/image_processor.py
from PIL import Image
from pathlib import Path
from typing import List, Dict
import logging
logger = logging.getLogger(__name__)
class ImageProcessor:
"""Process image files for OCR."""
def __init__(self):
self.ocr_processor = None
async def process_image(self, file_path: Path) -> List[Dict]:
"""Process image file with OCR."""
try:
# Validate image
img = Image.open(file_path)
# Preprocess if needed
img = self._preprocess_image(img)
# Save preprocessed image
temp_path = Path(f"/tmp/preprocessed_{file_path.name}")
img.save(temp_path)
# OCR processing
if not self.ocr_processor:
from app.services.ocr_processor import OCRProcessor
self.ocr_processor = OCRProcessor()
text = await self.ocr_processor.process_image(temp_path)
# Cleanup
temp_path.unlink(missing_ok=True)
# Parse text
from app.services.text_parser import TextParser
parser = TextParser()
return parser.parse(text)
except Exception as e:
raise ValueError(f"Failed to process image: {e}")
def _preprocess_image(self, img: Image.Image) -> Image.Image:
"""
Preprocess image for better OCR accuracy.
Steps:
- Convert to grayscale
- Increase contrast
- Denoise (if needed)
- Resize if too small
"""
# Convert to grayscale
if img.mode != 'L':
img = img.convert('L')
# Resize if too small (OCR works better on larger images)
min_width = 1000
if img.width < min_width:
scale = min_width / img.width
new_size = (int(img.width * scale), int(img.height * scale))
img = img.resize(new_size, Image.LANCZOS)
return img
7.5 Text Parsing Logic
File: packages/local-backend/app/services/text_parser.py
7.5.1 Intelligent Format Detection
import re
from decimal import Decimal
from typing import List, Dict
import logging
logger = logging.getLogger(__name__)
class TextParser:
"""Intelligent text parser for extracting calculation data."""
# Patterns for amount detection
AMOUNT_PATTERNS = [
r'(\d{1,3}(?:,\d{3})*(?:\.\d{2})?)', # 1,000.00
r'(\d+\.\d{2})', # 1000.00
r'(\d+)', # 1000
]
# Currency symbols and codes
CURRENCY_SYMBOLS = {
'?': 'INR', 'Rs': 'INR', 'INR': 'INR',
'$': 'USD', 'USD': 'USD',
'€': 'EUR', 'EUR': 'EUR',
'£': 'GBP', 'GBP': 'GBP',
}
def parse(self, text: str) -> List[Dict]:
"""
Parse text to extract calculation data.
Strategies:
1. Structured format (CSV-like in text)
2. Labeled format ("Amount: 1000, Currency: INR")
3. List format ("1000 INR", "2500 USD")
4. Plain numbers (with smart defaults)
5. Mixed format (best effort)
"""
# Try structured parsing first
if self._is_csv_like(text):
return self._parse_csv_like(text)
# Try labeled format
if self._has_labels(text):
return self._parse_labeled(text)
# Try list format
if self._is_list_like(text):
return self._parse_list(text)
# Fallback: extract all numbers
return self._parse_numbers_only(text)
7.5.2 Parsing Strategies (5 Formats)
Format 1: CSV-like
amount,currency,mode
1000,INR,greedy
2500,USD,balanced
Format 2: Labeled
Amount: 1000
Currency: INR
Mode: greedy
Amount: 2500
Currency: USD
Mode: balanced
Format 3: List with Currency
1000 INR
2500 USD
500 EUR
Format 4: Plain Numbers
1000
2500
500
Smart defaults: INR, greedy
Format 5: Natural Language (Mixed)
Please calculate 1000 rupees
Process $2500 with balanced optimization
Calculate 500 euros
7.5.3 Currency Detection Strategies
| Strategy | Pattern | Priority |
|---|---|---|
| Currency Symbols | ?, $, €, £ ? INR, USD, EUR, GBP | Highest |
| 3-Letter Codes | INR, USD, EUR, GBP (case-insensitive) | High |
| Currency Names | rupee/rupees ? INR, dollar/dollars ? USD | Medium |
| Default Fallback | If all strategies fail ? INR | Lowest |
7.5.4 Mode Detection Keywords
MODE_KEYWORDS = {
'greedy': ['greedy', 'standard', 'default', 'largest'],
'balanced': ['balanced', 'mixed', 'even'],
'minimize_large': ['minimize large', 'min large', 'fewer notes'],
'minimize_small': ['minimize small', 'min small', 'fewer coins'],
}
? Section Complete
This section covers the complete Bulk Upload System including CSV processing, PDF extraction (text + OCR), Word document parsing, image processing, and intelligent text parsing with 5 format support and smart defaults.