7. Bulk Upload System Specification

7.1 CSV Processing

File: packages/local-backend/app/services/csv_processor.py

7.1.1 Supported CSV Formats

Standard Format (All columns)

amount,currency,mode
1000,INR,greedy
2500,USD,balanced
500.50,EUR,minimize_large

Minimal Format (Defaults applied)

amount,currency
1000,INR
2500,USD
500,EUR

Ultra-Minimal Format (Smart defaults)

amount
1000
2500
500

Defaults: currency=INR, mode=greedy
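
For reference, all three formats normalize to the same internal structure. An illustrative sketch (not actual program output) of what the ultra-minimal sample yields once the defaults are applied:

# Illustrative result of process_csv() on the ultra-minimal sample above
[
    {'row_number': 2, 'amount': Decimal('1000'), 'currency': 'INR', 'mode': 'greedy', 'status': 'pending'},
    {'row_number': 3, 'amount': Decimal('2500'), 'currency': 'INR', 'mode': 'greedy', 'status': 'pending'},
    {'row_number': 4, 'amount': Decimal('500'), 'currency': 'INR', 'mode': 'greedy', 'status': 'pending'},
]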

7.1.2 CSV Parser Implementation

import csv
import logging
from typing import List, Dict
from decimal import Decimal, InvalidOperation
from pathlib import Path

logger = logging.getLogger(__name__)

class CSVProcessor:
    """Process CSV files for bulk calculations."""
    
    REQUIRED_COLUMNS = {'amount'}
    OPTIONAL_COLUMNS = {'currency', 'mode'}
    VALID_COLUMNS = REQUIRED_COLUMNS | OPTIONAL_COLUMNS
    
    DEFAULT_CURRENCY = 'INR'
    DEFAULT_MODE = 'greedy'
    
    def process_csv(self, file_path: Path) -> List[Dict]:
        """
        Parse CSV file and extract calculation data.
        
        Args:
            file_path: Path to CSV file
            
        Returns:
            List of calculation dictionaries
            
        Raises:
            ValueError: If CSV format is invalid
        """
        results = []
        
        try:
            with open(file_path, 'r', encoding='utf-8-sig') as f:
                # Detect delimiter (support comma and semicolon)
                sample = f.read(1024)
                f.seek(0)
                delimiter = ',' if ',' in sample else ';'
                
                reader = csv.DictReader(f, delimiter=delimiter)
                
                # Validate headers
                if not reader.fieldnames:
                    raise ValueError("CSV file is empty")
                
                headers = {h.strip().lower() for h in reader.fieldnames}
                
                # Check required columns
                if 'amount' not in headers:
                    raise ValueError("CSV must contain 'amount' column")
                
                # Process rows
                for row_num, row in enumerate(reader, start=2):
                    # Skip empty rows silently
                    if not any(row.values()):
                        continue
                    
                    # Normalize keys to match the lower-cased header check above
                    row = {k.strip().lower(): (v or '') for k, v in row.items() if k}
                    
                    try:
                        calc_data = self._parse_row(row, row_num)
                        results.append(calc_data)
                    except Exception as e:
                        logger.error(f"Row {row_num} error: {e}")
                        results.append({
                            'row_number': row_num,
                            'error': str(e),
                            'status': 'failed'
                        })
        
        except Exception as e:
            raise ValueError(f"Failed to process CSV: {e}")
        
        return results
    
    def _parse_row(self, row: Dict, row_number: int) -> Dict:
        """Parse a single CSV row."""
        # Extract and clean amount
        amount_str = row.get('amount', '').strip()
        if not amount_str:
            raise ValueError("Amount is required")
        
        # Remove common formatting
        amount_str = amount_str.replace(',', '').replace('$', '').replace('₹', '')
        
        try:
            amount = Decimal(amount_str)
        except InvalidOperation:
            raise ValueError(f"Invalid amount: {amount_str}")
        
        # Validate amount
        if amount <= 0:
            raise ValueError("Amount must be greater than 0")
        
        # Extract currency (with default)
        currency = row.get('currency', '').strip().upper()
        if not currency:
            currency = self.DEFAULT_CURRENCY
            logger.info(f"Row {row_number}: Using default currency {currency}")
        
        # Extract mode (with default)
        mode = row.get('mode', '').strip().lower()
        if not mode:
            mode = self.DEFAULT_MODE
            logger.info(f"Row {row_number}: Using default mode {mode}")
        
        return {
            'row_number': row_number,
            'amount': amount,
            'currency': currency,
            'mode': mode,
            'status': 'pending'
        }

7.1.3 Edge Cases Handling

Edge Case                  | Handling                       | Code
---------------------------|--------------------------------|---------------------------------------------------------
Empty Rows                 | Skip silently                  | if not any(row.values()): continue
BOM (Byte Order Mark)      | Handle UTF-8 with BOM          | encoding='utf-8-sig'
Different Delimiters       | Auto-detect comma or semicolon | delimiter = ',' if ',' in sample else ';'
Currency Symbols in Amount | Strip before parsing           | amount_str.replace('$', '').replace('₹', '')
Case Sensitivity           | Normalize headers and values   | headers = {h.strip().lower() for h in reader.fieldnames}
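
Where the comma-vs-semicolon check above is not robust enough (for example, semicolon files whose quoted fields contain commas), the standard library's csv.Sniffer could be substituted. A sketch under that assumption, not part of this specification:

import csv

def detect_delimiter(sample: str, default: str = ',') -> str:
    """Guess the delimiter from a sample of the file; fall back to the default."""
    try:
        return csv.Sniffer().sniff(sample, delimiters=',;').delimiter
    except csv.Error:
        return default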

7.2 PDF Processing

File: packages/local-backend/app/services/pdf_processor.py

7.2.1 PDF Text Extraction

import fitz  # PyMuPDF
from pathlib import Path
from typing import Dict, List
import logging

logger = logging.getLogger(__name__)

class PDFProcessor:
    """Process PDF files with text extraction and OCR."""
    
    def __init__(self):
        self.ocr_processor = None  # Lazy load
    
    async def process_pdf(self, file_path: Path) -> List[Dict]:
        """
        Process PDF file (text or scanned).
        
        Strategy:
        1. Try extracting embedded text
        2. If no text found, use OCR on pages
        3. Parse extracted content
        """
        try:
            doc = fitz.open(file_path)
            
            # Check if PDF has text
            has_text = self._has_text_content(doc)
            
            if has_text:
                logger.info(f"PDF has text content, extracting...")
                text = self._extract_text(doc)
            else:
                logger.info(f"PDF is scanned, using OCR...")
                text = await self._extract_with_ocr(doc, file_path)
            
            doc.close()
            
            # Parse extracted text
            return self._parse_text_content(text)
        
        except Exception as e:
            raise ValueError(f"Failed to process PDF: {e}")
    
    def _has_text_content(self, doc: fitz.Document) -> bool:
        """Check if PDF contains extractable text."""
        for page in doc:
            text = page.get_text().strip()
            if len(text) > 50:  # Arbitrary threshold
                return True
        return False
    
    def _extract_text(self, doc: fitz.Document) -> str:
        """Extract text from all pages."""
        text_parts = []
        
        for page_num, page in enumerate(doc, start=1):
            text = page.get_text()
            if text.strip():
                text_parts.append(f"=== Page {page_num} ===\n{text}")
        
        return "\n\n".join(text_parts)

7.2.2 Hybrid PDF Processing

Detection Logic

def detect_pdf_type(doc: fitz.Document) -> str:
    """
    Detect PDF type.
    
    Returns:
        'text': Has extractable text
        'scanned': Image-based (needs OCR)
        'hybrid': Mix of text and images
    """
    text_pages = 0
    image_pages = 0
    
    for page in doc:
        text = page.get_text().strip()
        images = page.get_images()
        
        if len(text) > 100:
            text_pages += 1
        elif images:
            image_pages += 1
    
    if text_pages > 0 and image_pages == 0:
        return 'text'
    elif image_pages > 0 and text_pages == 0:
        return 'scanned'
    else:
        return 'hybrid'

Hybrid Processing Strategy (see the sketch after this list):
  1. Iterate through each page
  2. Attempt text extraction first
  3. If text insufficient (< 50 chars), convert page to image
  4. Apply OCR to image
  5. Combine results from all pages
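
A per-page sketch of that strategy, continuing the PDFProcessor class; the method name _process_hybrid, the 2x render zoom, and the temporary file naming are assumptions:

    async def _process_hybrid(self, doc: fitz.Document, file_path: Path) -> str:
        """Per-page hybrid extraction: embedded text first, OCR fallback (illustrative sketch)."""
        if not self.ocr_processor:
            from app.services.ocr_processor import OCRProcessor
            self.ocr_processor = OCRProcessor()
        
        text_parts = []
        for page_num, page in enumerate(doc, start=1):
            text = page.get_text().strip()
            
            if len(text) < 50:
                # Text insufficient: render the page to an image and OCR it
                pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
                page_image = file_path.parent / f"{file_path.stem}_page{page_num}.png"
                pix.save(str(page_image))
                text = await self.ocr_processor.process_image(page_image)
                page_image.unlink(missing_ok=True)
            
            if text.strip():
                text_parts.append(f"=== Page {page_num} ===\n{text}")
        
        return "\n\n".join(text_parts)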

7.3 Word Document Processing

File: packages/local-backend/app/services/word_processor.py

7.3.1 DOCX Extraction

from docx import Document
from pathlib import Path
from typing import List, Dict
import logging

logger = logging.getLogger(__name__)

class WordProcessor:
    """Process Word documents (.docx)."""
    
    async def process_word(self, file_path: Path) -> List[Dict]:
        """Extract text from Word document and parse."""
        try:
            doc = Document(file_path)
            
            # Extract all text
            text_parts = []
            
            # Extract from paragraphs
            for para in doc.paragraphs:
                if para.text.strip():
                    text_parts.append(para.text)
            
            # Extract from tables
            for table in doc.tables:
                table_text = self._extract_table(table)
                if table_text:
                    text_parts.append(table_text)
            
            full_text = "\n".join(text_parts)
            
            logger.info(f"Extracted {len(full_text)} characters from Word document")
            
            # Parse content
            from app.services.text_parser import TextParser
            parser = TextParser()
            return parser.parse(full_text)
        
        except Exception as e:
            raise ValueError(f"Failed to process Word document: {e}")
    
    def _extract_table(self, table) -> str:
        """Extract text from Word table."""
        rows = []
        
        for row in table.rows:
            cells = [cell.text.strip() for cell in row.cells]
            if any(cells):  # Skip empty rows
                rows.append("\t".join(cells))
        
        return "\n".join(rows)

7.3.2 Table Detection & Parsing

def parse_table_as_csv(table) -> List[Dict]:
    """
    Parse Word table as CSV-like data.
    
    Assumes first row is header.
    """
    if len(table.rows) < 2:
        return []
    
    # Extract headers
    headers = [cell.text.strip().lower() for cell in table.rows[0].cells]
    
    # Extract data rows
    results = []
    for row_num, row in enumerate(table.rows[1:], start=2):
        row_data = {}
        for header, cell in zip(headers, row.cells):
            row_data[header] = cell.text.strip()
        
        results.append({
            'row_number': row_num,
            **row_data
        })
    
    return results
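
A usage sketch tying the pieces together, assuming a document whose first table uses the amount/currency/mode headers from 7.1 (the file name is hypothetical):

from docx import Document

doc = Document("bulk_amounts.docx")

if doc.tables:
    rows = parse_table_as_csv(doc.tables[0])
    # e.g. [{'row_number': 2, 'amount': '1000', 'currency': 'INR', 'mode': 'greedy'}, ...]
    print(rows)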

7.4 Image Processing

File: packages/local-backend/app/services/image_processor.py

from PIL import Image
from pathlib import Path
from typing import List, Dict
import logging

logger = logging.getLogger(__name__)

class ImageProcessor:
    """Process image files for OCR."""
    
    def __init__(self):
        self.ocr_processor = None
    
    async def process_image(self, file_path: Path) -> List[Dict]:
        """Process image file with OCR."""
        try:
            # Validate image
            img = Image.open(file_path)
            
            # Preprocess if needed
            img = self._preprocess_image(img)
            
            # Save preprocessed image
            temp_path = Path(f"/tmp/preprocessed_{file_path.name}")
            img.save(temp_path)
            
            # OCR processing
            if not self.ocr_processor:
                from app.services.ocr_processor import OCRProcessor
                self.ocr_processor = OCRProcessor()
            
            text = await self.ocr_processor.process_image(temp_path)
            
            # Cleanup
            temp_path.unlink(missing_ok=True)
            
            # Parse text
            from app.services.text_parser import TextParser
            parser = TextParser()
            return parser.parse(text)
        
        except Exception as e:
            raise ValueError(f"Failed to process image: {e}")
    
    def _preprocess_image(self, img: Image.Image) -> Image.Image:
        """
        Preprocess image for better OCR accuracy.
        
        Steps:
        - Convert to grayscale
        - Increase contrast
        - Denoise (if needed)
        - Resize if too small
        """
        # Convert to grayscale
        if img.mode != 'L':
            img = img.convert('L')
        
        # Resize if too small (OCR works better on larger images)
        min_width = 1000
        if img.width < min_width:
            scale = min_width / img.width
            new_size = (int(img.width * scale), int(img.height * scale))
            img = img.resize(new_size, Image.LANCZOS)
        
        return img
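
The docstring above also lists contrast enhancement and denoising, which the snippet does not implement. One way to add them with Pillow, offered as an assumption rather than the mandated approach (the function name and tuning factors are illustrative):

from PIL import Image, ImageEnhance, ImageFilter

def enhance_for_ocr(img: Image.Image) -> Image.Image:
    """Optional extra preprocessing: boost contrast and apply a light median denoise."""
    img = ImageEnhance.Contrast(img).enhance(1.5)        # 1.5x contrast; tune per corpus
    img = img.filter(ImageFilter.MedianFilter(size=3))   # remove salt-and-pepper noise
    return img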

7.5 Text Parsing Logic

File: packages/local-backend/app/services/text_parser.py

7.5.1 Intelligent Format Detection

import re
from decimal import Decimal
from typing import List, Dict
import logging

logger = logging.getLogger(__name__)

class TextParser:
    """Intelligent text parser for extracting calculation data."""
    
    # Patterns for amount detection
    AMOUNT_PATTERNS = [
        r'(\d{1,3}(?:,\d{3})*(?:\.\d{2})?)',  # 1,000.00
        r'(\d+\.\d{2})',  # 1000.00
        r'(\d+)',  # 1000
    ]
    
    # Currency symbols and codes
    CURRENCY_SYMBOLS = {
        '₹': 'INR', 'Rs': 'INR', 'INR': 'INR',
        '$': 'USD', 'USD': 'USD',
        '€': 'EUR', 'EUR': 'EUR',
        '£': 'GBP', 'GBP': 'GBP',
    }
    
    def parse(self, text: str) -> List[Dict]:
        """
        Parse text to extract calculation data.
        
        Strategies:
        1. Structured format (CSV-like in text)
        2. Labeled format ("Amount: 1000, Currency: INR")
        3. List format ("1000 INR", "2500 USD")
        4. Plain numbers (with smart defaults)
        5. Mixed format (best effort)
        """
        # Try structured parsing first
        if self._is_csv_like(text):
            return self._parse_csv_like(text)
        
        # Try labeled format
        if self._has_labels(text):
            return self._parse_labeled(text)
        
        # Try list format
        if self._is_list_like(text):
            return self._parse_list(text)
        
        # Fallback: extract all numbers
        return self._parse_numbers_only(text)
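
The strategy helpers are not reproduced in this spec. A sketch of the list-format pair (_is_list_like / _parse_list), continuing the TextParser class; the LIST_LINE pattern and the 50% match threshold are assumptions, and the defaults mirror 7.1:

    LIST_LINE = re.compile(r'^\s*([\d,]+(?:\.\d+)?)\s*([A-Za-z]{3})?\s*$')
    
    def _is_list_like(self, text: str) -> bool:
        """True if most non-empty lines look like '<amount> [<currency code>]'."""
        lines = [line for line in text.splitlines() if line.strip()]
        if not lines:
            return False
        matches = sum(1 for line in lines if self.LIST_LINE.match(line))
        return matches / len(lines) > 0.5
    
    def _parse_list(self, text: str) -> List[Dict]:
        """Parse lines such as '1000 INR' or bare '2500', applying defaults."""
        results = []
        for row_num, line in enumerate(text.splitlines(), start=1):
            match = self.LIST_LINE.match(line)
            if not match:
                continue
            results.append({
                'row_number': row_num,
                'amount': Decimal(match.group(1).replace(',', '')),
                'currency': (match.group(2) or 'INR').upper(),
                'mode': 'greedy',
                'status': 'pending',
            })
        return results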

7.5.2 Parsing Strategies (5 Formats)

Format 1: CSV-like

amount,currency,mode
1000,INR,greedy
2500,USD,balanced

Format 2: Labeled

Amount: 1000
Currency: INR
Mode: greedy

Amount: 2500
Currency: USD
Mode: balanced

Format 3: List with Currency

1000 INR
2500 USD
500 EUR

Format 4: Plain Numbers

1000
2500
500

Smart defaults: INR, greedy

Format 5: Natural Language (Mixed)

Please calculate 1000 rupees
Process $2500 with balanced optimization
Calculate 500 euros

7.5.3 Currency Detection Strategies

Strategy         | Pattern                                   | Priority
-----------------|-------------------------------------------|---------
Currency Symbols | ₹, $, €, £ → INR, USD, EUR, GBP           | Highest
3-Letter Codes   | INR, USD, EUR, GBP (case-insensitive)     | High
Currency Names   | rupee/rupees → INR, dollar/dollars → USD  | Medium
Default Fallback | If all strategies fail → INR              | Lowest
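
A sketch of that priority order as code, continuing the TextParser class; CURRENCY_NAMES and the method name detect_currency are assumptions built on the CURRENCY_SYMBOLS map from 7.5.1:

    CURRENCY_NAMES = {
        'rupee': 'INR', 'rupees': 'INR',
        'dollar': 'USD', 'dollars': 'USD',
        'euro': 'EUR', 'euros': 'EUR',
        'pound': 'GBP', 'pounds': 'GBP',
    }
    
    def detect_currency(self, text: str) -> str:
        """Resolve a currency using symbol > code > name > default priority."""
        # 1. Currency symbols (highest priority)
        for symbol in ('₹', '$', '€', '£'):
            if symbol in text:
                return self.CURRENCY_SYMBOLS[symbol]
        
        # 2. 3-letter ISO codes, case-insensitive
        code_match = re.search(r'\b(INR|USD|EUR|GBP)\b', text, re.IGNORECASE)
        if code_match:
            return code_match.group(1).upper()
        
        # 3. Currency names
        for name, code in self.CURRENCY_NAMES.items():
            if re.search(rf'\b{name}\b', text, re.IGNORECASE):
                return code
        
        # 4. Default fallback
        return 'INR'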

7.5.4 Mode Detection Keywords

MODE_KEYWORDS = {
    'greedy': ['greedy', 'standard', 'default', 'largest'],
    'balanced': ['balanced', 'mixed', 'even'],
    'minimize_large': ['minimize large', 'min large', 'fewer notes'],
    'minimize_small': ['minimize small', 'min small', 'fewer coins'],
}
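
A matching sketch for mode detection; detect_mode is an illustrative name, not part of the spec:

def detect_mode(text: str, default: str = 'greedy') -> str:
    """Return the first calculation mode whose keyword appears in the text."""
    lowered = text.lower()
    for mode, keywords in MODE_KEYWORDS.items():
        if any(keyword in lowered for keyword in keywords):
            return mode
    return default
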
Section Complete

This section covers the complete Bulk Upload System: CSV processing, PDF extraction (embedded text and OCR), Word document parsing, image processing, and intelligent text parsing with support for five input formats and smart defaults.