"""Invoice OCR & extraction app.

Runs docTR OCR over an uploaded PDF or image, then sends the OCR text together
with a PNG of the first page to a Together AI chat model and displays the JSON
the model returns in a Gradio interface.
"""

import base64
import io
import json
import os
import shutil
import tempfile

import gradio as gr
import numpy as np
from doctr.io import DocumentFile
from doctr.models import ocr_predictor
from pdf2image import convert_from_path
from PIL import Image
from together import Together

# Load OCR model once
model = ocr_predictor(pretrained=True)

# Chat model used for the structured extraction step.
_LLM_MODEL = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"

# Instruction prompt sent to the LLM. The text (including its leading space,
# trailing space and the three embedded newlines) is kept verbatim; it is only
# split into adjacent literals here to keep source lines readable.
_EXTRACTION_PROMPT = (
    " You are the world's most accurate invoice data extraction expert, "
    "capable of processing ANY Indian business document \n"
    "format. 🎯 MISSION: Extract ALL information from this document "
    "(Invoice/Credit Note/Debit Note/Tax Invoice/e-Invoice/RCM Invoice). "
    "🔍 CRITICAL RULES: "
    "- Use the IMAGE as PRIMARY source - correct OCR errors you can see "
    "- Extract EVERY line item, charge, tax, and cess found in tables "
    "- Handle multiple document types: Tax Invoice, Credit Note, Debit Note, "
    "RCM Invoice, e-Invoice, Railway Invoice "
    "- Return ONLY valid JSON - no explanations, no markdown "
    "- Use null for missing fields - NEVER guess or hallucinate "
    "- Preserve exact number formatting as strings "
    "- Extract government-specific fields (IRN, Ack No, e-way bill, etc.) "
    "📋 UNIVERSAL JSON SCHEMA: "
    '{ "document_type": "string (Tax Invoice/Credit Note/Debit Note/RCM Invoice/e-Invoice)", '
    '"document_info": { '
    '"invoice_number": "string or null", '
    '"document_number": "string or null", '
    '"invoice_date": "string or null", '
    '"document_date": "string or null", '
    '"po_number": "string or null", '
    '"internal_ref_no": "string or null", '
    '"place_of_supply": "string or null", '
    '"bill_period_from": "string or null", '
    '"bill_period_to": "string or null", '
    '"reverse_charge_applicable": "string or null" }, '
    '"government_fields": { '
    '"irn": "string or null", '
    '"ack_no": "string or null", '
    '"ack_date": "string or null", '
    '"eway_bill_no": "string or null", '
    '"eway_bill_date": "string or null", '
    '"cin": "string or null" }, '
    '"supplier": { '
    '"name": "string or null", '
    '"address": "string or null", '
    '"gstin": "string or null", '
    '"pan": "string or null", '
    '"state": "string or null", '
    '"state_code": "string or null", '
    '"contact": { '
    '"email": "string or null", '
    '"phone": "string or null", '
    '"fax": "string or null" } }, '
    '"customer": { '
    '"name": "string or null", '
    '"address": "string or null", '
    '"gstin": "string or null", '
    '"pan": "string or null", '
    '"state": "string or null", '
    '"state_code": "string or null", '
    '"customer_code": "string or null" }, '
    '"consignee": { '
    '"name": "string or null", '
    '"address": "string or null", '
    '"gstin": "string or null", '
    '"state": "string or null" }, '
    '"line_items": [ { '
    '"sl_no": "string or null", \n'
    '"description": "string", '
    '"hsn_sac_code": "string or null", '
    '"uom": "string or null", '
    '"quantity": "string or null", '
    '"rate": "string or null", '
    '"amount": "string or null", '
    '"taxable_value": "string or null", '
    '"cgst_rate": "string or null", '
    '"cgst_amount": "string or null", '
    '"sgst_rate": "string or null", '
    '"sgst_amount": "string or null", '
    '"igst_rate": "string or null", '
    '"igst_amount": "string or null", '
    '"cess_rate": "string or null", '
    '"cess_amount": "string or null" } ], '
    '"additional_charges": [ { '
    '"description": "string", '
    '"amount": "string", '
    '"type": "string (freight/packing/handling/penalty/bonus/escalation/cess etc.)" } ], '
    '"financial_totals": { '
    '"subtotal": "string or null", '
    '"total_taxable_amount": "string or null", '
    '"total_cgst": "string or null", '
    '"total_sgst": "string or null", '
    '"total_igst": "string or null", '
    '"total_cess": "string or null", '
    '"infrastructure_cess": "string or null", '
    '"environmental_cess": "string or null", '
    '"forest_permit_fee": "string or null", '
    '"total_tax_amount": "string or null", '
    '"round_off": "string or null", '
    '"total_invoice_amount": "string or null", '
    '"amount_in_words": { '
    '"tax_amount": "string or null", '
    '"total_amount": "string or null" } }, '
    '"transport_details": { '
    '"mode_of_dispatch": "string or null", '
    '"vehicle_no": "string or null", '
    '"lr_rr_no": "string or null", '
    '"lr_rr_date": "string or null", '
    '"transporter": "string or null", '
    '"destination": "string or null" }, '
    '"work_commodity_details": { '
    '"work_description": "string or null", '
    '"commodity": "string or null", '
    '"fe_percentage": "string or null", '
    '"batch_lot_no": "string or null" }, '
    '"payment_terms": { '
    '"terms": "string or null", '
    '"due_date": "string or null" }, '
    '"remarks_notes": "string or null" } '
    "⚠️ EXTRACT EVERYTHING: Don't skip penalty amounts, cess charges, "
    "infrastructure fees, environmental charges, bonus payments, "
    "escalation amounts, or any government-mandated fields. \n"
    "Return the JSON only: "
)


def upload_and_encode(file_path):
    """Return the first page of ``file_path`` as a base64-encoded PNG string.

    A path ending in ``.pdf`` (case-insensitive) is rasterised at 300 dpi,
    first page only; any other path is opened with PIL as an image.

    Args:
        file_path: Path to a PDF or image file on disk.

    Returns:
        The PNG bytes of the page/image, base64-encoded and decoded as UTF-8.
    """
    buffered = io.BytesIO()
    if file_path.lower().endswith(".pdf"):
        images = convert_from_path(file_path, dpi=300, first_page=1, last_page=1)
        images[0].save(buffered, format="PNG")
    else:
        # Close the image's file handle as soon as it is encoded so the
        # caller is free to delete the file afterwards.
        with Image.open(file_path) as image:
            image.save(buffered, format="PNG")
    return base64.b64encode(buffered.getvalue()).decode("utf-8")


def _save_upload_to_temp(uploaded_file):
    """Copy the uploaded file into a private temp file and return its path.

    Accepts either a filesystem path (``str`` / ``os.PathLike``) or a
    file-like object. For an object, its ``.name`` is used as the source path
    when it points at an existing file; otherwise the object's ``.read()``
    result is written out. The temp file keeps the source's extension because
    later steps dispatch on the ``.pdf`` suffix.
    """
    if isinstance(uploaded_file, (str, os.PathLike)):
        source_path = os.fspath(uploaded_file)
    else:
        source_path = getattr(uploaded_file, "name", None)
    # ``.name`` can be a non-string (e.g. a file descriptor) on some objects.
    if not isinstance(source_path, str):
        source_path = None
    suffix = os.path.splitext(source_path)[1] if source_path else ""

    tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
    try:
        with tmp_file:
            if source_path and os.path.isfile(source_path):
                with open(source_path, "rb") as source:
                    shutil.copyfileobj(source, tmp_file)
            else:
                tmp_file.write(uploaded_file.read())
    except Exception:
        # Do not leave a half-written temp file behind.
        os.unlink(tmp_file.name)
        raise
    return tmp_file.name


def _load_pages(file_path):
    """Return every page of the document as a list of numpy image arrays."""
    if file_path.lower().endswith(".pdf"):
        return [np.array(img) for img in convert_from_path(file_path, dpi=300)]
    with Image.open(file_path) as img:
        return [np.array(img.convert("RGB"))]


def process_document(uploaded_file, together_api_key):
    """OCR an uploaded document and extract structured invoice data via LLM.

    Args:
        uploaded_file: Value supplied by the Gradio ``File`` input: a path
            string or a file-like object exposing ``.name``; ``None`` when
            nothing was uploaded.
        together_api_key: API key passed to the Together client.

    Returns:
        The LLM's answer re-serialised as indented JSON, or a human-readable
        message when no file was uploaded or the answer is not valid JSON.

    Raises:
        Whatever the OCR, PDF-rendering or Together client calls raise; those
        errors are not swallowed here.
    """
    if uploaded_file is None:
        return "Please upload a file."

    # Work on a private copy so the cleanup below never touches the upload.
    file_path = _save_upload_to_temp(uploaded_file)
    try:
        # Run OCR on the full document, one page at a time.
        extracted_texts = [
            model([page]).render() for page in _load_pages(file_path)
        ]
        # The LLM also receives an image of the first page.
        base64_image = upload_and_encode(file_path)
    finally:
        # Always remove the temp copy, even when OCR or rendering fails.
        os.unlink(file_path)

    full_text_output = "\n".join(extracted_texts)  # Combine all pages' text

    # Call Together AI LLM
    client = Together(api_key=together_api_key)
    response = client.chat.completions.create(
        model=_LLM_MODEL,
        messages=[{
            "role": "user",
            "content": [
                {"type": "text", "text": _EXTRACTION_PROMPT},
                {"type": "text", "text": f"OCR REFERENCE: {full_text_output}"},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{base64_image}"},
                },
            ],
        }],
        max_tokens=3000,
        temperature=0.05,
        top_p=0.9,
    )

    # Content may be None; normalise so the fallback message cannot itself fail.
    raw_output = response.choices[0].message.content or ""
    try:
        json_response = json.loads(raw_output)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return "Error: Invalid JSON from LLM. Raw output: " + raw_output
    return json.dumps(json_response, indent=4)


# Gradio interface
iface = gr.Interface(
    fn=process_document,
    inputs=[
        gr.File(label="Upload PDF or Image"),
        # For testing; use secrets in production
        gr.Textbox(label="Together AI API Key", type="password"),
    ],
    outputs=gr.Textbox(label="Extracted Invoice JSON"),
    title="Invoice OCR & Extraction App",
    description="Upload a document to extract text via OCR and structured data via open-source LLM.",
)

if __name__ == "__main__":
    iface.launch()