Invoice #INV-2025-0789 from Tech Supplies Inc.

import os
import pytesseract
from PIL import Image
import pdf2image
import pyodbc
import re
from datetime import datetime
import email
import imaplib
from docx import Document
import hashlib

# SQL Server Connection Configuration
SQL_SERVER = "XYZCORP-DB-SERVER"
SQL_DATABASE = "XYZCorp_Documents"
SQL_USERNAME = "document_processor"
SQL_PASSWORD = "secure_password_123"

# Email Configuration for automatic processing
EMAIL_SERVER = "mail.xyzcorp.com"
EMAIL_USER = "documents@xyzcorp.com"
EMAIL_PASSWORD = "email_password_123"

# Document storage base URL
DOCUMENT_BASE_URL = "https://sqlserver/xyzcorp/documents"

class DocumentProcessor:
    def __init__(self):
        self.connection = self.connect_to_sql()
        
    def connect_to_sql(self):
        try:
            conn_str = (
                f"Driver={{ODBC Driver 17 for SQL Server}};"
                f"Server={SQL_SERVER};"
                f"Database={SQL_DATABASE};"
                f"UID={SQL_USERNAME};"
                f"PWD={SQL_PASSWORD};"
            )
            return pyodbc.connect(conn_str)
        except Exception as e:
            print(f"Error connecting to SQL Server: {e}")
            return None
    
    def extract_text_from_pdf(self, pdf_path):
        """Extract text from PDF using OCR"""
        try:
            images = pdf2image.convert_from_path(pdf_path)
            full_text = ""
            for image in images:
                text = pytesseract.image_to_string(image)
                full_text += text + "\n"
            return full_text
        except Exception as e:
            print(f"Error extracting PDF text: {e}")
            return ""
    
    def extract_text_from_image(self, image_path):
        """Extract text from image using OCR"""
        try:
            image = Image.open(image_path)
            text = pytesseract.image_to_string(image)
            return text
        except Exception as e:
            print(f"Error extracting image text: {e}")
            return ""
    
    def extract_text_from_docx(self, docx_path):
        """Extract text from Word document"""
        try:
            doc = Document(docx_path)
            full_text = ""
            for para in doc.paragraphs:
                full_text += para.text + "\n"
            return full_text
        except Exception as e:
            print(f"Error extracting DOCX text: {e}")
            return ""
    
    def classify_document(self, text):
        """AI-based document classification"""
        text_lower = text.lower()
        
        if any(word in text_lower for word in ['invoice', 'bill', 'tax invoice', 'gst']):
            return "vendor_bill"
        elif any(word in text_lower for word in ['purchase order', 'po', 'order no']):
            return "customer_po"
        elif any(word in text_lower for word in ['quotation', 'quote', 'proposal']):
            return "vendor_quote"
        elif any(word in text_lower for word in ['resume', 'curriculum vitae', 'cv', 'experience']):
            return "candidate_cv"
        else:
            return "unknown"
    
    def parse_invoice_data(self, text):
        """Extract structured data from invoice text using regex patterns"""
        data = {}
        
        # Extract invoice number
        invoice_patterns = [
            r'Invoice\s*[#:]?\s*([A-Za-z0-9\-]+)',
            r'INV[\s\-]*([A-Za-z0-9\-]+)',
            r'Invoice\s*No[.:]*\s*([A-Za-z0-9\-]+)'
        ]
        
        for pattern in invoice_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                data['invoice_number'] = match.group(1)
                break
        
        # Extract dates
        date_pattern = r'(\d{1,2}[/\-]\d{1,2}[/\-]\d{2,4})'
        dates = re.findall(date_pattern, text)
        if dates:
            data['invoice_date'] = dates[0]
            if len(dates) > 1:
                data['due_date'] = dates[1]
        
        # Extract amounts
        amount_patterns = [
            r'Total\s*[A-Za-z]*\s*[:]?\s*[₹$€]?\s*([\d,]+\.?\d*)',
            r'Amount\s*[A-Za-z]*\s*[:]?\s*[₹$€]?\s*([\d,]+\.?\d*)',
            r'Grand\s*Total\s*[:]?\s*[₹$€]?\s*([\d,]+\.?\d*)'
        ]
        
        for pattern in amount_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                amount = match.group(1).replace(',', '')
                data['total_amount'] = float(amount)
                break
        
        # Extract GST number
        gst_pattern = r'[0-9]{2}[A-Z]{5}[0-9]{4}[A-Z]{1}[1-9A-Z]{1}Z[0-9A-Z]{1}'
        gst_match = re.search(gst_pattern, text)
        if gst_match:
            data['gst_number'] = gst_match.group()
        
        # Extract vendor name (simplified logic)
        lines = text.split('\n')
        for i, line in enumerate(lines):
            if len(line) > 5 and len(line) < 50 and any(word in line.lower() for word in ['ltd', 'inc', 'corp', 'supplier']):
                data['vendor_name'] = line.strip()
                break
        
        return data
    
    def save_to_sql(self, document_type, data, file_path):
        """Save extracted data to SQL Server"""
        if not self.connection:
            print("No database connection")
            return False
        
        try:
            cursor = self.connection.cursor()
            
            # Generate document URL based on file path and type
            file_hash = hashlib.md5(open(file_path, 'rb').read()).hexdigest()
            file_ext = os.path.splitext(file_path)[1]
            now = datetime.now()
            
            document_url = f"{DOCUMENT_BASE_URL}/{document_type}s/{now.year}/{now.month:02d}/{file_hash}" + file_ext
            
            if document_type == "vendor_bill":
                sql = """
                INSERT INTO invoices (invoice_id, vendor_name, invoice_date, total_amount, due_date, gst_number, document_url, processed_date, status)
                VALUES (?, ?, ?, ?, ?, ?, ?, GETDATE(), 'PROCESSED')
                """
                
                cursor.execute(sql, (
                    data.get('invoice_number', 'UNKNOWN'),
                    data.get('vendor_name', 'Unknown Vendor'),
                    data.get('invoice_date', now.strftime('%Y-%m-%d')),
                    data.get('total_amount', 0.0),
                    data.get('due_date', ''),
                    data.get('gst_number', ''),
                    document_url
                ))
            
            elif document_type == "customer_po":
                # Similar logic for purchase orders
                pass
            
            self.connection.commit()
            print(f"Successfully saved {document_type} to SQL Server")
            return True
            
        except Exception as e:
            print(f"Error saving to SQL Server: {e}")
            return False
    
    def process_document(self, file_path):
        """Main processing function for a single document"""
        print(f"Processing document: {file_path}")
        
        # Extract text based on file type
        file_ext = os.path.splitext(file_path)[1].lower()
        
        if file_ext == '.pdf':
            text = self.extract_text_from_pdf(file_path)
        elif file_ext in ['.jpg', '.jpeg', '.png']:
            text = self.extract_text_from_image(file_path)
        elif file_ext == '.docx':
            text = self.extract_text_from_docx(file_path)
        else:
            print(f"Unsupported file format: {file_ext}")
            return False
        
        # Classify document type
        document_type = self.classify_document(text)
        print(f"Document classified as: {document_type}")
        
        # Parse data based on document type
        if document_type == "vendor_bill":
            data = self.parse_invoice_data(text)
        elif document_type == "customer_po":
            data = self.parse_purchase_order_data(text)
        else:
            data = {}
        
        # Save to SQL Server
        success = self.save_to_sql(document_type, data, file_path)
        
        if success:
            print(f"Document processed successfully: {file_path}")
            return True
        else:
            print(f"Failed to process document: {file_path}")
            return False

# Email Processing Functions
def process_incoming_emails():
    """Monitor and process incoming emails with attachments"""
    try:
        # Connect to email server
        mail = imaplib.IMAP4_SSL(EMAIL_SERVER)
        mail.login(EMAIL_USER, EMAIL_PASSWORD)
        mail.select('inbox')
        
        # Search for unread emails
        status, messages = mail.search(None, 'UNSEEN')
        
        if status == 'OK':
            processor = DocumentProcessor()
            
            for num in messages[0].split():
                status, data = mail.fetch(num, '(RFC822)')
                
                if status == 'OK':
                    email_message = email.message_from_bytes(data[0][1])
                    
                    # Process email attachments
                    for part in email_message.walk():
                        if part.get_content_maintype() == 'multipart':
                            continue
                        if part.get('Content-Disposition') is None:
                            continue
                        
                        filename = part.get_filename()
                        if filename:
                            # Save attachment
                            filepath = os.path.join('temp_downloads', filename)
                            with open(filepath, 'wb') as f:
                                f.write(part.get_payload(decode=True))
                            
                            # Process the document
                            processor.process_document(filepath)
        
        mail.close()
        mail.logout()
        
    except Exception as e:
        print(f"Error processing emails: {e}")

# Main execution
if __name__ == "__main__":
    processor = DocumentProcessor()
    
    # Process a single document (example)
    if len(sys.argv) > 1:
        processor.process_document(sys.argv[1])
    else:
        print("Usage: python document_processor.py <file_path>")
        print("Or run email monitoring: python document_processor.py --monitor-emails")

Field	Extracted Value	Confidence
Invoice Number	INV-2025-0452	98%
Vendor Name	ABC Suppliers Ltd.	96%
Invoice Date	2025-12-10	95%
Total Amount	₹ 1,25,000.50	88%
Due Date	2025-12-30	92%
GST Number	27AABCU9603R1Z5	94%

AI DOCUMENT PROCESSING

DOCUMENT UPLOAD & PROCESSING

Select Document Type

VENDOR BILL

CUSTOMER PO

VENDOR QUOTE

CANDIDATE CV

Drag & Drop Documents Here

AI Processing Pipeline

Upload

OCR Processing

AI Parsing

SQL Insertion

EXTRACTED DATA

VENDOR BILL DETAILS

SQL INSERT STATEMENT

EMAIL PROCESSING SYSTEM

Email Server Configuration

Simulate Incoming Email

EMAIL PROCESSING LOG

SQL SERVER DATABASE STRUCTURE

Document Storage Tables

invoices

purchase_orders

quotations

candidates

Document URL Structure in SQL Server

PROCESSING STATISTICS

Sample SQL Queries

PYTHON DOCUMENT PROCESSING SCRIPT

Complete Python Script for Document Processing

Installation & Setup Instructions