OCR + SQL SERVER INTEGRATION
Invoice from supplier
Purchase order from customer
Price quotation from vendor
Resume for job vacancy
or click to browse files from your computer
Supports PDF, JPG, PNG, DOC, DOCX formats
Document received
Extracting text
Understanding content
Saving to database
| Field | Extracted Value | Confidence |
|---|---|---|
| Invoice Number | INV-2025-0452 | 98% |
| Vendor Name | ABC Suppliers Ltd. | 96% |
| Invoice Date | 2025-12-10 | 95% |
| Total Amount | ₹ 1,25,000.50 | 88% |
| Due Date | 2025-12-30 | 92% |
| GST Number | 27AABCU9603R1Z5 | 94% |
Dear XYZ Corporation,
Please find attached our invoice #INV-2025-0789 for the equipment supplied last month.
Total amount: ₹ 89,500.00
Due date: 30th December 2025
GST: 18% included
Please process the payment at your earliest convenience.
Best regards,
Tech Supplies Inc.
Each document is stored with a URL that points to the SQL Server file location:
Example URL for an invoice:
import os import pytesseract from PIL import Image import pdf2image import pyodbc import re from datetime import datetime import email import imaplib from docx import Document import hashlib # SQL Server Connection Configuration SQL_SERVER = "XYZCORP-DB-SERVER" SQL_DATABASE = "XYZCorp_Documents" SQL_USERNAME = "document_processor" SQL_PASSWORD = "secure_password_123" # Email Configuration for automatic processing EMAIL_SERVER = "mail.xyzcorp.com" EMAIL_USER = "documents@xyzcorp.com" EMAIL_PASSWORD = "email_password_123" # Document storage base URL DOCUMENT_BASE_URL = "https://sqlserver/xyzcorp/documents" class DocumentProcessor: def __init__(self): self.connection = self.connect_to_sql() def connect_to_sql(self): try: conn_str = ( f"Driver={{ODBC Driver 17 for SQL Server}};" f"Server={SQL_SERVER};" f"Database={SQL_DATABASE};" f"UID={SQL_USERNAME};" f"PWD={SQL_PASSWORD};" ) return pyodbc.connect(conn_str) except Exception as e: print(f"Error connecting to SQL Server: {e}") return None def extract_text_from_pdf(self, pdf_path): """Extract text from PDF using OCR""" try: images = pdf2image.convert_from_path(pdf_path) full_text = "" for image in images: text = pytesseract.image_to_string(image) full_text += text + "\n" return full_text except Exception as e: print(f"Error extracting PDF text: {e}") return "" def extract_text_from_image(self, image_path): """Extract text from image using OCR""" try: image = Image.open(image_path) text = pytesseract.image_to_string(image) return text except Exception as e: print(f"Error extracting image text: {e}") return "" def extract_text_from_docx(self, docx_path): """Extract text from Word document""" try: doc = Document(docx_path) full_text = "" for para in doc.paragraphs: full_text += para.text + "\n" return full_text except Exception as e: print(f"Error extracting DOCX text: {e}") return "" def classify_document(self, text): """AI-based document classification""" text_lower = text.lower() if any(word in text_lower for word in ['invoice', 'bill', 'tax invoice', 'gst']): return "vendor_bill" elif any(word in text_lower for word in ['purchase order', 'po', 'order no']): return "customer_po" elif any(word in text_lower for word in ['quotation', 'quote', 'proposal']): return "vendor_quote" elif any(word in text_lower for word in ['resume', 'curriculum vitae', 'cv', 'experience']): return "candidate_cv" else: return "unknown" def parse_invoice_data(self, text): """Extract structured data from invoice text using regex patterns""" data = {} # Extract invoice number invoice_patterns = [ r'Invoice\s*[#:]?\s*([A-Za-z0-9\-]+)', r'INV[\s\-]*([A-Za-z0-9\-]+)', r'Invoice\s*No[.:]*\s*([A-Za-z0-9\-]+)' ] for pattern in invoice_patterns: match = re.search(pattern, text, re.IGNORECASE) if match: data['invoice_number'] = match.group(1) break # Extract dates date_pattern = r'(\d{1,2}[/\-]\d{1,2}[/\-]\d{2,4})' dates = re.findall(date_pattern, text) if dates: data['invoice_date'] = dates[0] if len(dates) > 1: data['due_date'] = dates[1] # Extract amounts amount_patterns = [ r'Total\s*[A-Za-z]*\s*[:]?\s*[₹$€]?\s*([\d,]+\.?\d*)', r'Amount\s*[A-Za-z]*\s*[:]?\s*[₹$€]?\s*([\d,]+\.?\d*)', r'Grand\s*Total\s*[:]?\s*[₹$€]?\s*([\d,]+\.?\d*)' ] for pattern in amount_patterns: match = re.search(pattern, text, re.IGNORECASE) if match: amount = match.group(1).replace(',', '') data['total_amount'] = float(amount) break # Extract GST number gst_pattern = r'[0-9]{2}[A-Z]{5}[0-9]{4}[A-Z]{1}[1-9A-Z]{1}Z[0-9A-Z]{1}' gst_match = re.search(gst_pattern, text) if gst_match: data['gst_number'] = gst_match.group() # Extract vendor name (simplified logic) lines = text.split('\n') for i, line in enumerate(lines): if len(line) > 5 and len(line) < 50 and any(word in line.lower() for word in ['ltd', 'inc', 'corp', 'supplier']): data['vendor_name'] = line.strip() break return data def save_to_sql(self, document_type, data, file_path): """Save extracted data to SQL Server""" if not self.connection: print("No database connection") return False try: cursor = self.connection.cursor() # Generate document URL based on file path and type file_hash = hashlib.md5(open(file_path, 'rb').read()).hexdigest() file_ext = os.path.splitext(file_path)[1] now = datetime.now() document_url = f"{DOCUMENT_BASE_URL}/{document_type}s/{now.year}/{now.month:02d}/{file_hash}" + file_ext if document_type == "vendor_bill": sql = """ INSERT INTO invoices (invoice_id, vendor_name, invoice_date, total_amount, due_date, gst_number, document_url, processed_date, status) VALUES (?, ?, ?, ?, ?, ?, ?, GETDATE(), 'PROCESSED') """ cursor.execute(sql, ( data.get('invoice_number', 'UNKNOWN'), data.get('vendor_name', 'Unknown Vendor'), data.get('invoice_date', now.strftime('%Y-%m-%d')), data.get('total_amount', 0.0), data.get('due_date', ''), data.get('gst_number', ''), document_url )) elif document_type == "customer_po": # Similar logic for purchase orders pass self.connection.commit() print(f"Successfully saved {document_type} to SQL Server") return True except Exception as e: print(f"Error saving to SQL Server: {e}") return False def process_document(self, file_path): """Main processing function for a single document""" print(f"Processing document: {file_path}") # Extract text based on file type file_ext = os.path.splitext(file_path)[1].lower() if file_ext == '.pdf': text = self.extract_text_from_pdf(file_path) elif file_ext in ['.jpg', '.jpeg', '.png']: text = self.extract_text_from_image(file_path) elif file_ext == '.docx': text = self.extract_text_from_docx(file_path) else: print(f"Unsupported file format: {file_ext}") return False # Classify document type document_type = self.classify_document(text) print(f"Document classified as: {document_type}") # Parse data based on document type if document_type == "vendor_bill": data = self.parse_invoice_data(text) elif document_type == "customer_po": data = self.parse_purchase_order_data(text) else: data = {} # Save to SQL Server success = self.save_to_sql(document_type, data, file_path) if success: print(f"Document processed successfully: {file_path}") return True else: print(f"Failed to process document: {file_path}") return False # Email Processing Functions def process_incoming_emails(): """Monitor and process incoming emails with attachments""" try: # Connect to email server mail = imaplib.IMAP4_SSL(EMAIL_SERVER) mail.login(EMAIL_USER, EMAIL_PASSWORD) mail.select('inbox') # Search for unread emails status, messages = mail.search(None, 'UNSEEN') if status == 'OK': processor = DocumentProcessor() for num in messages[0].split(): status, data = mail.fetch(num, '(RFC822)') if status == 'OK': email_message = email.message_from_bytes(data[0][1]) # Process email attachments for part in email_message.walk(): if part.get_content_maintype() == 'multipart': continue if part.get('Content-Disposition') is None: continue filename = part.get_filename() if filename: # Save attachment filepath = os.path.join('temp_downloads', filename) with open(filepath, 'wb') as f: f.write(part.get_payload(decode=True)) # Process the document processor.process_document(filepath) mail.close() mail.logout() except Exception as e: print(f"Error processing emails: {e}") # Main execution if __name__ == "__main__": processor = DocumentProcessor() # Process a single document (example) if len(sys.argv) > 1: processor.process_document(sys.argv[1]) else: print("Usage: python document_processor.py <file_path>") print("Or run email monitoring: python document_processor.py --monitor-emails")
1. Install Required Python Packages:
2. Install Tesseract OCR:
3. Configure SQL Server Connection:
4. Run the Script: