#!/usr/bin/env python3
"""
Extract invoice data from PDFs in to_process/ and append to a dated CSV.
Processed PDFs are moved to processed/.
"""

import csv
import os
import re
import shutil
from datetime import date, datetime
from pathlib import Path

import pdfplumber

TO_PROCESS = Path("to_process")
PROCESSED = Path("processed")
CSV_FILENAME = f"{date.today().strftime('%Y-%m-%d')}.csv"
CSV_HEADERS = ["Invoice Date", "Services From", "Services To", "Invoice Number", "Amount"]

# NOTE: the patterns below are written for extracted text in which tokens are
# fused together without spaces (e.g. "TOTALAUD", "InvoiceNumber").

# Day + alphabetic month + 4-digit year, e.g. "16Mar2026". The month part is
# restricted to letters: a plain ``\w+`` also matches digits, so it would
# happily match any long number (such as the invoice number) or swallow
# neighbouring text that is fused onto the date.
_INVOICE_DATE_RE = re.compile(r"(\d{1,2}[A-Za-z]{3,9}\d{4})")
# Preferred source of the invoice number: 7 digits following the customer name.
_INVOICE_NUMBER_RE = re.compile(r"JosephBasten\s+(\d{7})")
# Fallback source: the "InvoiceNumber" label followed by 7 or more digits.
_INVOICE_NUMBER_FALLBACK_RE = re.compile(r"InvoiceNumber\s*(\d{7,})")
# Total amount, e.g. "TOTALAUD 1,234.56".
_AMOUNT_RE = re.compile(r"TOTALAUD\s*([\d,]+\.\d{2})")
# Service line dates, written as "@DD/MM/YYYY" (no space after the "@").
_SERVICE_DATE_RE = re.compile(r"@(\d{2}/\d{2}/\d{4})")

# Accepted input layouts for the invoice date: abbreviated, then full month
# name. Month-name parsing by strptime depends on the current locale.
_INVOICE_DATE_FORMATS = ("%d%b%Y", "%d%B%Y")


def _format_invoice_date(raw: str) -> str:
    """Return *raw* (e.g. "16Mar2026") as dd/mm/yyyy, or *raw* if unparseable."""
    for fmt in _INVOICE_DATE_FORMATS:
        try:
            return datetime.strptime(raw, fmt).strftime("%d/%m/%Y")
        except ValueError:
            continue
    # Keep the raw text rather than dropping it, so the row can be fixed by hand.
    return raw


def _parse_invoice_text(text: str) -> dict:
    """Pull the invoice fields out of the text extracted from one PDF.

    Every field that cannot be found is returned as an empty string.
    """
    date_match = _INVOICE_DATE_RE.search(text)
    invoice_date = _format_invoice_date(date_match.group(1)) if date_match else ""

    number_match = _INVOICE_NUMBER_RE.search(text) or _INVOICE_NUMBER_FALLBACK_RE.search(text)
    invoice_number = number_match.group(1) if number_match else ""

    amount_match = _AMOUNT_RE.search(text)
    amount = amount_match.group(1) if amount_match else ""

    # The first and last service dates, in order of appearance in the text,
    # are taken as the service period.
    service_dates = _SERVICE_DATE_RE.findall(text)
    services_from = service_dates[0] if service_dates else ""
    services_to = service_dates[-1] if service_dates else ""

    return {
        "Invoice Date": invoice_date,
        "Services From": services_from,
        "Services To": services_to,
        "Invoice Number": invoice_number,
        "Amount": amount,
    }


def extract_invoice_data(pdf_path: Path) -> dict:
    """Read one invoice PDF and return its fields keyed by CSV_HEADERS.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        A dict with the keys "Invoice Date", "Services From", "Services To",
        "Invoice Number" and "Amount". Values are strings; a field that could
        not be located is an empty string.

    Raises:
        Whatever pdfplumber raises when the file cannot be opened or parsed.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() may yield nothing for a page; treat that as empty text.
        text = "\n".join(page.extract_text() or "" for page in pdf.pages)
    return _parse_invoice_text(text)


def main() -> None:
    """Process every PDF in to_process/, append rows to today's CSV, move the PDFs."""
    pdfs = sorted(TO_PROCESS.glob("*.pdf"))
    if not pdfs:
        print("No PDFs found in to_process/")
        return

    # Without the destination directory every move would fail *after* its row
    # had already been written, and a rerun would then duplicate those rows.
    PROCESSED.mkdir(parents=True, exist_ok=True)

    csv_path = Path(CSV_FILENAME)
    # An existing but empty file still needs its header row.
    write_header = not csv_path.exists() or csv_path.stat().st_size == 0

    with csv_path.open("a", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=CSV_HEADERS)
        if write_header:
            writer.writeheader()

        for pdf_path in pdfs:
            print(f"Processing {pdf_path.name} ...")
            # Per-file boundary: one bad PDF must not abort the whole batch.
            try:
                data = extract_invoice_data(pdf_path)
                writer.writerow(data)
                shutil.move(str(pdf_path), str(PROCESSED / pdf_path.name))
                print(f" -> Invoice {data['Invoice Number']} | {data['Invoice Date']} | {data['Services From']} to {data['Services To']} | AUD {data['Amount']}")
            except Exception as e:
                print(f" ERROR processing {pdf_path.name}: {e}")

    print(f"\nDone. Results written to {csv_path}")


if __name__ == "__main__":
    main()