initial upload
This commit is contained in:
95
process_invoices.py
Executable file
95
process_invoices.py
Executable file
@@ -0,0 +1,95 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Extract invoice data from PDFs in to_process/ and append to a dated CSV.
|
||||
Processed PDFs are moved to processed/.
|
||||
"""
|
||||
|
||||
import csv
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
from datetime import date, datetime
|
||||
from pathlib import Path
|
||||
|
||||
import pdfplumber
|
||||
|
||||
# Folder (relative to the working directory) scanned for incoming invoice PDFs.
TO_PROCESS = Path("to_process")

# Folder that PDFs are moved into once a row has been written for them.
PROCESSED = Path("processed")

# The output CSV is named after the day the script is started.
CSV_FILENAME = date.today().strftime("%Y-%m-%d") + ".csv"

# Column order of the CSV; these are also the keys of each extracted record.
CSV_HEADERS = [
    "Invoice Date",
    "Services From",
    "Services To",
    "Invoice Number",
    "Amount",
]
|
||||
|
||||
|
||||
def extract_invoice_data(pdf_path: Path) -> dict:
    """Extract the invoice fields from a single PDF.

    The text of every page is pulled out with pdfplumber, joined with
    newlines, and then scanned with regular expressions.  A field that cannot
    be found is returned as an empty string rather than raising.

    Args:
        pdf_path: Location of the invoice PDF to read.

    Returns:
        A dict with the keys "Invoice Date", "Services From", "Services To",
        "Invoice Number" and "Amount"; every value is a string.

    Raises:
        Any exception raised by pdfplumber while opening or reading the file
        is propagated to the caller.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # A page may yield no text at all; treat it as an empty string.
        text = "\n".join(page.extract_text() or "" for page in pdf.pages)
    return _parse_invoice_text(text)


def _find_invoice_date(text: str) -> str:
    """Return the first date like "16Mar2026" in *text* as dd/mm/YYYY.

    If candidates are found but none of them parses, the first candidate is
    returned unchanged; if nothing matches, "" is returned.
    """
    # The month part must be alphabetic.  The earlier pattern used \w+ there,
    # which also matches digits, so the first long digit run in the text
    # (for example the 7-digit invoice number) was picked up as the "date".
    first_candidate = ""
    for match in re.finditer(r"\d{1,2}[A-Za-z]{3,9}\d{4}", text):
        candidate = match.group(0)
        first_candidate = first_candidate or candidate
        # Abbreviated month first, then the full month name.  Both follow the
        # month names of the current locale.
        for fmt in ("%d%b%Y", "%d%B%Y"):
            try:
                return datetime.strptime(candidate, fmt).strftime("%d/%m/%Y")
            except ValueError:
                continue
    return first_candidate


def _parse_invoice_text(text: str) -> dict:
    """Pick the invoice fields out of already-extracted PDF text."""
    invoice_date = _find_invoice_date(text)

    # Invoice number: a 7-digit number directly after the customer name ...
    invoice_number = ""
    m = re.search(r"JosephBasten\s+(\d{7})", text)
    if not m:
        # ... otherwise 7 or more digits after an "InvoiceNumber" label.
        m = re.search(r"InvoiceNumber\s*(\d{7,})", text)
    if m:
        invoice_number = m.group(1)

    # Total amount, kept exactly as printed (thousands separators included).
    amount = ""
    m = re.search(r"TOTALAUD\s*([\d,]+\.\d{2})", text)
    if m:
        amount = m.group(1)

    # Service dates appear as "@DD/MM/YYYY" (no space after the "@").  The
    # first and last occurrence in document order are reported; this assumes
    # the lines are listed chronologically -- TODO confirm.
    service_dates = re.findall(r"@(\d{2}/\d{2}/\d{4})", text)
    services_from = service_dates[0] if service_dates else ""
    services_to = service_dates[-1] if service_dates else ""

    return {
        "Invoice Date": invoice_date,
        "Services From": services_from,
        "Services To": services_to,
        "Invoice Number": invoice_number,
        "Amount": amount,
    }
|
||||
|
||||
|
||||
def main() -> None:
    """Process every PDF in to_process/ and append one CSV row per invoice.

    For each ``*.pdf`` in ``TO_PROCESS`` (in sorted order) the invoice fields
    are extracted, appended to the CSV named by ``CSV_FILENAME`` and the PDF is
    moved into ``PROCESSED``.  A failure on one PDF is reported and the run
    continues with the next file.  Progress is printed to stdout.
    """
    pdfs = sorted(TO_PROCESS.glob("*.pdf"))
    if not pdfs:
        print("No PDFs found in to_process/")
        return

    # Make sure the destination exists up front.  Without it every
    # shutil.move() fails *after* the row has already been written, the PDF
    # stays in to_process/ and the next run appends the same invoice again.
    PROCESSED.mkdir(parents=True, exist_ok=True)

    csv_path = Path(CSV_FILENAME)
    # A file that exists but is empty still needs its header row.
    write_header = not csv_path.exists() or csv_path.stat().st_size == 0

    with csv_path.open("a", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=CSV_HEADERS)
        if write_header:
            writer.writeheader()

        for pdf_path in pdfs:
            print(f"Processing {pdf_path.name} ...")
            try:
                data = extract_invoice_data(pdf_path)
                writer.writerow(data)
                shutil.move(str(pdf_path), str(PROCESSED / pdf_path.name))
                print(f" -> Invoice {data['Invoice Number']} | {data['Invoice Date']} | {data['Services From']} to {data['Services To']} | AUD {data['Amount']}")
                # Blank fields are still written, but say so: the PDF has now
                # left to_process/ and would otherwise go unnoticed.
                missing = [name for name in CSV_HEADERS if not data.get(name)]
                if missing:
                    print(f" WARNING: no value found for {', '.join(missing)}")
            except Exception as e:
                # Boundary of the per-file work: report and carry on with the
                # remaining PDFs instead of aborting the whole run.
                print(f" ERROR processing {pdf_path.name}: {e}")

    print(f"\nDone. Results written to {csv_path}")
|
||||
|
||||
|
||||
# Run the batch only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user