From cb47f16facdc9cf8be2e96ad6ab573e470512673 Mon Sep 17 00:00:00 2001 From: Tim Basten Date: Wed, 18 Mar 2026 08:22:28 +0800 Subject: [PATCH] initial upload --- 2026-03-18.csv | 21 ++++++++++ README.md | 33 ++++++++++++++++ process_invoices.py | 95 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 149 insertions(+) create mode 100644 2026-03-18.csv create mode 100644 README.md create mode 100755 process_invoices.py diff --git a/2026-03-18.csv b/2026-03-18.csv new file mode 100644 index 0000000..b0cf7fc --- /dev/null +++ b/2026-03-18.csv @@ -0,0 +1,21 @@ +Invoice Date,Services From,Services To,Invoice Number,Amount +03/11/2025,28/10/2025,01/11/2025,7107589,981.89 +08/12/2025,02/12/2025,05/12/2025,7411907,596.97 +15/12/2025,09/12/2025,13/12/2025,7474557,"1,139.91" +22/12/2025,17/12/2025,18/12/2025,7539861,280.93 +29/12/2025,27/12/2025,27/12/2025,7587913,420.03 +05/01/2026,29/12/2025,03/01/2026,7632132,"1,104.78" +12/01/2026,05/01/2026,10/01/2026,7692780,"1,178.28" +12/01/2026,05/01/2026,10/01/2026,7692780,"1,178.28" +19/01/2026,12/01/2026,17/01/2026,7752635,"1,227.69" +27/01/2026,19/01/2026,24/01/2026,7817540,"1,227.69" +27/01/2026,19/01/2026,24/01/2026,7817540,"1,227.69" +02/02/2026,29/01/2026,30/01/2026,7877540,561.85 +02/02/2026,29/01/2026,30/01/2026,7877540,561.85 +09/02/2026,03/02/2026,05/02/2026,7942095,579.41 +09/02/2026,03/02/2026,05/02/2026,7942095,579.41 +16/02/2026,10/02/2026,14/02/2026,8008309,841.42 +23/02/2026,16/02/2026,21/02/2026,8072274,"1,192.59" +03/03/2026,24/02/2026,28/02/2026,8156829,"1,083.98" +09/03/2026,03/03/2026,07/03/2026,8206898,"1,017.01" +16/03/2026,10/03/2026,12/03/2026,8279890,491.62 diff --git a/README.md b/README.md new file mode 100644 index 0000000..d854e0d --- /dev/null +++ b/README.md @@ -0,0 +1,33 @@ +# Invoice Processing Script + +This script extracts invoice data from PDFs and saves it to a CSV file. + +## Setup + +1. 
Install dependencies: +```bash +python3 -m venv venv +venv/bin/pip install pdfplumber +``` + +## Usage + +1. Place unprocessed invoice PDFs in the `to_process/` folder +2. Run the script: +```bash +venv/bin/python process_invoices.py +``` + +The script will: +- Extract invoice data (Invoice Date, Services From, Services To, Invoice Number, Amount) +- Append the data to a CSV file named with today's date (e.g., `2026-03-18.csv`) +- Move processed PDFs to the `processed/` folder + +## CSV Output + +The CSV contains the following columns: +- **Invoice Date**: The date the invoice was issued +- **Services From**: First service date +- **Services To**: Last service date +- **Invoice Number**: The invoice number +- **Amount**: Total amount in AUD diff --git a/process_invoices.py b/process_invoices.py new file mode 100755 index 0000000..bb65c89 --- /dev/null +++ b/process_invoices.py @@ -0,0 +1,95 @@ +#!/usr/bin/env python3 +""" +Extract invoice data from PDFs in to_process/ and append to a dated CSV. +Processed PDFs are moved to processed/. 
+"""
+
+import csv
+import os
+import re
+import shutil
+from datetime import date, datetime
+from pathlib import Path
+
+import pdfplumber
+
+TO_PROCESS = Path("to_process")
+PROCESSED = Path("processed")
+CSV_FILENAME = f"{date.today().strftime('%Y-%m-%d')}.csv"
+CSV_HEADERS = ["Invoice Date", "Services From", "Services To", "Invoice Number", "Amount"]
+
+# NOTE(review): the literals below ("JosephBasten", "InvoiceNumber",
+# "TOTALAUD") contain no spaces, presumably because the text extracted from
+# these invoices comes out without them - confirm against a sample PDF.
+
+# Day, alphabetic month, four-digit year, e.g. "16Mar2026". The month must be
+# letters only: a looser "\w+" also matches digits, so a long digit run such
+# as the invoice number would be picked up as the date.
+_INVOICE_DATE_RE = re.compile(r"(\d{1,2}[A-Za-z]{3,9}\d{4})")
+_INVOICE_DATE_FORMATS = ("%d%b%Y", "%d%B%Y")
+
+# Tried in order: the 7-digit number after the customer name first, then any
+# run of 7 or more digits after "InvoiceNumber".
+_INVOICE_NUMBER_RES = (
+    re.compile(r"JosephBasten\s+(\d{7})"),
+    re.compile(r"InvoiceNumber\s*(\d{7,})"),
+)
+_AMOUNT_RE = re.compile(r"TOTALAUD\s*([\d,]+\.\d{2})")
+_SERVICE_DATE_RE = re.compile(r"@(\d{2}/\d{2}/\d{4})")
+
+
+def _parse_invoice_date(text: str) -> str:
+    """Return the first date such as "16Mar2026" in *text* as dd/mm/YYYY.
+
+    If no candidate parses as a real date, the first candidate is returned
+    unchanged; if there is no candidate at all, the result is "".
+    """
+    candidates = _INVOICE_DATE_RE.findall(text)
+    for candidate in candidates:
+        for fmt in _INVOICE_DATE_FORMATS:
+            try:
+                return datetime.strptime(candidate, fmt).strftime("%d/%m/%Y")
+            except ValueError:
+                continue
+    return candidates[0] if candidates else ""
+
+
+def _first_group(patterns, text: str) -> str:
+    """Return group 1 of the first pattern that matches *text*, else ""."""
+    for pattern in patterns:
+        m = pattern.search(text)
+        if m:
+            return m.group(1)
+    return ""
+
+
+def extract_invoice_data(pdf_path: Path) -> dict:
+    """Read one invoice PDF and return its fields keyed by CSV_HEADERS.
+
+    Every value is a string; a field that cannot be found is "".
+    """
+    with pdfplumber.open(pdf_path) as pdf:
+        # Pages for which extract_text() returns nothing contribute "".
+        text = "\n".join(page.extract_text() or "" for page in pdf.pages)
+
+    # Service dates appear as "@DD/MM/YYYY"; the first and last occurrences
+    # in the text are taken as the start and end of the service period.
+    service_dates = _SERVICE_DATE_RE.findall(text)
+
+    return {
+        "Invoice Date": _parse_invoice_date(text),
+        "Services From": service_dates[0] if service_dates else "",
+        "Services To": service_dates[-1] if service_dates else "",
+        "Invoice Number": _first_group(_INVOICE_NUMBER_RES, text),
+        "Amount": _first_group((_AMOUNT_RE,), text),
+    }
+
+
+def main():
+    """Append one CSV row per PDF in to_process/, then archive the PDF.
+
+    A PDF that raises, or from which no field at all could be extracted, is
+    reported and left in to_process/ so that it can be retried.
+    """
+    pdfs = sorted(TO_PROCESS.glob("*.pdf"))
+    if not pdfs:
+        print("No PDFs found in to_process/")
+        return
+
+    # shutil.move() needs the destination directory to exist. If the move
+    # failed, the row written just before it would be appended again on the
+    # next run, because the PDF would still be waiting in to_process/.
+    PROCESSED.mkdir(parents=True, exist_ok=True)
+
+    csv_path = Path(CSV_FILENAME)
+    # An existing but empty file still needs its header row.
+    write_header = not csv_path.exists() or csv_path.stat().st_size == 0
+
+    failures = 0
+    with csv_path.open("a", newline="") as f:
+        writer = csv.DictWriter(f, fieldnames=CSV_HEADERS)
+        if write_header:
+            writer.writeheader()
+
+        for pdf_path in pdfs:
+            print(f"Processing {pdf_path.name} ...")
+            try:
+                data = extract_invoice_data(pdf_path)
+                if not any(data.values()):
+                    # Do not record a blank row or archive an unread PDF.
+                    raise ValueError("no invoice fields could be extracted")
+                writer.writerow(data)
+                shutil.move(str(pdf_path), str(PROCESSED / pdf_path.name))
+                print(f" -> Invoice {data['Invoice Number']} | {data['Invoice Date']} | {data['Services From']} to {data['Services To']} | AUD {data['Amount']}")
+            except Exception as e:
+                # Top-level boundary: report and carry on with the next PDF.
+                failures += 1
+                print(f" ERROR processing {pdf_path.name}: {e}")
+
+    print(f"\nDone. Results written to {csv_path}")
+    if failures:
+        print(f"{failures} PDF(s) could not be processed and remain in {TO_PROCESS}/")
+
+
+if __name__ == "__main__":
+    main()