From cb47f16facdc9cf8be2e96ad6ab573e470512673 Mon Sep 17 00:00:00 2001
From: Tim Basten <tbasten@gmail.com>
Date: Wed, 18 Mar 2026 08:22:28 +0800
Subject: [PATCH] initial upload

---
 2026-03-18.csv      | 21 ++++++++++
 README.md           | 33 ++++++++++++++++
 process_invoices.py | 95 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 149 insertions(+)
 create mode 100644 2026-03-18.csv
 create mode 100644 README.md
 create mode 100755 process_invoices.py

diff --git a/2026-03-18.csv b/2026-03-18.csv
new file mode 100644
index 0000000..b0cf7fc
--- /dev/null
+++ b/2026-03-18.csv
@@ -0,0 +1,21 @@
+Invoice Date,Services From,Services To,Invoice Number,Amount
+03/11/2025,28/10/2025,01/11/2025,7107589,981.89
+08/12/2025,02/12/2025,05/12/2025,7411907,596.97
+15/12/2025,09/12/2025,13/12/2025,7474557,"1,139.91"
+22/12/2025,17/12/2025,18/12/2025,7539861,280.93
+29/12/2025,27/12/2025,27/12/2025,7587913,420.03
+05/01/2026,29/12/2025,03/01/2026,7632132,"1,104.78"
+12/01/2026,05/01/2026,10/01/2026,7692780,"1,178.28"
+12/01/2026,05/01/2026,10/01/2026,7692780,"1,178.28"
+19/01/2026,12/01/2026,17/01/2026,7752635,"1,227.69"
+27/01/2026,19/01/2026,24/01/2026,7817540,"1,227.69"
+27/01/2026,19/01/2026,24/01/2026,7817540,"1,227.69"
+02/02/2026,29/01/2026,30/01/2026,7877540,561.85
+02/02/2026,29/01/2026,30/01/2026,7877540,561.85
+09/02/2026,03/02/2026,05/02/2026,7942095,579.41
+09/02/2026,03/02/2026,05/02/2026,7942095,579.41
+16/02/2026,10/02/2026,14/02/2026,8008309,841.42
+23/02/2026,16/02/2026,21/02/2026,8072274,"1,192.59"
+03/03/2026,24/02/2026,28/02/2026,8156829,"1,083.98"
+09/03/2026,03/03/2026,07/03/2026,8206898,"1,017.01"
+16/03/2026,10/03/2026,12/03/2026,8279890,491.62
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..d854e0d
--- /dev/null
+++ b/README.md
@@ -0,0 +1,33 @@
+# Invoice Processing Script
+
+This script extracts invoice data from PDFs and saves it to a CSV file.
+
+## Setup
+
+1. Install dependencies:
+```bash
+python3 -m venv venv
+venv/bin/pip install pdfplumber
+```
+
+## Usage
+
+1. Place unprocessed invoice PDFs in the `to_process/` folder (`to_process/` and `processed/` are resolved relative to the directory you run the script from)
+2. Run the script:
+```bash
+venv/bin/python process_invoices.py
+```
+
+The script will:
+- Extract invoice data (Invoice Date, Services From, Services To, Invoice Number, Amount)
+- Append the data to a CSV file named with today's date (e.g., `2026-03-18.csv`)
+- Move processed PDFs to the `processed/` folder
+
+## CSV Output
+
+The CSV contains the following columns:
+- **Invoice Date**: The date the invoice was issued
+- **Services From**: First service date
+- **Services To**: Last service date
+- **Invoice Number**: The invoice number
+- **Amount**: Total amount in AUD, as printed on the invoice (thousands separators are kept, e.g. `1,139.91`)
diff --git a/process_invoices.py b/process_invoices.py
new file mode 100755
index 0000000..bb65c89
--- /dev/null
+++ b/process_invoices.py
@@ -0,0 +1,95 @@
+#!/usr/bin/env python3
+"""
+Extract invoice data from PDFs in to_process/ and append to a dated CSV.
+Processed PDFs are moved to processed/.
+"""
+
+import csv
+import os
+import re
+import shutil
+from datetime import date, datetime
+from pathlib import Path
+
+import pdfplumber
+
+TO_PROCESS = Path("to_process")  # inbox of PDFs, relative to the working directory
+PROCESSED = Path("processed")  # archive folder for PDFs that have been handled
+CSV_FILENAME = date.today().isoformat() + ".csv"  # e.g. 2026-03-18.csv, fixed at import time
+CSV_HEADERS = ["Invoice Date", "Services From", "Services To", "Invoice Number", "Amount"]
+
+
+def extract_invoice_data(pdf_path: Path) -> dict:
+    """Pull the five CSV fields out of one invoice PDF.
+
+    Returns a dict keyed by the CSV column names; a field that cannot be
+    found is "". Errors from opening or reading the PDF propagate.
+    """
+    with pdfplumber.open(pdf_path) as pdf:
+        text = "\n".join(page.extract_text() or "" for page in pdf.pages)
+
+    # Invoice Date: first token shaped like "16Mar2026" that parses as a date.
+    # The month must be letters, so an all-digit run (such as the 7-digit
+    # invoice number, which the old "\w+" pattern accepted) is never picked.
+    candidates = re.findall(r"\d{1,2}[A-Za-z]{3,9}\d{4}", text)
+    invoice_date = candidates[0] if candidates else ""  # raw fallback if none parse
+    attempts = [(c, f) for c in candidates for f in ("%d%b%Y", "%d%B%Y")]
+    for candidate, fmt in attempts:
+        try:
+            parsed_date = datetime.strptime(candidate, fmt)
+        except ValueError:
+            continue
+        invoice_date = parsed_date.strftime("%d/%m/%Y")  # dd/MM/yyyy
+        break
+
+    # Invoice Number: 7 digits after the customer name, else after the label.
+    m = re.search(r"JosephBasten\s+(\d{7})", text) or re.search(
+        r"InvoiceNumber\s*(\d{7,})", text
+    )
+    invoice_number = m.group(1) if m else ""
+
+    # Total Amount: kept as printed, including any thousands separator.
+    m = re.search(r"TOTALAUD\s*([\d,]+\.\d{2})", text)
+    amount = m.group(1) if m else ""
+
+    # Service dates look like "@DD/MM/YYYY"; first/last are taken in text order.
+    service_dates = re.findall(r"@(\d{2}/\d{2}/\d{4})", text)
+
+    return {
+        "Invoice Date": invoice_date,
+        "Services From": service_dates[0] if service_dates else "",
+        "Services To": service_dates[-1] if service_dates else "",
+        "Invoice Number": invoice_number,
+        "Amount": amount,
+    }
+
+
+def main():
+    """Append one CSV row per PDF in to_process/ and archive each PDF in processed/."""
+    pdfs = sorted(TO_PROCESS.glob("*.pdf"))
+    if not pdfs:
+        print("No PDFs found in to_process/")
+        return
+
+    # A missing processed/ folder makes shutil.move fail, so create it up front.
+    PROCESSED.mkdir(parents=True, exist_ok=True)
+    csv_path = Path(CSV_FILENAME)
+    write_header = not csv_path.exists()
+    with csv_path.open("a", newline="") as f:
+        writer = csv.DictWriter(f, fieldnames=CSV_HEADERS)
+        if write_header:
+            writer.writeheader()
+        for pdf_path in pdfs:
+            print(f"Processing {pdf_path.name} ...")
+            try:
+                data = extract_invoice_data(pdf_path)
+                shutil.move(str(pdf_path), str(PROCESSED / pdf_path.name))  # archive first: a failed move leaves no row to duplicate on re-run
+                writer.writerow(data)
+                print(f"  -> Invoice {data['Invoice Number']} | {data['Invoice Date']} | {data['Services From']} to {data['Services To']} | AUD {data['Amount']}")
+            except Exception as e:  # report and carry on with the remaining PDFs
+                print(f"  ERROR processing {pdf_path.name}: {e}")
+    print(f"\nDone. Results written to {csv_path}")
+
+
+if __name__ == "__main__":
+    main()