initial upload
This commit is contained in:
21
2026-03-18.csv
Normal file
21
2026-03-18.csv
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
Invoice Date,Services From,Services To,Invoice Number,Amount
|
||||||
|
03/11/2025,28/10/2025,01/11/2025,7107589,981.89
|
||||||
|
08/12/2025,02/12/2025,05/12/2025,7411907,596.97
|
||||||
|
15/12/2025,09/12/2025,13/12/2025,7474557,"1,139.91"
|
||||||
|
22/12/2025,17/12/2025,18/12/2025,7539861,280.93
|
||||||
|
29/12/2025,27/12/2025,27/12/2025,7587913,420.03
|
||||||
|
05/01/2026,29/12/2025,03/01/2026,7632132,"1,104.78"
|
||||||
|
12/01/2026,05/01/2026,10/01/2026,7692780,"1,178.28"
|
||||||
|
12/01/2026,05/01/2026,10/01/2026,7692780,"1,178.28"
|
||||||
|
19/01/2026,12/01/2026,17/01/2026,7752635,"1,227.69"
|
||||||
|
27/01/2026,19/01/2026,24/01/2026,7817540,"1,227.69"
|
||||||
|
27/01/2026,19/01/2026,24/01/2026,7817540,"1,227.69"
|
||||||
|
02/02/2026,29/01/2026,30/01/2026,7877540,561.85
|
||||||
|
02/02/2026,29/01/2026,30/01/2026,7877540,561.85
|
||||||
|
09/02/2026,03/02/2026,05/02/2026,7942095,579.41
|
||||||
|
09/02/2026,03/02/2026,05/02/2026,7942095,579.41
|
||||||
|
16/02/2026,10/02/2026,14/02/2026,8008309,841.42
|
||||||
|
23/02/2026,16/02/2026,21/02/2026,8072274,"1,192.59"
|
||||||
|
03/03/2026,24/02/2026,28/02/2026,8156829,"1,083.98"
|
||||||
|
09/03/2026,03/03/2026,07/03/2026,8206898,"1,017.01"
|
||||||
|
16/03/2026,10/03/2026,12/03/2026,8279890,491.62
|
||||||
|
33
README.md
Normal file
33
README.md
Normal file
@@ -0,0 +1,33 @@
|
|||||||
|
# Invoice Processing Script
|
||||||
|
|
||||||
|
This script extracts invoice data from PDFs and saves it to a CSV file.
|
||||||
|
|
||||||
|
## Setup
|
||||||
|
|
||||||
|
1. Install dependencies:
|
||||||
|
```bash
|
||||||
|
python3 -m venv venv
|
||||||
|
venv/bin/pip install pdfplumber
|
||||||
|
```
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
1. Place unprocessed invoice PDFs in the `to_process/` folder
|
||||||
|
2. Run the script:
|
||||||
|
```bash
|
||||||
|
venv/bin/python process_invoices.py
|
||||||
|
```
|
||||||
|
|
||||||
|
The script will:
|
||||||
|
- Extract invoice data (Invoice Date, Services From, Services To, Invoice Number, Amount)
|
||||||
|
- Append the data to a CSV file named with today's date (e.g., `2026-03-18.csv`)
|
||||||
|
- Move processed PDFs to the `processed/` folder
|
||||||
|
|
||||||
|
## CSV Output
|
||||||
|
|
||||||
|
The CSV contains the following columns:
|
||||||
|
- **Invoice Date**: The date the invoice was issued, formatted as `dd/mm/yyyy`
|
||||||
|
- **Services From**: First service date
|
||||||
|
- **Services To**: Last service date
|
||||||
|
- **Invoice Number**: The invoice number
|
||||||
|
- **Amount**: Total amount in AUD, exactly as printed on the invoice (thousands separators are kept, e.g. `1,139.91`)
|
||||||
95
process_invoices.py
Executable file
95
process_invoices.py
Executable file
@@ -0,0 +1,95 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Extract invoice data from PDFs in to_process/ and append to a dated CSV.
|
||||||
|
Processed PDFs are moved to processed/.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import csv
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import shutil
|
||||||
|
from datetime import date, datetime
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pdfplumber
|
||||||
|
|
||||||
|
# Folder that is scanned for invoice PDFs.
TO_PROCESS = Path("to_process")

# Folder that PDFs are moved into once their data has been extracted.
PROCESSED = Path("processed")

# The output file is named after the day the script runs (ISO date),
# e.g. 2026-03-18.csv.  Evaluated once, at import time.
CSV_FILENAME = date.today().isoformat() + ".csv"

# Column order of the output CSV; the keys of each extracted row match these.
CSV_HEADERS = [
    "Invoice Date",
    "Services From",
    "Services To",
    "Invoice Number",
    "Amount",
]
|
||||||
|
|
||||||
|
|
||||||
|
# English month abbreviations in calendar order; index + 1 is the month number.
# Looking the month up here (instead of strptime's %b) keeps parsing
# independent of the process locale.
_MONTH_ABBREVIATIONS = (
    "jan", "feb", "mar", "apr", "may", "jun",
    "jul", "aug", "sep", "oct", "nov", "dec",
)

# Day, month name, 4-digit year with no separators, e.g. "16Mar2026".
# The month group only accepts real month names: the previous pattern used
# \w+, which also matches digits, so any run of six or more digits (such as
# the 7-digit invoice number) could be picked up as the "date".
_INVOICE_DATE_RE = re.compile(
    r"(\d{1,2})(" + "|".join(_MONTH_ABBREVIATIONS) + r")[a-z]*(\d{4})",
    re.IGNORECASE,
)

# Tried in order.  Both accept 7 or more digits so a longer invoice number is
# captured whole rather than truncated to its first seven digits.
_INVOICE_NUMBER_RES = (
    re.compile(r"JosephBasten\s+(\d{7,})"),
    re.compile(r"InvoiceNumber\s*(\d{7,})"),
)

_AMOUNT_RE = re.compile(r"TOTALAUD\s*([\d,]+\.\d{2})")

# Service lines carry their date as "@DD/MM/YYYY" (no space after the "@").
_SERVICE_DATE_RE = re.compile(r"@(\d{2}/\d{2}/\d{4})")


def _parse_invoice_text(text: str) -> dict:
    """Pull the five CSV fields out of the extracted text of one invoice.

    Every field defaults to an empty string when its pattern is not found.
    """
    # Invoice date: first "16Mar2026"-style token, rewritten as dd/mm/yyyy.
    invoice_date = ""
    date_match = _INVOICE_DATE_RE.search(text)
    if date_match:
        day, month_name, year = date_match.groups()
        month = _MONTH_ABBREVIATIONS.index(month_name.lower()) + 1
        try:
            # Constructing the datetime validates the calendar date.
            datetime(int(year), month, int(day))
            invoice_date = f"{int(day):02d}/{month:02d}/{int(year):04d}"
        except ValueError:
            # Not a real calendar date (e.g. 31Feb2026): keep the raw text so
            # the row can still be corrected by hand.
            invoice_date = date_match.group(0)

    # Invoice number: digits after the customer name, else after the
    # "InvoiceNumber" label.
    invoice_number = ""
    for pattern in _INVOICE_NUMBER_RES:
        number_match = pattern.search(text)
        if number_match:
            invoice_number = number_match.group(1)
            break

    # Total amount, kept exactly as printed (including thousands separators).
    amount = ""
    amount_match = _AMOUNT_RE.search(text)
    if amount_match:
        amount = amount_match.group(1)

    # Service period: earliest and latest service date found in the text.
    services_from = services_to = ""
    service_dates = _SERVICE_DATE_RE.findall(text)
    if service_dates:
        try:
            ordered = sorted(
                service_dates,
                key=lambda value: datetime.strptime(value, "%d/%m/%Y"),
            )
        except ValueError:
            # At least one token is not a valid date; fall back to the order
            # in which the dates appear in the document.
            ordered = service_dates
        services_from, services_to = ordered[0], ordered[-1]

    return {
        "Invoice Date": invoice_date,
        "Services From": services_from,
        "Services To": services_to,
        "Invoice Number": invoice_number,
        "Amount": amount,
    }


def extract_invoice_data(pdf_path: Path) -> dict:
    """Read one invoice PDF and return its CSV row.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        A dict with the keys "Invoice Date", "Services From", "Services To",
        "Invoice Number" and "Amount".  All values are strings; a field that
        cannot be found in the PDF text is an empty string.

    Raises:
        Whatever ``pdfplumber`` raises when the file cannot be opened or read.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() may return None for a page; treat that as empty text.
        text = "\n".join(page.extract_text() or "" for page in pdf.pages)
    return _parse_invoice_text(text)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Process every PDF in ``to_process/`` and append its data to today's CSV.

    For each PDF the extracted row is appended to the CSV named by
    ``CSV_FILENAME`` and the PDF is then moved to ``processed/``.  A failure
    on one file is reported and does not stop the remaining files.

    An invoice whose number is already present in today's CSV is not appended
    again, so a PDF left behind by a failed move, or a second copy of the same
    invoice, no longer produces a repeated row.
    """
    pdfs = sorted(TO_PROCESS.glob("*.pdf"))
    if not pdfs:
        print("No PDFs found in to_process/")
        return

    # shutil.move() does not create the destination folder; without this the
    # move fails on a fresh checkout *after* the row has already been written.
    PROCESSED.mkdir(parents=True, exist_ok=True)

    csv_path = Path(CSV_FILENAME)
    # An existing but empty file still needs its header row.
    write_header = not csv_path.exists() or csv_path.stat().st_size == 0

    # Invoice numbers already recorded in today's CSV.
    seen_numbers = set()
    if not write_header:
        with csv_path.open(newline="", encoding="utf-8") as existing:
            for row in csv.DictReader(existing):
                number = row.get("Invoice Number")
                if number:
                    seen_numbers.add(number)

    with csv_path.open("a", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=CSV_HEADERS)
        if write_header:
            writer.writeheader()

        for pdf_path in pdfs:
            print(f"Processing {pdf_path.name} ...")
            try:
                data = extract_invoice_data(pdf_path)
                number = data["Invoice Number"]
                # Rows without an invoice number are never treated as
                # duplicates of each other.
                duplicate = bool(number) and number in seen_numbers
                if not duplicate:
                    writer.writerow(data)
                    # Flush per file so the CSV matches what has been moved
                    # even if a later file aborts the run.
                    f.flush()
                    if number:
                        seen_numbers.add(number)
                shutil.move(str(pdf_path), str(PROCESSED / pdf_path.name))
            except Exception as e:
                # Deliberate per-file boundary: report and carry on with the
                # next PDF rather than abandoning the whole batch.
                print(f" ERROR processing {pdf_path.name}: {e}")
                continue

            if duplicate:
                print(f" -> Invoice {number} is already in {csv_path}; no new row written")
            else:
                print(f" -> Invoice {data['Invoice Number']} | {data['Invoice Date']} | {data['Services From']} to {data['Services To']} | AUD {data['Amount']}")

    print(f"\nDone. Results written to {csv_path}")
|
||||||
|
|
||||||
|
|
||||||
|
# Run the batch only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||||
Reference in New Issue
Block a user