#!/usr/bin/env python3
"""
Extract invoice data from PDFs in to_process/ and append to a dated CSV.
Processed PDFs are moved to processed/.
"""

import csv
import os
import re
import shutil
from datetime import date, datetime
from pathlib import Path

import pdfplumber

# Folder that is scanned for incoming invoice PDFs.
TO_PROCESS = Path("to_process")
# Folder that PDFs are moved into after their row has been written.
PROCESSED = Path("processed")
# Output file is named after today's date (YYYY-MM-DD.csv); evaluated once,
# when the module is loaded.
CSV_FILENAME = f"{date.today():%Y-%m-%d}.csv"
# Column order of the CSV; also the keys of each extracted row dict.
CSV_HEADERS = [
    "Invoice Date",
    "Services From",
    "Services To",
    "Invoice Number",
    "Amount",
]


# Candidate invoice-date tokens such as "16Mar2026": 1-2 digits, a run of
# letters, then a 4-digit year.  The middle part is letters only
# ([^\W\d_] = word character that is neither digit nor underscore) so that
# plain digit runs (phone numbers, ABNs, invoice numbers) are never mistaken
# for a date.
_DATE_TOKEN_RE = re.compile(r"\d{1,2}[^\W\d_]{3,9}\d{4}")
# strptime layouts tried in order: abbreviated month name, then full name.
# NOTE: %b / %B follow the process locale; English names are assumed.
_DATE_FORMATS = ("%d%b%Y", "%d%B%Y")


def _find_invoice_date(text: str) -> str:
    """Return the first parseable date token in *text* as dd/mm/yyyy.

    If date-shaped tokens exist but none of them parses, the first such
    token is returned unchanged.  Returns "" when no token is found.
    """
    first_token = ""
    for match in _DATE_TOKEN_RE.finditer(text):
        token = match.group(0)
        first_token = first_token or token
        for fmt in _DATE_FORMATS:
            try:
                return datetime.strptime(token, fmt).strftime("%d/%m/%Y")
            except ValueError:
                continue  # not this layout; try the next one / next token
    return first_token


def _parse_invoice_text(text: str) -> dict:
    """Pull the CSV fields out of the already-extracted invoice text.

    The patterns expect labels and values to be run together without
    spaces (e.g. "TOTALAUD", "InvoiceNumber"), as the existing regexes do.
    Every field defaults to "" when its pattern is not found.
    """
    # Invoice number: 7 digits after the customer name, otherwise the digits
    # (7 or more) following an "InvoiceNumber" label.
    number_match = re.search(r"JosephBasten\s+(\d{7})", text)
    if not number_match:
        number_match = re.search(r"InvoiceNumber\s*(\d{7,})", text)
    invoice_number = number_match.group(1) if number_match else ""

    # Total amount: kept as the printed string, thousands separators included.
    amount_match = re.search(r"TOTALAUD\s*([\d,]+\.\d{2})", text)
    amount = amount_match.group(1) if amount_match else ""

    # Service dates appear as "@DD/MM/YYYY"; the first and last occurrence in
    # document order are reported (they are not sorted chronologically).
    service_dates = re.findall(r"@(\d{2}/\d{2}/\d{4})", text)
    services_from = service_dates[0] if service_dates else ""
    services_to = service_dates[-1] if service_dates else ""

    return {
        "Invoice Date": _find_invoice_date(text),
        "Services From": services_from,
        "Services To": services_to,
        "Invoice Number": invoice_number,
        "Amount": amount,
    }


def extract_invoice_data(pdf_path: Path) -> dict:
    """Extract the invoice fields from a single PDF.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        A dict with the keys "Invoice Date", "Services From", "Services To",
        "Invoice Number" and "Amount".  All values are strings; a field that
        could not be located is "".

    Raises:
        Whatever ``pdfplumber.open`` / ``extract_text`` raise for an
        unreadable file; nothing is caught here.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() may yield None for a page; treat that as empty text.
        text = "\n".join(page.extract_text() or "" for page in pdf.pages)
    return _parse_invoice_text(text)


def main():
    """Process every PDF in ``to_process/`` and append one CSV row per file.

    For each PDF (in sorted name order) the invoice fields are extracted,
    written to the dated CSV, and the PDF is then moved to ``processed/``.
    A failure on one file is reported and does not stop the remaining files.
    """
    pdfs = sorted(TO_PROCESS.glob("*.pdf"))
    if not pdfs:
        print("No PDFs found in to_process/")
        return

    # Create the destination up front.  Without it every move fails *after*
    # the row has been written, leaving the PDF in to_process/ and producing
    # a duplicate row on the next run.
    PROCESSED.mkdir(parents=True, exist_ok=True)

    csv_path = Path(CSV_FILENAME)
    # A missing file and an existing-but-empty file both need the header row.
    write_header = not csv_path.exists() or csv_path.stat().st_size == 0

    with csv_path.open("a", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=CSV_HEADERS)
        if write_header:
            writer.writeheader()

        for pdf_path in pdfs:
            print(f"Processing {pdf_path.name} ...")
            try:
                data = extract_invoice_data(pdf_path)
                writer.writerow(data)
                # Push the row to disk before the PDF leaves to_process/, so
                # an interrupted run cannot move a file whose row was lost.
                f.flush()
                shutil.move(str(pdf_path), str(PROCESSED / pdf_path.name))
                print(
                    f"  -> Invoice {data['Invoice Number']} | {data['Invoice Date']} | "
                    f"{data['Services From']} to {data['Services To']} | AUD {data['Amount']}"
                )
            except Exception as e:
                # Per-file boundary: report and carry on with the next PDF.
                print(f"  ERROR processing {pdf_path.name}: {e}")

    print(f"\nDone. Results written to {csv_path}")


# Run the batch only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()