#!/usr/bin/env python3
import argparse
import csv
import os
import re
import sys

try:
    import pdfplumber
except Exception as exc:  # pragma: no cover
    print("Error: pdfplumber is required. Install with: pip install -r requirements.txt", file=sys.stderr)
    raise


# Pattern for one decimal digit. It is applied with ``.match`` below, so only
# the beginning of a character's text is tested.
DIGIT_RE = re.compile(r"\d")


def median(values):
    """Return the median of *values* as a float.

    An empty (falsy) input yields ``0.0``. For an even number of items the
    result is the mean of the two middle items after sorting.
    """
    if not values:
        return 0.0

    ordered = sorted(values)
    half, is_odd = divmod(len(ordered), 2)
    if is_odd:
        return float(ordered[half])

    lower, upper = ordered[half - 1], ordered[half]
    return float(lower + upper) / 2.0


def group_chars_into_tokens(chars, line_tolerance=3.5, gap_factor=1.8):
    """
    Cluster digit characters into tokens, first by text line, then by spacing.

    chars: iterable of character dicts; the keys read here are "text", "x0",
        "x1" and "top" (missing keys default to "" / 0.0).
    line_tolerance: maximum distance between a character's "top" and the
        running mean "top" of the current line for it to join that line.
    gap_factor: multiplier applied to the line's median character width to
        obtain the largest horizontal gap allowed inside one token (the
        resulting threshold is never below 7.0).

    Returns a list of token dicts: { text, x0, x1, top, bottom }.
    """
    # Discard everything whose text does not start with a digit, then order
    # the survivors top-to-bottom and left-to-right.
    numeric = sorted(
        (ch for ch in chars if DIGIT_RE.match(ch.get("text", ""))),
        key=lambda ch: (ch.get("top", 0.0), ch.get("x0", 0.0)),
    )
    if not numeric:
        return []

    # Walk the sorted characters, extending the latest line while the
    # character stays within line_tolerance of that line's mean top.
    rows = []
    for ch in numeric:
        if rows:
            members = rows[-1]
            mean_top = sum(m.get("top", 0.0) for m in members) / float(len(members))
            if abs(ch.get("top", 0.0) - mean_top) <= line_tolerance:
                members.append(ch)
                continue
        rows.append([ch])

    tokens = []
    for row in rows:
        row.sort(key=lambda ch: ch.get("x0", 0.0))

        # Typical glyph width on this line; zero/negative widths are ignored
        # and 5.0 is the fallback when nothing usable remains.
        raw_widths = (ch.get("x1", 0.0) - ch.get("x0", 0.0) for ch in row)
        typical_width = median([w for w in raw_widths if w > 0]) or 5.0
        max_gap = max(typical_width * gap_factor, 7.0)

        run = [row[0]]
        previous = row[0]
        for ch in row[1:]:
            if ch.get("x0", 0.0) - previous.get("x1", 0.0) <= max_gap:
                run.append(ch)
            else:
                # Gap too wide: close the current token and start a new one.
                tokens.append(_chars_to_token(run))
                run = [ch]
            previous = ch
        tokens.append(_chars_to_token(run))

    return tokens


def _chars_to_token(chars):
    text = "".join(ch.get("text", "") for ch in chars)
    x0 = min(ch.get("x0", 0.0) for ch in chars)
    x1 = max(ch.get("x1", 0.0) for ch in chars)
    top = min(ch.get("top", 0.0) for ch in chars)
    bottom = max(ch.get("bottom", 0.0) for ch in chars)
    return {"text": text, "x0": x0, "x1": x1, "top": top, "bottom": bottom}


def tokens_to_codes(tokens):
    """Keep only tokens whose digits form exactly six characters.

    Non-digit characters are stripped from each token's text first. Every
    surviving token becomes a dict with the 6-digit string under "code" and
    the token's x0/x1/top/bottom geometry (0.0 where a key is absent).
    """
    non_digit = re.compile(r"\D")
    cleaned = ((non_digit.sub("", tok.get("text", "")), tok) for tok in tokens)
    return [
        {
            "code": digits,
            "x0": tok.get("x0", 0.0),
            "x1": tok.get("x1", 0.0),
            "top": tok.get("top", 0.0),
            "bottom": tok.get("bottom", 0.0),
        }
        for digits, tok in cleaned
        if len(digits) == 6
    ]


def split_into_columns(codes, page_width, num_cols=5):
    """Distribute code dicts over equal-width vertical bands of the page.

    The page width is divided into ``num_cols`` bands; each code goes into
    the band containing the horizontal centre of its x0..x1 span, with
    out-of-range centres clamped to the first or last band.

    Returns a dict mapping band index (0 .. num_cols - 1) to the list of
    codes in that band, ordered by ascending "top".
    """
    band_width = float(page_width) / float(num_cols)
    last_index = num_cols - 1
    buckets = {index: [] for index in range(num_cols)}

    for entry in codes:
        centre = (entry.get("x0", 0.0) + entry.get("x1", 0.0)) / 2.0
        slot = min(max(int(centre // band_width), 0), last_index)
        buckets[slot].append(entry)

    # Smaller "top" sorts first within each band.
    for members in buckets.values():
        members.sort(key=lambda entry: entry.get("top", 0.0))
    return buckets


def parse_rows_per_page(value):
    """Convert a ``--rows-per-page`` value to a positive int or ``"auto"``.

    value: an int, or anything whose ``str()`` is either an integer literal
        or the word "auto" (case-insensitive, surrounding whitespace ignored).

    Returns the positive integer, or the string ``"auto"``.

    Raises argparse.ArgumentTypeError when the value is neither "auto" nor a
    positive integer. Integers passed in directly are validated the same way
    as strings, so zero and negative counts are rejected on every path.
    """
    message = "--rows-per-page must be a positive integer or 'auto'"

    if isinstance(value, int):
        if value <= 0:
            raise argparse.ArgumentTypeError(message)
        return value

    text = str(value).strip().lower()
    if text == "auto":
        return "auto"

    # Only the conversion itself is guarded; the range check stays outside
    # the try so it cannot be confused with a parse failure.
    try:
        number = int(text)
    except ValueError as exc:
        raise argparse.ArgumentTypeError(message) from exc
    if number <= 0:
        raise argparse.ArgumentTypeError(message)
    return number


def process_pdf(pdf_path, rows_per_page=3, num_cols=5):
    """Read every page of a PDF and lay its 6-digit codes out as a grid.

    pdf_path: path handed to ``pdfplumber.open``.
    rows_per_page: fixed number of rows taken from each column on each page,
        or "auto" to use the length of the fullest column on that page.
    num_cols: number of equal-width columns each page is divided into.

    For each page, columns are emitted left to right and, within a column,
    rows top to bottom. Columns with fewer codes than the row count are
    padded with the string "MISSING"; surplus codes are dropped. Row numbers
    continue across pages.

    Returns ``(codes, csv_rows)``: the flat list of code strings in emission
    order, and one dict per code with keys "Column", "Row", "Code", "Page".
    """
    letters = [chr(ord('A') + index) for index in range(num_cols)]
    ordered_codes = []
    records = []
    rows_so_far = 0  # rows contributed by the pages already processed

    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            page_codes = tokens_to_codes(group_chars_into_tokens(page.chars))
            columns = split_into_columns(page_codes, page.width, num_cols=num_cols)

            if rows_per_page == "auto":
                row_count = max((len(columns[index]) for index in range(num_cols)), default=0)
            else:
                row_count = int(rows_per_page)

            for index, letter in enumerate(letters):
                found = [entry["code"] for entry in columns.get(index, [])]
                # Truncate to row_count, then pad any shortfall; the padding
                # list is empty when the column is already long enough.
                cells = found[:row_count] + ["MISSING"] * (row_count - len(found))
                for row_index, code_val in zip(range(row_count), cells):
                    ordered_codes.append(code_val)
                    records.append({
                        "Column": letter,
                        "Row": rows_so_far + row_index + 1,
                        "Code": code_val,
                        "Page": page_num,
                    })

            rows_so_far += row_count

    return ordered_codes, records


def write_outputs(out_prefix, codes, csv_rows):
    """Write the extracted codes to a text file and a CSV file.

    out_prefix: path prefix; ".codes.txt" and ".codes.csv" are appended.
    codes: sequence of code strings, written to the text file on a single
        line joined by ", " and followed by a newline.
    csv_rows: iterable of dicts with the keys Column, Row, Code and Page,
        written under a header row in that column order.

    Returns ``(txt_path, csv_path)``.
    """
    txt_path = f"{out_prefix}.codes.txt"
    csv_path = f"{out_prefix}.codes.csv"

    joined = ", ".join(codes)
    with open(txt_path, "w", encoding="utf-8") as txt_file:
        txt_file.write(joined + "\n")

    with open(csv_path, "w", newline="", encoding="utf-8") as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=["Column", "Row", "Code", "Page"])
        writer.writeheader()
        writer.writerows(csv_rows)

    return txt_path, csv_path


def main():
    """Command-line entry point: parse arguments, extract codes, write files.

    Exits with status 1 when the input PDF does not exist. A non-positive
    ``--cols`` value is rejected through ``parser.error`` (usage message on
    stderr) instead of surfacing later as an uncaught arithmetic or key
    error while pages are being split into columns.

    On success the comma-separated codes and the two output paths are
    printed to stdout.
    """
    parser = argparse.ArgumentParser(description="Extract 6-digit colour codes from a PDF colour card in grid order.")
    parser.add_argument("pdf", help="Path to input PDF file")
    parser.add_argument("--rows-per-page", dest="rows_per_page", default=3, type=parse_rows_per_page,
                        help="Rows per page (default: 3). Use 'auto' to infer from content.")
    parser.add_argument("--cols", dest="num_cols", default=5, type=int,
                        help="Number of columns per page (default: 5).")
    parser.add_argument("--out-prefix", dest="out_prefix", default=None,
                        help="Output prefix (default: input filename without extension)")

    args = parser.parse_args()

    # The page width is divided by the column count downstream, so zero or a
    # negative count can never produce a usable grid.
    if args.num_cols <= 0:
        parser.error("--cols must be a positive integer")

    pdf_path = args.pdf
    if not os.path.isfile(pdf_path):
        print(f"Error: file not found: {pdf_path}", file=sys.stderr)
        sys.exit(1)

    out_prefix = args.out_prefix
    if not out_prefix:
        # Default prefix is the PDF's base name without its extension, so the
        # outputs land in the current working directory.
        out_prefix, _ = os.path.splitext(os.path.basename(pdf_path))

    codes, csv_rows = process_pdf(pdf_path, rows_per_page=args.rows_per_page, num_cols=args.num_cols)
    txt_path, csv_path = write_outputs(out_prefix, codes, csv_rows)

    # Also print the comma-separated line to stdout
    print(", ".join(codes))
    print(f"Wrote: {txt_path}")
    print(f"Wrote: {csv_path}")


# Run the command-line interface only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


