#!/usr/bin/env python3
"""
Advanced PDF color code extractor using PyMuPDF (fitz) for better image detection.
This version uses PyMuPDF's superior vector graphics and image extraction capabilities.
"""

import argparse
import re
import math
import os
import sys
from pathlib import Path
import fitz  # PyMuPDF
import numpy as np
from PIL import Image
import io

def _page_codes(page, page_num, code_pattern):
    """Return the code spans found on *page*, sorted top-to-bottom, left-to-right."""
    found = []
    for block in page.get_text("dict")["blocks"]:
        # Only blocks that carry a "lines" entry hold text spans.
        for line in block.get("lines", ()):
            for span in line["spans"]:
                # Normalize text: drop all whitespace, keep only pure 6-digit runs
                normalized = re.sub(r'\s+', '', span["text"].strip())
                if code_pattern.fullmatch(normalized):
                    found.append({
                        'code': normalized,
                        'bbox': span["bbox"],  # (x0, y0, x1, y1)
                        'page': page_num,
                    })
    found.sort(key=lambda item: (item['bbox'][1], item['bbox'][0]))
    return found


def extract_codes_with_pymupdf(pdf_path, out_prefix, export_crops_dir=None, dpi=192):
    """
    Extract 6-digit color codes from a PDF and write them to text and CSV files.

    Every text span whose whitespace-stripped content is exactly six digits is
    treated as a code. Codes are collected page by page, ordered top-to-bottom
    then left-to-right within each page. For every code a swatch region is
    looked up with ``find_swatch_advanced``; when ``export_crops_dir`` is given
    the region is rendered and saved with ``crop_swatch_pymupdf``.

    Two files are written:
      * ``<out_prefix>.codes.txt`` - all codes joined by ", ".
      * ``<out_prefix>.codes.csv`` - ``Column,Row,Code`` rows, where the column
        letter cycles A-E and the row number increases every five codes.

    Args:
        pdf_path: Path of the PDF to open with PyMuPDF.
        out_prefix: Prefix used to build both output file names.
        export_crops_dir: Optional directory for swatch PNGs (created if missing).
        dpi: Rendering resolution used for exported swatches.

    Returns:
        List of code strings in extraction order.
    """
    code_pattern = re.compile(r'\d{6}')
    all_codes = []

    # Create crops directory if specified
    if export_crops_dir:
        os.makedirs(export_crops_dir, exist_ok=True)

    doc = fitz.open(pdf_path)
    try:
        for page_num, page in enumerate(doc):
            print(f"Processing page {page_num + 1}...")

            page_rect = page.rect
            page_width = page_rect.width
            page_height = page_rect.height

            for code_info in _page_codes(page, page_num, code_pattern):
                code = code_info['code']
                bbox = code_info['bbox']

                print(f"  Processing code: {code} at ({bbox[0]:.1f}, {bbox[1]:.1f})")

                # Try multiple methods to find the swatch image
                swatch_bbox = find_swatch_advanced(page, bbox, page_width, page_height)

                if swatch_bbox and export_crops_dir:
                    crop_swatch_pymupdf(page, swatch_bbox, code, export_crops_dir, dpi)

                all_codes.append(code)
    finally:
        # Release the document even when a page fails part-way through.
        doc.close()

    # Write .txt file (explicit encoding keeps output independent of the locale)
    txt_path = f"{out_prefix}.codes.txt"
    with open(txt_path, 'w', encoding='utf-8') as f:
        f.write(', '.join(all_codes))
    print(f"Wrote: {txt_path}")

    # Write .csv file
    csv_path = f"{out_prefix}.codes.csv"
    with open(csv_path, 'w', encoding='utf-8') as f:
        f.write("Column,Row,Code\n")
        f.writelines(
            # Five codes per row, columns lettered A-E
            f"{chr(ord('A') + (i % 5))},{(i // 5) + 1},{code}\n"
            for i, code in enumerate(all_codes)
        )
    print(f"Wrote: {csv_path}")

    return all_codes

def _find_vector_swatch(page, code_bbox, max_dy=50):
    """Return the best-scoring drawing rectangle left of the code, or None."""
    x0, y0 = code_bbox[0], code_bbox[1]
    best_rect = None
    best_score = 0

    for drawing in page.get_drawings():
        rect_bbox = drawing["rect"]
        rx0, ry0, rx1, ry1 = rect_bbox
        width = rx1 - rx0
        height = ry1 - ry0

        # Zero-size or inverted paths cannot be swatches; skipping them also
        # avoids dividing by zero in the aspect-ratio term below.
        if width <= 0 or height <= 0:
            continue

        # Candidate must end left of the code and start at a similar height
        if rx1 < x0 and abs(ry0 - y0) < max_dy:
            distance = x0 - rx1
            aspect_ratio = min(width, height) / max(width, height)

            # Prefer squares that are close and reasonably sized
            score = aspect_ratio * (width * height) / (distance + 1)
            if score > best_score:
                best_score = score
                best_rect = rect_bbox

    return best_rect


def _find_image_swatch(page, code_bbox, max_dy=50):
    """Return the first embedded-image placement left of the code, or None."""
    x0, y0 = code_bbox[0], code_bbox[1]

    for img_index, img in enumerate(page.get_images()):
        try:
            # get_image_rects expects the image xref (img[0]), not the
            # position of the entry in the get_images() list.
            img_rects = page.get_image_rects(img[0])
        except Exception as e:
            print(f"    Error processing image {img_index}: {e}")
            continue

        # The same image may be placed several times on the page.
        for img_rect in img_rects:
            ix0, iy0, ix1, iy1 = img_rect
            if ix1 < x0 and abs(iy0 - y0) < max_dy:
                return img_rect

    return None


def _find_rendered_swatch(page, code_bbox, page_width, page_height):
    """Render the area left of the code and return the region picked by find_swatch_in_image, or None."""
    x0, y0 = code_bbox[0], code_bbox[1]
    search_margin = 100

    left = max(0, x0 - search_margin)
    top = max(0, y0 - 50)
    right = min(page_width, x0)
    bottom = min(page_height, y0 + 100)
    if right <= left or bottom <= top:
        # Nothing to render (e.g. the code sits on the left page edge).
        return None

    search_rect = fitz.Rect(left, top, right, bottom)
    mat = fitz.Matrix(2.0, 2.0)  # 2x zoom for better detail
    pix = page.get_pixmap(matrix=mat, clip=search_rect)
    pil_img = Image.open(io.BytesIO(pix.tobytes("png")))
    return find_swatch_in_image(pil_img, search_rect, mat)


def find_swatch_advanced(page, code_bbox, page_width, page_height):
    """
    Locate the swatch region belonging to a code, trying four strategies in order.

    1. Vector drawings: the drawing rectangle left of the code (within 50
       points vertically) with the best squareness * area / distance score.
    2. Embedded images: the first image placement left of the code within the
       same vertical tolerance.
    3. Rendered analysis: rasterize the area left of the code and let
       ``find_swatch_in_image`` pick a region.
    4. Fallback: a fixed-offset rectangle left of the code, clamped to the page.

    A failure inside one strategy is reported and the next one is tried.

    Args:
        page: PyMuPDF page to inspect.
        code_bbox: (x0, y0, x1, y1) of the code's text span in page coordinates.
        page_width: Page width in points.
        page_height: Page height in points.

    Returns:
        A rectangle-like value (rect object or 4-tuple) in page coordinates.
        The fallback rectangle may be empty when the code touches the left
        page edge.
    """
    x0, y0 = code_bbox[0], code_bbox[1]

    # Method 1: Look for vector rectangles near the code
    try:
        best_rect = _find_vector_swatch(page, code_bbox)
    except Exception as e:
        print(f"    Error getting drawings: {e}")
    else:
        if best_rect is not None:
            print(f"    Found vector rectangle: {best_rect}")
            return best_rect

    # Method 2: Look for images near the code
    try:
        img_rect = _find_image_swatch(page, code_bbox)
    except Exception as e:
        print(f"    Error getting images: {e}")
    else:
        if img_rect is not None:
            print(f"    Found image: {img_rect}")
            return img_rect

    # Method 3: Render the neighbourhood and analyze the pixels
    try:
        swatch_bbox = _find_rendered_swatch(page, code_bbox, page_width, page_height)
    except Exception as e:
        print(f"    Error analyzing rendered region: {e}")
    else:
        if swatch_bbox:
            print(f"    Found swatch via image analysis: {swatch_bbox}")
            return swatch_bbox

    # Method 4: Fallback - fixed offset based on typical layout, kept on the
    # page and never inverted.
    left = max(0, x0 - 60)                           # 60 points to the left
    top = max(0, y0 - 10)                            # slightly above the code
    right = max(left, min(page_width, x0 - 10))      # 10 points left of code
    bottom = max(top, min(page_height, y0 + 50))     # extends 50 points below
    fallback_rect = fitz.Rect(left, top, right, bottom)

    print(f"    Using fallback rectangle: {fallback_rect}")
    return fallback_rect

def find_swatch_in_image(pil_img, search_rect, matrix):
    """
    Analyze PIL image to find the most likely swatch region.
    """
    # Convert to RGB if needed
    if pil_img.mode != 'RGB':
        pil_img = pil_img.convert('RGB')
    
    img_array = np.array(pil_img)
    height, width = img_array.shape[:2]
    
    # Look for regions with high color variance (not white/black)
    best_region = None
    best_score = 0
    
    # Search in a grid pattern
    step_size = 20
    for y in range(0, height - 40, step_size):
        for x in range(0, width - 40, step_size):
            # Extract region
            region = img_array[y:y+40, x:x+40]
            
            # Calculate color variance (higher = more colorful)
            color_variance = np.var(region)
            
            # Calculate brightness (avoid very dark or very light)
            brightness = np.mean(region)
            
            # Score combines variance and reasonable brightness
            if 50 < brightness < 200:  # Not too dark, not too light
                score = color_variance * (1 - abs(brightness - 128) / 128)
                
                if score > best_score:
                    best_score = score
                    # Convert back to PDF coordinates
                    pdf_x0 = search_rect.x0 + (x / matrix.a)
                    pdf_y0 = search_rect.y0 + (y / matrix.d)
                    pdf_x1 = search_rect.x0 + ((x + 40) / matrix.a)
                    pdf_y1 = search_rect.y0 + ((y + 40) / matrix.d)
                    
                    best_region = (pdf_x0, pdf_y0, pdf_x1, pdf_y1)
    
    return best_region

def crop_swatch_pymupdf(page, swatch_bbox, code, export_crops_dir, dpi):
    """
    Render a swatch region, center-crop it to a square and save it as PNG.

    The region is rasterized at ``dpi`` (PDF points are 1/72 inch), trimmed to
    its shorter side around the center, and written to
    ``<export_crops_dir>/swatch_<code>.png``. Any failure is reported on
    stdout and swallowed so that one bad swatch does not stop the run.
    """
    try:
        scale = dpi / 72.0  # 72 is default DPI
        rendered = page.get_pixmap(matrix=fitz.Matrix(scale, scale), clip=swatch_bbox)

        # Round-trip through PNG bytes to obtain a PIL image
        image = Image.open(io.BytesIO(rendered.tobytes("png")))

        # Square center crop limited by the shorter edge
        img_w, img_h = image.size
        side = min(img_w, img_h)
        x_off = (img_w - side) // 2
        y_off = (img_h - side) // 2
        square = image.crop((x_off, y_off, x_off + side, y_off + side))

        target = os.path.join(export_crops_dir, f"swatch_{code}.png")
        square.save(target, "PNG")
        print(f"    Saved swatch: {target}")

    except Exception as e:
        print(f"    Error cropping swatch for {code}: {e}")

def main():
    """
    Command-line entry point: parse arguments and run the extraction.

    Exits with status 1 when the given path is not an existing regular file,
    and with argparse's usage error (status 2) when ``--dpi`` is not positive.
    """
    parser = argparse.ArgumentParser(description='Extract color codes from PDF using PyMuPDF')
    parser.add_argument('pdf', help='Path to PDF file')
    parser.add_argument('--out-prefix', default='output', help='Output file prefix')
    parser.add_argument('--export-crops-dir', help='Directory to save cropped swatches')
    parser.add_argument('--dpi', type=int, default=192, help='DPI for image export')

    args = parser.parse_args()

    # A zero or negative DPI would give a degenerate render scale.
    if args.dpi <= 0:
        parser.error('--dpi must be a positive integer')

    # isfile (not exists) so that a directory is rejected here instead of
    # failing later when the document is opened.
    if not os.path.isfile(args.pdf):
        print(f"Error: PDF file not found: {args.pdf}")
        sys.exit(1)

    print(f"Extracting codes from: {args.pdf}")
    codes = extract_codes_with_pymupdf(
        args.pdf,
        args.out_prefix,
        args.export_crops_dir,
        args.dpi,
    )

    print(f"Extracted {len(codes)} color codes")

# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()
