<?php

require_once 'vendor/autoload.php';

// Fixed extraction script that looks for pages with colour codes
/**
 * Runs Tabula over `colourways.pdf` (next to this script) and prints a report
 * of every standalone six-digit number found in the extracted cell text.
 *
 * All results are written to standard output; nothing is returned to callers.
 */
class FixedColourwayExtractor
{
    /**
     * Entry point: shells out to the Tabula jar, decodes its JSON output and
     * prints the colour-code report.
     *
     * Prints a diagnostic and returns early when the PDF or jar is missing,
     * when the Java process exits with a non-zero status, or when the output
     * file cannot be read. The temporary JSON file is removed in every case.
     *
     * @return void
     */
    public function extractColourways()
    {
        $pdfPath = __DIR__ . '/colourways.pdf';
        $tabulaJar = __DIR__ . '/tabula/tabula-1.0.5-jar-with-dependencies.jar';

        if (!file_exists($pdfPath) || !file_exists($tabulaJar)) {
            echo "Required files not found\n";
            return;
        }

        echo "Extracting colourways with fixed logic...\n\n";

        $outputJson = __DIR__ . '/fixed_output.json';
        $cmd = sprintf(
            'java -jar %s -p all -f JSON -o %s %s',
            escapeshellarg($tabulaJar),
            escapeshellarg($outputJson),
            escapeshellarg($pdfPath)
        );

        // stderr is folded into $out so a failure can be reported below.
        $out = [];
        $code = -1;
        exec($cmd . ' 2>&1', $out, $code);

        // Read (only on success) and then always remove the temporary file so
        // a failed run does not leave stale output behind.
        $contents = false;
        if (file_exists($outputJson)) {
            if ($code === 0) {
                $contents = file_get_contents($outputJson);
            }
            unlink($outputJson);
        }

        if ($code !== 0) {
            echo "Tabula extraction failed (exit code {$code})\n";
            if (!empty($out)) {
                echo implode("\n", $out) . "\n";
            }
            return;
        }

        if ($contents === false) {
            echo "Tabula did not produce a readable output file\n";
            return;
        }

        // A decode failure yields null, which extractFromAllPages() reports
        // as "Invalid JSON data".
        $this->extractFromAllPages(json_decode($contents, true));
    }

    /**
     * Walks every top-level entry of the decoded JSON, collects the six-digit
     * codes of each entry that has a `data` array, and prints the summary.
     *
     * NOTE(review): the array index + 1 is reported as the page number. If
     * Tabula emits one entry per detected table rather than per page, these
     * numbers are table ordinals - confirm against the Tabula output format.
     *
     * @param mixed $jsonData Decoded JSON; anything but an array is rejected.
     * @return void
     */
    private function extractFromAllPages($jsonData)
    {
        if (!is_array($jsonData)) {
            echo "Invalid JSON data\n";
            return;
        }

        $allColourCodes = [];
        $pageStats = [];

        foreach ($jsonData as $pageIndex => $page) {
            // Entries without a usable `data` array are skipped silently but
            // still consume a page number (numbering follows the index).
            if (!isset($page['data']) || !is_array($page['data'])) {
                continue;
            }

            $pageNum = $pageIndex + 1;
            echo "=== PAGE {$pageNum} ===\n";

            $pageCodes = $this->extractFromPage($page['data'], $pageNum);
            $allColourCodes = array_merge($allColourCodes, $pageCodes);

            $pageStats[$pageNum] = count($pageCodes);
            echo "Colour codes found: " . count($pageCodes) . "\n\n";
        }

        // Filter out non-colour codes
        $filteredCodes = $this->filterColourCodes($allColourCodes);

        echo "=== FINAL RESULTS ===\n";
        echo "Total 6-digit numbers found: " . count($allColourCodes) . "\n";
        echo "After filtering: " . count($filteredCodes) . " codes\n";

        echo "\nPage breakdown:\n";
        foreach ($pageStats as $pageNum => $count) {
            echo "Page {$pageNum}: {$count} codes\n";
        }

        echo "\nFirst 20 filtered codes:\n";
        foreach (array_slice($filteredCodes, 0, 20) as $code) {
            echo "  {$code}\n";
        }

        echo "\nLast 20 filtered codes:\n";
        foreach (array_slice($filteredCodes, -20) as $code) {
            echo "  {$code}\n";
        }
    }

    /**
     * Returns the distinct six-digit numbers found in one entry's cell text,
     * in order of first appearance.
     *
     * Rows that are not arrays and cells without a string `text` value are
     * ignored. Digits that are part of a longer run (e.g. seven digits) do
     * not match because of the word boundaries in the pattern.
     *
     * @param array $pageData Rows of cells; each cell may carry a `text` key.
     * @param int   $pageNum  Page number of the entry (currently unused).
     * @return array List of six-digit code strings, unique within this entry.
     */
    private function extractFromPage($pageData, $pageNum)
    {
        $allText = '';

        // Collect all text from the page
        foreach ($pageData as $row) {
            if (!is_array($row)) {
                // Malformed row: skip instead of raising a foreach() warning.
                continue;
            }
            foreach ($row as $cell) {
                if (isset($cell['text']) && is_string($cell['text'])) {
                    $allText .= ' ' . $cell['text'];
                }
            }
        }

        // Find all 6-digit numbers; a falsy result covers both "no matches"
        // and a PCRE failure.
        if (!preg_match_all('/\b\d{6}\b/', $allText, $matches)) {
            return [];
        }

        // array_unique keeps the first occurrence of each code, so order of
        // appearance is preserved; array_values restores list indexing.
        return array_values(array_unique($matches[0]));
    }

    /**
     * Keeps only the codes that isLikelyColourCode() accepts.
     *
     * @param array $allCodes Candidate six-digit code strings.
     * @return array Re-indexed list of accepted codes, original order kept.
     */
    private function filterColourCodes($allCodes)
    {
        return array_values(array_filter($allCodes, [$this, 'isLikelyColourCode']));
    }

    /**
     * Decides whether a six-digit string should be treated as a colour code.
     *
     * @param string $code Candidate code.
     * @return bool False only for the known ID number '100277'.
     */
    private function isLikelyColourCode($code)
    {
        // Skip the obvious ID number from page 1; accept all other 6-digit
        // codes as potential colour codes.
        return $code !== '100277';
    }
}

// Script entry point: build the extractor and run the extraction immediately.
(new FixedColourwayExtractor())->extractColourways();
