#!/usr/bin/env python3
"""
PDFx Research Helper — Academic PDF Research Tool
Wrapper around pdfx for research automation workflows.
"""

import json
import logging
import os
import re
import sys
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import asdict, dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Tuple
from urllib.parse import urlparse

import requests


# ============================================================================
# Configuration
# ============================================================================

# Module logger with its own stream handler so the script logs even when the
# root logger is unconfigured. Guard the handler setup so re-executing this
# module (interactive reload, double import under a different name) does not
# attach a second handler and duplicate every log line.
logger = logging.getLogger(__name__)
if not logger.handlers:
    handler = logging.StreamHandler()
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    handler.setFormatter(formatter)
    logger.addHandler(handler)
logger.setLevel(logging.INFO)


@dataclass
class ResearchConfig:
    """Settings for the research helper: working paths, download limits,
    and the regex patterns used for reference detection.

    Instantiating a config creates its three working directories on disk.
    """
    base_dir: Path = Path(os.path.expanduser("~/Documents/Research"))
    output_dir: Path = Path(os.path.expanduser("~/Documents/Research/output"))
    cache_dir: Path = Path(os.path.expanduser("~/Documents/Research/cache"))

    # Download settings
    max_workers: int = 3  # parallel workers for downloads / link checks
    timeout_sec: int = 30
    verify_ssl: bool = True

    # Reference detection regex patterns
    doi_pattern: str = r"(?i)(doi|https?://doi\.org/)(10\.\S+/\S+)"
    arxiv_pattern: str = r"(?i)(arxiv\.org/abs/)?(\d{4}\.\d{4,5})"

    def __post_init__(self):
        """Ensure every working directory exists (idempotent)."""
        for directory in (self.base_dir, self.output_dir, self.cache_dir):
            directory.mkdir(parents=True, exist_ok=True)


# ============================================================================
# Data Classes
# ============================================================================

@dataclass
class PDFReference:
    """Single reference extracted from a PDF.

    Attributes:
        type: Reference kind — 'url', 'pdf', 'doi', or 'arxiv'.
        value: Normalized URL form of the reference.
        context: Surrounding text from the PDF, if available.
        valid: Link-check result once validated; None while unchecked.
            Declared here because the link validator assigns this attribute
            (previously it was injected dynamically, outside the dataclass
            contract).
    """
    type: str
    value: str
    context: str = ""
    valid: Optional[bool] = None


@dataclass
class PDFMetadata:
    """Extracted PDF document metadata.

    All fields default to empty so a partially-populated result is valid.
    """
    title: str = ""
    author: str = ""
    creator: str = ""
    creation_date: str = ""
    pages: int = 0
    producer: str = ""
    # default_factory gives each instance its own dict. The previous
    # `custom_fields: Dict = None` default mis-declared the type and relied
    # entirely on __post_init__ to repair it.
    custom_fields: Dict = field(default_factory=dict)

    def __post_init__(self):
        # Backward compatibility: callers may still pass None explicitly.
        if self.custom_fields is None:
            self.custom_fields = {}


@dataclass
class PDFAnalysis:
    """Complete analysis result for a single PDF."""
    source: str  # file path or URL the PDF came from
    metadata: PDFMetadata
    references: List[PDFReference]
    text_length: int = 0
    analysis_date: str = ""

    def to_dict(self):
        """Return a JSON-serializable dict representation of this analysis."""
        serialized_refs = [
            {'type': ref.type, 'value': ref.value, 'context': ref.context}
            for ref in self.references
        ]
        return {
            'source': self.source,
            'metadata': asdict(self.metadata),
            'references': serialized_refs,
            'text_length': self.text_length,
            'analysis_date': self.analysis_date,
        }


# ============================================================================
# PDF Extraction (using built-in libraries, no pdfx dependency)
# ============================================================================

class PDFExtractor:
    """Extract metadata and text from local PDF files.

    Tries available third-party PDF libraries in order of text-extraction
    quality: pdfplumber first, then PyPDF2, then pypdf (the maintained
    successor of PyPDF2, which exposes the same PdfReader API).
    """

    @staticmethod
    def _try_import_pdf_libs():
        """Return a dict of importable PDF libraries, keyed by name.

        Values are what the extraction helpers need: the pdfplumber module
        itself, or a PdfReader class for PyPDF2 / pypdf.
        """
        libs = {}
        try:
            import pdfplumber
            libs['pdfplumber'] = pdfplumber
        except ImportError:
            pass

        try:
            from PyPDF2 import PdfReader
            libs['PyPDF2'] = PdfReader
        except ImportError:
            pass

        try:
            import pypdf
            libs['pypdf'] = pypdf.PdfReader
        except ImportError:
            pass

        return libs

    @staticmethod
    def extract_from_file(pdf_path: str) -> Tuple[Optional[PDFMetadata], str]:
        """Extract (metadata, text) from a local PDF.

        Returns:
            (PDFMetadata, text) on success; (None, "") when no PDF library
            is installed or extraction fails.

        Raises:
            FileNotFoundError: if pdf_path does not exist.
        """
        pdf_path = Path(pdf_path)
        if not pdf_path.exists():
            raise FileNotFoundError(f"PDF not found: {pdf_path}")

        libs = PDFExtractor._try_import_pdf_libs()

        if not libs:
            logger.error("No PDF library available. Install pdfplumber or PyPDF2")
            return None, ""

        # Try pdfplumber first (best text extraction)
        if 'pdfplumber' in libs:
            return PDFExtractor._extract_pdfplumber(pdf_path, libs['pdfplumber'])

        # Fall back to PyPDF2
        if 'PyPDF2' in libs:
            return PDFExtractor._extract_pypdf2(pdf_path, libs['PyPDF2'])

        # BUG FIX: pypdf was detected by _try_import_pdf_libs but never
        # consulted here, so systems with only pypdf installed got (None, "")
        # despite having a working library. pypdf's PdfReader is
        # API-compatible with PyPDF2's, so it shares the same helper.
        if 'pypdf' in libs:
            return PDFExtractor._extract_pypdf2(pdf_path, libs['pypdf'])

        return None, ""

    @staticmethod
    def _extract_pdfplumber(pdf_path: Path, pdfplumber):
        """Extract (metadata, text) using pdfplumber; (None, "") on failure."""
        try:
            with pdfplumber.open(pdf_path) as pdf:
                # Metadata
                metadata = PDFMetadata(
                    title=pdf.metadata.get('Title', ''),
                    author=pdf.metadata.get('Author', ''),
                    creator=pdf.metadata.get('Creator', ''),
                    creation_date=str(pdf.metadata.get('CreationDate', '')),
                    pages=len(pdf.pages),
                    producer=pdf.metadata.get('Producer', ''),
                    custom_fields=dict(pdf.metadata) if pdf.metadata else {}
                )

                # Text extraction; extract_text() can return None for a page
                text = '\n\n'.join(
                    page.extract_text() or '' for page in pdf.pages
                )

                logger.info(f"Extracted {len(text)} chars from {pdf_path.name}")
                return metadata, text
        except Exception as e:
            logger.error(f"pdfplumber extraction failed: {e}")
            return None, ""

    @staticmethod
    def _extract_pypdf2(pdf_path: Path, PdfReader):
        """Extract (metadata, text) using a PyPDF2/pypdf-style PdfReader."""
        try:
            with open(pdf_path, 'rb') as f:
                reader = PdfReader(f)

                # Metadata (keys use the raw PDF '/Name' form)
                meta = reader.metadata or {}
                metadata = PDFMetadata(
                    title=meta.get('/Title', ''),
                    author=meta.get('/Author', ''),
                    creator=meta.get('/Creator', ''),
                    creation_date=str(meta.get('/CreationDate', '')),
                    pages=len(reader.pages),
                    producer=meta.get('/Producer', ''),
                    custom_fields=dict(meta)
                )

                # Text extraction; guard None like the pdfplumber path so a
                # single unextractable page cannot break the join.
                text = '\n\n'.join(
                    page.extract_text() or '' for page in reader.pages
                )

                logger.info(f"Extracted {len(text)} chars from {pdf_path.name}")
                return metadata, text
        except Exception as e:
            logger.error(f"PyPDF2 extraction failed: {e}")
            return None, ""


# ============================================================================
# Reference Detection
# ============================================================================

class ReferenceExtractor:
    """Extract DOI, arXiv, PDF, and generic URL references from PDF text."""

    # BUG FIX: the previous DOI pattern used a lazy match with a lookahead on
    # [,;.] which truncated any DOI containing internal dots (very common,
    # e.g. 10.1016/j.cell.2020.01.001 matched only "10.1016/j"), and the bare
    # "doi" prefix allowed no separator, so "doi: 10.xxxx/..." never matched.
    # Per Crossref guidance, registrant prefixes are 4-9 digits; trailing
    # punctuation is stripped after matching instead of guessed in the regex.
    DOI_PATTERN = re.compile(r"(?i)(doi:?\s*|https?://doi\.org/)(10\.\d{4,9}/\S+)")
    ARXIV_PATTERN = re.compile(r"((?:https?://)?(?:www\.)?arxiv\.org/(?:abs|pdf)/)?(\d{4}\.\d{4,5})")
    URL_PATTERN = re.compile(
        r'https?://(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&/=]*)'
    )
    # Lookahead (rather than a consuming group) keeps the trailing '?' or
    # whitespace delimiter out of the matched URL.
    PDF_EXTENSION_PATTERN = re.compile(r'https?://\S+\.pdf(?=\?|$|\s)', re.IGNORECASE)

    @staticmethod
    def extract_references(text: str) -> List[PDFReference]:
        """Extract all reference types from text, deduplicated by value.

        Returns references in detection order: DOIs, arXiv IDs, direct PDF
        links, then remaining URLs. Each reference carries +/-50 chars of
        surrounding context.
        """
        references: List[PDFReference] = []
        seen = set()

        def _add(ref_type: str, value: str, match) -> None:
            # One entry per unique normalized value, regardless of type
            # (previously only generic URLs were checked for duplicates).
            if value in seen:
                return
            seen.add(value)
            references.append(PDFReference(
                type=ref_type,
                value=value,
                context=text[max(0, match.start() - 50):match.end() + 50],
            ))

        # DOIs, normalized to https://doi.org/ form
        for match in ReferenceExtractor.DOI_PATTERN.finditer(text):
            doi = match.group(2).strip('.,;)')
            _add('doi', f"https://doi.org/{doi}", match)

        # arXiv IDs, normalized to the abstract page URL
        for match in ReferenceExtractor.ARXIV_PATTERN.finditer(text):
            _add('arxiv', f"https://arxiv.org/abs/{match.group(2)}", match)

        # Direct PDF links first, so they win the type over plain 'url'
        for match in ReferenceExtractor.PDF_EXTENSION_PATTERN.finditer(text):
            _add('pdf', match.group(0).strip(), match)

        # Remaining generic URLs
        for match in ReferenceExtractor.URL_PATTERN.finditer(text):
            _add('url', match.group(0).strip('.,;)'), match)

        logger.info(f"Extracted {len(references)} references")
        return references


# ============================================================================
# Download Management
# ============================================================================

class PDFDownloader:
    """Download PDFs referenced by an analysis."""

    def __init__(self, config: ResearchConfig):
        self.config = config

    def download_pdfs(self, references: List[PDFReference], 
                     output_dir: Optional[Path] = None) -> Dict[str, str]:
        """Download all 'pdf'-type references in parallel.

        Args:
            references: References from an analysis; only type == 'pdf'
                entries are fetched.
            output_dir: Target directory; defaults to <output_dir>/downloads.

        Returns:
            Mapping of source URL -> local file path for successful downloads
            (failed downloads are logged and omitted).
        """
        output_dir = output_dir or self.config.output_dir / "downloads"
        output_dir.mkdir(parents=True, exist_ok=True)

        pdf_refs = [r for r in references if r.type == 'pdf']
        logger.info(f"Downloading {len(pdf_refs)} PDFs...")

        results = {}
        with ThreadPoolExecutor(max_workers=self.config.max_workers) as executor:
            futures = {
                executor.submit(self._download_single, ref, output_dir): ref
                for ref in pdf_refs
            }

            for future in as_completed(futures):
                ref = futures[future]
                try:
                    local_path = future.result()
                    if local_path:
                        results[ref.value] = local_path
                except Exception as e:
                    logger.error(f"Failed to download {ref.value}: {e}")

        return results

    def _download_single(self, ref: PDFReference, output_dir: Path) -> Optional[str]:
        """Download one PDF; return its local path, or None on failure."""
        try:
            resp = requests.get(
                ref.value,
                timeout=self.config.timeout_sec,
                verify=self.config.verify_ssl
            )

            if resp.status_code != 200:
                logger.warning(f"Failed to download {ref.value}: {resp.status_code}")
                return None

            # Derive a filename from the URL path; fall back to a generic name.
            parsed = urlparse(ref.value)
            filename = Path(parsed.path).name or "document.pdf"

            output_path = output_dir / filename
            output_path.write_bytes(resp.content)

            # BUG FIX: the success log previously printed the literal string
            # "(unknown)" (a templating artifact) instead of the saved path.
            logger.info(f"Downloaded: {output_path}")
            return str(output_path)

        except Exception as e:
            logger.error(f"Download error for {ref.value}: {e}")
            return None


# ============================================================================
# Link Validation
# ============================================================================

class LinkValidator:
    """Check referenced URLs for broken links."""

    def __init__(self, config: ResearchConfig):
        self.config = config

    def validate_links(self, references: List[PDFReference]) -> Dict[str, bool]:
        """Check whether all reference URLs are still live, in parallel.

        Returns:
            Mapping of URL -> is_valid. Validation errors count as invalid.
        """
        url_refs = [r for r in references if r.type in ('url', 'pdf', 'doi', 'arxiv')]
        logger.info(f"Validating {len(url_refs)} URLs...")

        results = {}
        with ThreadPoolExecutor(max_workers=self.config.max_workers) as executor:
            futures = {
                executor.submit(self._validate_single, ref): ref
                for ref in url_refs
            }

            for future in as_completed(futures):
                ref = futures[future]
                try:
                    is_valid = future.result()
                    results[ref.value] = is_valid
                except Exception as e:
                    logger.error(f"Validation error for {ref.value}: {e}")
                    results[ref.value] = False

        return results

    def _validate_single(self, ref: PDFReference) -> bool:
        """Return True when the URL responds with a status below 400.

        Uses a HEAD request first; some servers reject HEAD outright
        (405 Method Not Allowed / 501 Not Implemented), so fall back to a
        streamed GET in that case rather than reporting a false negative.
        """
        try:
            resp = requests.head(
                ref.value,
                timeout=self.config.timeout_sec,
                verify=self.config.verify_ssl,
                allow_redirects=True
            )
            if resp.status_code in (405, 501):
                # stream=True checks the status without downloading the body
                resp = requests.get(
                    ref.value,
                    timeout=self.config.timeout_sec,
                    verify=self.config.verify_ssl,
                    allow_redirects=True,
                    stream=True
                )
                resp.close()
            return resp.status_code < 400
        except Exception:
            return False


# ============================================================================
# Main Research Helper
# ============================================================================

class ResearchHelper:
    """High-level orchestration of the PDF research workflow."""

    def __init__(self, config: Optional[ResearchConfig] = None):
        self.config = config if config is not None else ResearchConfig()
        self.extractor = PDFExtractor()
        self.ref_extractor = ReferenceExtractor()
        self.downloader = PDFDownloader(self.config)
        self.validator = LinkValidator(self.config)

    def analyze_pdf(self, pdf_source: str, extract_text: bool = False,
                   validate_links: bool = False) -> PDFAnalysis:
        """Run a full analysis: metadata, references, optional link checks.

        Args:
            pdf_source: Path to the PDF file.
            extract_text: When True, record the extracted text length.
            validate_links: When True, check each reference URL and record
                the result on the reference.

        Raises:
            RuntimeError: if metadata extraction fails.
        """
        logger.info(f"Analyzing: {pdf_source}")

        metadata, text = self.extractor.extract_from_file(pdf_source)
        if not metadata:
            raise RuntimeError(f"Failed to extract from {pdf_source}")

        references = self.ref_extractor.extract_references(text)

        if validate_links:
            link_status = self.validator.validate_links(references)
            for reference in references:
                if reference.value in link_status:
                    reference.valid = link_status[reference.value]

        return PDFAnalysis(
            source=pdf_source,
            metadata=metadata,
            references=references,
            text_length=len(text) if extract_text else 0,
            analysis_date=datetime.now().isoformat(),
        )

    def save_analysis(self, analysis: PDFAnalysis, output_file: str):
        """Write the analysis to output_file as pretty-printed JSON."""
        output_path = Path(output_file)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        with open(output_path, 'w') as fh:
            json.dump(analysis.to_dict(), fh, indent=2)

        logger.info(f"Saved analysis to {output_path}")

    def download_referenced_pdfs(self, analysis: PDFAnalysis,
                                output_dir: Optional[str] = None) -> Dict[str, str]:
        """Download every PDF referenced in the analysis; return URL -> path."""
        if output_dir:
            target = Path(output_dir)
        else:
            target = self.config.output_dir / "pdfs"
        return self.downloader.download_pdfs(analysis.references, target)


# ============================================================================
# CLI
# ============================================================================

if __name__ == '__main__':
    import argparse

    # Command-line front end: analyze one PDF, optionally saving the JSON
    # report, downloading referenced PDFs, and validating links.
    cli = argparse.ArgumentParser(description='PDF Research Helper')
    cli.add_argument('pdf', help='PDF file path or URL')
    cli.add_argument('-o', '--output', help='Output JSON file')
    cli.add_argument('-d', '--download', metavar='DIR',
                     help='Download referenced PDFs to directory')
    cli.add_argument('-c', '--check-links', action='store_true',
                     help='Validate all URLs')
    cli.add_argument('-v', '--verbose', action='store_true',
                     help='Verbose logging')
    args = cli.parse_args()

    if args.verbose:
        logger.setLevel(logging.DEBUG)

    helper = ResearchHelper()

    try:
        analysis = helper.analyze_pdf(
            args.pdf,
            extract_text=False,
            validate_links=args.check_links,
        )

        # Emit the report: to file when -o is given, otherwise to stdout.
        if args.output:
            helper.save_analysis(analysis, args.output)
        else:
            print(json.dumps(analysis.to_dict(), indent=2))

        if args.download:
            downloads = helper.download_referenced_pdfs(analysis, args.download)
            logger.info(f"Downloaded {len(downloads)} PDFs")

    except Exception as e:
        logger.error(f"Error: {e}")
        sys.exit(1)
