#!/usr/bin/env python3
"""
Monthly Periodicals Checker

Checks configured periodical sources for new issues and downloads them.
Downloads TWO versions when available:
  1. Machine-readable (for AI processing): txt > md > epub > html
  2. Human-readable (for reading): epub > pdf > docx

Usage: python3 periodicals-check.py [--dry-run] [--force]
"""

import json
import os
import re
import sys
import urllib.request
import urllib.parse
from datetime import datetime
from pathlib import Path

# JSON config listing the periodicals to check (consumed by load_config).
CONFIG_PATH = Path.home() / ".openclaw/workspace/data/periodicals.json"
# Markdown log that each run's results are appended to (see log_results).
LOG_PATH = Path.home() / ".openclaw/workspace/memory/periodicals-log.md"

# Format priority for machine reading (best first)
MACHINE_PRIORITY = ["txt", "md", "epub", "html"]

# Format priority for human reading (best first)
HUMAN_PRIORITY = ["epub", "pdf", "mobi", "docx", "rtf"]

def load_config():
    """Read and parse the periodicals configuration from CONFIG_PATH."""
    return json.loads(CONFIG_PATH.read_text())

def get_base_path(config):
    """Return the download root from settings.basePath with '~' expanded."""
    raw = config["settings"]["basePath"]
    return Path(raw).expanduser()

def fetch_page(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    Sends a browser-like User-Agent since some sites reject the default
    urllib agent string.
    """
    request = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
    with urllib.request.urlopen(request, timeout=30) as resp:
        body = resp.read()
    return body.decode("utf-8")

def find_format_links(html, base_url):
    """Extract downloadable format links from HTML.

    Scans href attributes for known document extensions and returns a dict
    mapping lowercase extension -> absolute URL. When several links share an
    extension, a link that looks like an actual issue (contains "issue" or a
    month abbreviation) wins over the last one seen.

    Args:
        html: Page markup to scan.
        base_url: URL the page was fetched from; used to resolve relative links.

    Returns:
        dict[str, str] of extension -> absolute URL (possibly empty).
    """
    pattern = r'href="([^"]*\.(pdf|epub|mobi|txt|md|html|rtf|docx?|djvu)[^"]*)"'
    links = {}
    for url, ext in re.findall(pattern, html, re.IGNORECASE):
        ext = ext.lower()

        # Skip navigation .html links (article pages, query URLs, shallow paths)
        if ext == "html" and ("article" in url or "?" in url or url.count("/") < 2):
            continue

        # urljoin handles absolute, root-relative, protocol-relative and
        # plain relative URLs correctly — including a base URL with no path
        # component, where the old rsplit('/') logic produced "http:/file.pdf".
        full_url = urllib.parse.urljoin(base_url, url)

        # Prefer "issue" files over indexes
        lowered = url.lower()
        is_issue = "issue" in lowered or any(m in lowered for m in ["jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec"])

        if ext not in links or is_issue:
            links[ext] = full_url

    return links

def download_file(url, dest_path):
    """Download *url* to *dest_path* and return the saved file's size in bytes."""
    request = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
    # Keep the response and the output file open together so the transfer
    # streams straight into the destination.
    with urllib.request.urlopen(request, timeout=120) as response, open(dest_path, "wb") as out:
        out.write(response.read())
    return os.path.getsize(dest_path)

def get_current_month_filename(pattern, ext):
    """Build the 'YYYY-MM-<pattern>.<ext>' filename for the current month."""
    stamp = datetime.now().strftime("%Y-%m")
    return f"{stamp}-{pattern}.{ext}"

def select_best_format(available, priority_list):
    """Return (format, url) for the highest-priority available format.

    Returns (None, None) when nothing in *priority_list* is available.
    """
    chosen = next((fmt for fmt in priority_list if fmt in available), None)
    if chosen is None:
        return None, None
    return chosen, available[chosen]

def check_periodical(periodical, base_path, dry_run=False, force=False):
    """Check and download a single periodical in best available formats.

    Fetches the periodical's source page, picks the best machine-readable and
    human-readable formats per the priority lists, and downloads each into
    base_path/<folder> with a YYYY-MM-prefixed filename.

    Args:
        periodical: Config dict with "name", "folder", "sourceUrl", "filePattern".
        base_path: Root directory that per-periodical folders live under.
        dry_run: When True, report what would be downloaded without downloading.
        force: When True, re-download even if this month's file already exists.

    Returns:
        A result dict with "name", "downloads", "skipped", "errors" and (when
        the page was parsed) "available_formats". Never raises: all failures
        are captured into "errors".
    """
    name = periodical["name"]
    folder = periodical["folder"]
    source_url = periodical["sourceUrl"]
    pattern = periodical["filePattern"]

    dest_dir = base_path / folder
    dest_dir.mkdir(parents=True, exist_ok=True)

    results = {"name": name, "downloads": [], "skipped": [], "errors": []}

    try:
        # Fetch the source page and discover downloadable format links
        html = fetch_page(source_url)
        format_links = find_format_links(html, source_url)

        if not format_links:
            results["errors"].append("No downloadable formats found")
            return results

        results["available_formats"] = list(format_links.keys())

        machine_fmt, machine_url = select_best_format(format_links, MACHINE_PRIORITY)
        human_fmt, human_url = select_best_format(format_links, HUMAN_PRIORITY)

        downloads_to_do = []
        if machine_fmt and machine_url:
            downloads_to_do.append(("machine", machine_fmt, machine_url))
        # When both roles resolve to the same format, one file serves both
        # purposes, so only queue a second download for a different format.
        if human_fmt and human_url and human_fmt != machine_fmt:
            downloads_to_do.append(("human", human_fmt, human_url))

        for purpose, fmt, url in downloads_to_do:
            filename = get_current_month_filename(pattern, fmt)
            dest_path = dest_dir / filename

            if dest_path.exists() and not force:
                results["skipped"].append({"format": fmt, "purpose": purpose, "reason": "already exists"})
                continue

            if dry_run:
                results["downloads"].append({"format": fmt, "purpose": purpose, "status": "would download", "url": url})
                continue

            try:
                size = download_file(url, dest_path)
                size_mb = size / (1024 * 1024)
                results["downloads"].append({
                    "format": fmt,
                    "purpose": purpose,
                    "path": str(dest_path),
                    "size_mb": round(size_mb, 1),
                    "url": url
                })
            except Exception as e:
                # A failed transfer can leave a zero-byte/partial file behind;
                # remove it so the next run doesn't skip with "already exists".
                try:
                    if dest_path.exists():
                        dest_path.unlink()
                except OSError:
                    pass
                results["errors"].append(f"Failed to download {fmt}: {str(e)}")

    except Exception as e:
        results["errors"].append(str(e))

    return results

def log_results(all_results):
    """Append a markdown summary of all per-periodical results to LOG_PATH."""
    stamp = datetime.now().strftime("%Y-%m-%d %H:%M")

    # Build the entry as a list of chunks and join once at the end.
    parts = [f"\n## {stamp}\n\n"]
    for results in all_results:
        parts.append(f"### {results['name']}\n")

        if results.get("available_formats"):
            parts.append(f"Available formats: {', '.join(results['available_formats'])}\n")

        for d in results.get("downloads", []):
            parts.append(f"- ✅ Downloaded {d['format'].upper()} ({d['purpose']}): {d['size_mb']} MB\n")

        for s in results.get("skipped", []):
            parts.append(f"- ⏭️ Skipped {s['format'].upper()} ({s['purpose']}): {s['reason']}\n")

        for e in results.get("errors", []):
            parts.append(f"- ❌ Error: {e}\n")

        parts.append("\n")

    # Append to log
    LOG_PATH.parent.mkdir(parents=True, exist_ok=True)
    with open(LOG_PATH, "a") as f:
        f.write("".join(parts))

def main():
    """Check every configured periodical, print a report, and log results.

    Recognizes --dry-run (report without downloading) and --force
    (re-download existing files) anywhere on the command line.
    """
    dry_run = "--dry-run" in sys.argv
    force = "--force" in sys.argv

    config = load_config()
    base_path = get_base_path(config)

    all_results = []
    total_downloads = 0

    for periodical in config["periodicals"]:
        results = check_periodical(periodical, base_path, dry_run, force)
        all_results.append(results)

        print(f"\n📰 {results['name']}")
        formats = results.get("available_formats")
        if formats:
            print(f"   Formats found: {', '.join(formats)}")

        downloads = results.get("downloads", [])
        for d in downloads:
            print(f"   ✅ {d['format'].upper()} ({d['purpose']}): {d.get('size_mb', '?')} MB")
        total_downloads += len(downloads)

        for s in results.get("skipped", []):
            print(f"   ⏭️ {s['format'].upper()} ({s['purpose']}): {s['reason']}")

        for e in results.get("errors", []):
            print(f"   ❌ {e}")

    # Dry runs are not logged — only real checks go into the history file.
    if not dry_run:
        log_results(all_results)

    if total_downloads > 0:
        print(f"\n📚 Downloaded {total_downloads} file(s) total")

    return all_results

# Run only when executed as a script, so the module can be imported for reuse.
if __name__ == "__main__":
    main()

# TONY-APPROVED: 2026-03-01 | sha:e906e851
