#!/usr/bin/env python3
"""
Scout generator using Brave Search API.

Writes: etsy/opportunities/YYYY-MM-DD.json

Notes:
- Rate limit friendly: sleeps between requests
- Generates 3–5 opportunities/day (priority 3–5)
- Basic de-dupe: avoids repeating similar themes across last 7 days
"""

import json
import os
import re
import time
from datetime import date, datetime, timedelta
from pathlib import Path
from typing import Dict, List, Tuple
import urllib.parse
import urllib.request

# --- Workspace Resolution (robust) ---
# Root of the agent workspace; overridable via the OPENCLAW_WORKSPACE env var.
WORKSPACE = Path(os.environ.get("OPENCLAW_WORKSPACE", "/Users/tonyclaw/.openclaw/workspace"))

def load_env_file(path: Path) -> None:
    """Minimal .env loader (no external deps). Does not override existing env.

    Parses KEY=VALUE lines, skipping blanks, '#' comments, and lines with no
    '=' sign; strips one layer of single or double quotes from values.

    Best-effort by design: a missing or unreadable file is silently ignored,
    but only I/O and decoding errors are swallowed — genuine parsing bugs
    still surface instead of being hidden by a blanket except.
    """
    try:
        content = path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError):
        # Missing/unreadable .env must not break the run.
        return
    for raw in content.splitlines():
        line = raw.strip()
        if not line or line.startswith("#") or "=" not in line:
            continue
        key, value = line.split("=", 1)
        key = key.strip()
        value = value.strip().strip('"').strip("'")
        # Existing environment variables always win.
        if key and key not in os.environ:
            os.environ[key] = value

# Load .env if present (values never override variables already in the env).
ENV_FILE = WORKSPACE / ".env"
if ENV_FILE.exists():
    load_env_file(ENV_FILE)

# Workspace layout.
ETSY_DIR = WORKSPACE / "etsy"
OPPS_DIR = ETSY_DIR / "opportunities"  # daily output files: YYYY-MM-DD.json
CALENDAR_PATH = ETSY_DIR / "annual-campaign-calendar.md"

MEMORY_DIR = WORKSPACE / "memory"
MEMORY_DIR.mkdir(parents=True, exist_ok=True)  # import-time side effect: log dir must exist

# Debug logging (appended to by dbg()).
DEBUG = True
DEBUG_LOG = MEMORY_DIR / "etsy-scout-generate-debug.log"

# Generation caps and thresholds.
MAX_OPPS_PER_DAY = 5
MIN_PRIORITY = 3
BRAVE_RATE_LIMIT_SECONDS = 1.1  # keep it safe: ~1 req/sec

# Must match your pipeline schema niche enum
NICHES = [
    "USMC/Military",
    "Reformed Christian",
    "Patriotic",
    "Nature/Outdoor",
]

# Quick niche keyword packs (safe + generic; no brands).
# The first keyword of each pack seeds the Brave search query.
NICHE_KEYWORDS = {
    "USMC/Military": ["military svg", "veteran svg", "deployment svg", "patriotic soldier svg"],
    "Reformed Christian": ["christian svg", "reformed svg", "sola fide svg", "empty tomb svg"],
    "Patriotic": ["patriotic svg", "american flag svg", "eagle svg", "liberty svg"],
    "Nature/Outdoor": ["mountain svg", "hiking svg", "camping svg", "deer silhouette svg"],
}

# Anchor words used to verify a generated theme actually fits its niche.
NICHE_ANCHORS = {
    "USMC/Military": ["soldier", "veteran", "marine", "military", "helmet", "boots", "rifle", "dog tag"],
    "Reformed Christian": ["cross", "empty tomb", "tomb", "lily", "lamb", "bible", "chalice", "crown of thorns"],
    "Patriotic": ["flag", "eagle", "liberty", "stars", "stripes", "usa", "america"],
    "Nature/Outdoor": ["mountain", "pine", "deer", "elk", "bear", "camp", "hike", "forest", "lake"],
}

# Blocklist for obvious IP/trademark pitfalls (extend over time).
BLOCK_TERMS = [
    "disney", "marvel", "star wars", "pokemon", "nfl", "nba", "mlb",
    "harley", "nike", "adidas", "lego", "hello kitty", "minecraft",
]

# Default "avoid" guidance attached to every generated opportunity.
DEFAULT_AVOID = [
    "brands", "sports teams", "characters", "logos", "trademark phrases",
    "Bible verse text", "licensed slogans"
]

# Month name/abbreviation -> month number, for parsing calendar dates.
MONTHS = {
    "jan": 1, "january": 1,
    "feb": 2, "february": 2,
    "mar": 3, "march": 3,
    "apr": 4, "april": 4,
    "may": 5,
    "jun": 6, "june": 6,
    "jul": 7, "july": 7,
    "aug": 8, "august": 8,
    "sep": 9, "sept": 9, "september": 9,
    "oct": 10, "october": 10,
    "nov": 11, "november": 11,
    "dec": 12, "december": 12,
}

def _parse_md_month_day(s: str, year: int) -> date | None:
    """
    Parses strings like 'Apr 12', 'April 12', 'Sep 18', 'Oct 31, 2026'.
    Returns a date or None.
    """
    if not s:
        return None
    s = s.strip()
    s = re.sub(r"[,*]", "", s)  # remove commas/asterisks
    parts = s.split()
    if len(parts) < 2:
        return None

    # If year is present at end, use it
    try_year = year
    if parts[-1].isdigit() and len(parts[-1]) == 4:
        try_year = int(parts[-1])
        parts = parts[:-1]

    m = MONTHS.get(parts[0].lower())
    if not m:
        return None

    # day could be '4' or '04'
    try:
        d = int(parts[1])
    except ValueError:
        return None

    return date(try_year, m, d)

def parse_campaign_calendar(calendar_path: str | "Path", year: int) -> list[dict]:
    """
    Extract events from the 'Quick Reference: All 2026 Deadlines' markdown table.
    Expected columns: Event | Date | Upload By | Target | Status
    """
    path = Path(calendar_path)
    if not path.exists():
        return []

    lines = path.read_text(encoding="utf-8").splitlines()

    in_table = False
    events: list[dict] = []

    row_re = re.compile(r"^\|\s*(.*?)\s*\|\s*(.*?)\s*\|\s*(.*?)\s*\|\s*(.*?)\s*\|\s*(.*?)\s*\|\s*$")

    for line in lines:
        # Enter the first big table
        if line.strip().startswith("| Event | Date | Upload By | Target | Status |"):
            in_table = True
            continue
        if in_table and line.strip().startswith("|---"):
            continue

        if in_table:
            m = row_re.match(line.strip())
            if not m:
                # Stop when table ends
                if line.strip() == "" or not line.strip().startswith("|"):
                    break
                continue

            raw_event, raw_date, raw_upload_by, raw_target, raw_status = [x.strip() for x in m.groups()]

            # Clean formatting (remove **bold** and emojis)
            core = "⭐" in raw_event
            event_name = raw_event.replace("⭐", "")
            event_name = re.sub(r"\*\*(.*?)\*\*", r"\1", event_name).strip()

            status = re.sub(r"[^\w\s🔜✅❌-]", "", raw_status).strip()  # keep basic markers

            event_date = _parse_md_month_day(re.sub(r"\*\*", "", raw_date), year)
            upload_by = _parse_md_month_day(re.sub(r"\*\*", "", raw_upload_by), year)

            # target might be bold '**50**'
            target_clean = re.sub(r"[^\d]", "", raw_target)
            target = int(target_clean) if target_clean.isdigit() else 0

            if not event_date or not upload_by:
                continue

            events.append({
                "event": event_name,
                "event_date": event_date,
                "upload_by": upload_by,
                "target": target,
                "status": status,
                "core": core,
            })

    return events

def select_upcoming_events(events: list[dict], today: date, days_ahead: int = 60) -> list[dict]:
    """Return events whose 'Upload By' deadline is within the next
    `days_ahead` days (inclusive of today), excluding events already
    marked MISSED.

    Ordering: core (starred) events first, then by soonest upload deadline.
    """
    in_window = [
        e for e in events
        if "MISSED" not in (e.get("status") or "")
        and 0 <= (e["upload_by"] - today).days <= days_ahead
    ]
    return sorted(in_window, key=lambda e: (not e["core"], e["upload_by"]))

def today_str() -> str:
    """Today's local date as an ISO-8601 string (YYYY-MM-DD)."""
    return f"{date.today():%Y-%m-%d}"

def load_recent_themes(days: int = 7) -> List[str]:
    """Collect 'theme' strings from the last `days` days of opportunity
    files so a new run can avoid repeating them.

    Files not named YYYY-MM-DD.json, or that fail to read/parse, are
    skipped quietly (best-effort).
    """
    if not OPPS_DIR.exists():
        return []

    cutoff = date.today() - timedelta(days=days)
    collected: List[str] = []

    for fp in sorted(OPPS_DIR.glob("*.json")):
        try:
            file_day = datetime.strptime(fp.stem, "%Y-%m-%d").date()
        except Exception:
            continue  # not a dated opportunity file
        if file_day < cutoff:
            continue
        try:
            payload = json.loads(fp.read_text(encoding="utf-8"))
            for opp in payload.get("opportunities", []):
                theme = opp.get("theme")
                if theme:
                    collected.append(theme)
        except Exception:
            pass  # unreadable/corrupt file: skip quietly

    return collected

def _core_theme(s: str) -> str:
    s = (s or "").strip().lower()
    # Strip calendar prefix like: "Memorial Day (Upload by 2026-04-12) "
    if ") " in s:
        s = s.split(") ", 1)[1].strip()
    return s

def too_similar(theme: str, recent_themes: list) -> bool:
    """Decide whether `theme` is effectively a repeat of a recent theme.

    Comparison runs on normalized core themes (event prefix stripped):
    an exact match, or a Jaccard token overlap >= 0.75, counts as similar.
    An empty/blank theme is treated as similar (i.e. rejected).
    """
    def _tokens(text: str) -> set:
        return {w for w in re.split(r"[^a-z0-9]+", text) if w}

    core = _core_theme(theme)
    if not core:
        return True

    prior = [_core_theme(x) for x in (recent_themes or []) if x]
    if not prior:
        return False

    # Exact duplicate of any recent core theme.
    if core in set(prior):
        return True

    core_tokens = _tokens(core)
    if not core_tokens:
        return False

    # Cheap token-overlap heuristic (no external deps).
    for other in prior:
        other_tokens = _tokens(other)
        if not other_tokens:
            continue
        jaccard = len(core_tokens & other_tokens) / max(1, len(core_tokens | other_tokens))
        if jaccard >= 0.75:
            return True

    return False

def contains_block_term(s: str) -> bool:
    """True if `s` (case-insensitive) mentions any blocklisted IP/brand term."""
    lowered = s.lower()
    for term in BLOCK_TERMS:
        if term in lowered:
            return True
    return False

def brave_search(query: str, api_key: str, count: int = 5) -> List[Dict]:
    """Query the Brave Web Search API.

    Returns a list of simplified result dicts with 'title', 'description',
    and 'url' keys. Network/HTTP errors propagate to the caller.
    """
    url = (
        "https://api.search.brave.com/res/v1/web/search"
        f"?q={urllib.parse.quote(query)}&count={count}"
    )
    request = urllib.request.Request(url)
    request.add_header("Accept", "application/json")
    request.add_header("X-Subscription-Token", api_key)

    with urllib.request.urlopen(request, timeout=30) as resp:
        payload = json.loads(resp.read().decode("utf-8"))

    # Brave typically nests hits under web.results.
    hits = (payload.get("web") or {}).get("results") or []
    return [
        {
            "title": hit.get("title", ""),
            "description": hit.get("description", ""),
            "url": hit.get("url", ""),
        }
        for hit in hits
    ]

def extract_theme_from_results(niche: str, results: List[Dict], seasonal_hint: str = "") -> str:
    """Derive a safe, cut-friendly theme from search result titles/snippets.

    Scans results for known safe motif keywords (skipping any result that
    mentions a blocklisted term) and builds a silhouette-style theme from up
    to two motifs. We intentionally avoid copying result phrases verbatim.

    Falls back to a per-niche default theme when signals are weak, or to a
    generic theme for niches outside the known set (previously an unknown
    niche raised KeyError via `fallback[niche]`).
    """
    # Per-niche fallback themes used when result signals are weak.
    fallback = {
        "USMC/Military": f"{seasonal_hint} veteran silhouette badge (no text)".strip(),
        "Reformed Christian": f"{seasonal_hint} empty tomb sunrise silhouette (no text)".strip(),
        "Patriotic": f"{seasonal_hint} bald eagle with flag silhouette (no text)".strip(),
        "Nature/Outdoor": f"{seasonal_hint} mountain line art silhouette (no text)".strip(),
    }

    # Safe motif vocabulary; order determines motif order in the theme.
    motif_vocab = [
        "eagle", "flag", "cross", "tomb", "lily", "mountain", "deer",
        "bear", "camp", "hiking", "veteran", "soldier",
    ]

    for r in results:
        text = f"{r.get('title', '')} {r.get('description', '')}".lower()

        # Skip results that touch trademarked/risky terms.
        if contains_block_term(text):
            continue

        # Up to two distinct motifs, de-duped while preserving vocab order.
        motifs = list(dict.fromkeys(kw for kw in motif_vocab if kw in text))[:2]
        if motifs:
            # Keep it cut-friendly and no-text.
            return f"{seasonal_hint} {' '.join(motifs)} bold silhouette (no text)".strip()

    # No usable signal: per-niche fallback, or a generic theme for unknown
    # niches (fallback.get avoids the KeyError the old fallback[niche] had).
    return fallback.get(niche) or f"{niche} bold silhouette design (no text)"

def pick_seasonal_hint() -> Tuple[int, str, str]:
    """Pick a seasonal focus with light calendar awareness.

    If the calendar file exists, scan it for lines containing a
    'YYYY-MM-DD ... - Name' pattern and choose the nearest event landing
    6-8 weeks out. Otherwise fall back to a generic 'Seasonal' hint.

    Returns:
        (urgency_days, deadline_iso, seasonal_hint)
    """
    today = date.today()
    window_start = today + timedelta(days=42)
    window_end = today + timedelta(days=56)

    if CALENDAR_PATH.exists():
        candidates = []
        for line in CALENDAR_PATH.read_text(encoding="utf-8", errors="ignore").splitlines():
            match = re.search(r"(20\d{2}-\d{2}-\d{2}).*?-\s*(.+)$", line)
            if match is None:
                continue
            try:
                event_day = datetime.strptime(match.group(1), "%Y-%m-%d").date()
            except Exception:
                continue
            if window_start <= event_day <= window_end:
                candidates.append((event_day, match.group(2).strip()))

        if candidates:
            event_day, name = min(candidates, key=lambda c: c[0])
            # Rough rule of thumb: have the design ready ~4 weeks before.
            urgency = (event_day - today).days
            deadline = (event_day - timedelta(days=28)).isoformat()
            return urgency, deadline, name.strip()

    # Fallback: generic seasonal hint with a 3-week deadline.
    return 49, (today + timedelta(days=21)).isoformat(), "Seasonal"

def dbg(msg: str):
    """Append one trimmed line to the debug log; no-op when DEBUG is off."""
    if DEBUG:
        DEBUG_LOG.parent.mkdir(parents=True, exist_ok=True)
        with open(DEBUG_LOG, "a", encoding="utf-8") as fh:
            fh.write(msg.rstrip() + "\n")

def theme_matches_niche(theme: str, niche: str) -> bool:
    """True if the theme mentions at least one anchor keyword for the niche."""
    haystack = (theme or "").lower()
    for anchor in NICHE_ANCHORS.get(niche, []):
        if anchor.lower() in haystack:
            return True
    return False

def main():
    """Generate (or extend) today's opportunity file.

    Flow: resolve the Brave API key -> load last-7-days themes for de-dupe
    -> pick a calendar-driven focus (or generic seasonal fallback) -> run
    one Brave query per niche -> validate/filter candidate themes -> merge
    with any existing file for today (de-duped by id) and write it back.
    """
    api_key = os.environ.get("BRAVE_API_KEY", "").strip()
    use_brave = bool(api_key)
    if not use_brave:
        dbg("BRAVE_API_KEY is not set. Continuing in calendar-only mode.")

    OPPS_DIR.mkdir(parents=True, exist_ok=True)
    out_path = OPPS_DIR / f"{today_str()}.json"

    recent_themes = load_recent_themes(days=7)
    if DEBUG:
        dbg("----")
        dbg(f"Run: {datetime.now().isoformat()}")
        dbg(f"Recent themes loaded: {len(recent_themes)}")

    today = date.today()

    # Calendar-driven focus: prefer events whose Upload By deadline is within
    # 60 days. One active event per run keeps output coherent. If none are
    # in-window (or the calendar is missing), fall back to generic seasonal
    # logic. (pick_seasonal_hint() was previously also called unconditionally
    # before this branch and its result thrown away — removed.)
    events = parse_campaign_calendar(CALENDAR_PATH, year=today.year)
    upcoming_events = select_upcoming_events(events, today=today, days_ahead=60)

    if upcoming_events:
        evt = upcoming_events[0]
        seasonal_hint = f"{evt['event']} (Upload by {evt['upload_by'].isoformat()})"
        deadline = evt["upload_by"].isoformat()
        urgency = (evt["upload_by"] - today).days
    else:
        urgency, deadline, seasonal_hint = pick_seasonal_hint()

    opportunities = []
    opp_index = 1

    for niche in NICHES:
        # One query per niche keeps us within Brave rate limits.
        query = f"{NICHE_KEYWORDS[niche][0]} trending svg ideas {seasonal_hint}"
        results = []

        if use_brave:
            try:
                results = brave_search(query, api_key, count=6)
            except Exception as e:
                # Degrade gracefully: remaining niches run calendar-only.
                dbg(f"Brave search failed. Falling back to calendar-only. Error: {e}")
                use_brave = False
                results = []
            time.sleep(BRAVE_RATE_LIMIT_SECONDS)

        theme = extract_theme_from_results(niche, results, seasonal_hint=seasonal_hint)

        if not theme:
            dbg(f"[{niche}] SKIP: empty theme (results={len(results)})")
            continue

        if not theme_matches_niche(theme, niche):
            dbg(f"[{niche}] SKIP: theme does not match niche anchors: {theme}")
            continue

        if contains_block_term(theme):
            dbg(f"[{niche}] SKIP: blocked term in theme: {theme}")
            continue

        # De-dupe against ALL themes from the last 7 days (previously only
        # recent_themes[:5] was checked, silently weakening the de-dupe).
        if too_similar(theme, recent_themes):
            dbg(f"[{niche}] SKIP: too_similar vs recent themes: {theme}")
            continue

        dbg(f"[{niche}] OK: theme={theme} (results={len(results)})")

        # Priority heuristic: closer deadline = higher priority (4 or 5).
        priority = 5 if urgency <= 56 else 4
        if priority < MIN_PRIORITY:  # defensive guard for future heuristic edits
            dbg(f"[{niche}] SKIP: low priority ({priority})")
            continue

        timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
        opportunities.append({
            "id": f"opp-{timestamp}-{opp_index:03d}",
            "theme": theme,
            "niche": niche,
            "urgency": urgency,
            "deadline": deadline,
            "rationale": f"Seasonal window ({seasonal_hint}) + niche fit ({niche}). Theme is silhouette-friendly and avoids text.",
            "keywords": list(NICHE_KEYWORDS[niche][:3]),
            "avoidList": DEFAULT_AVOID,
            "priority": priority,
        })
        # Count the new theme toward de-dupe for later niches this run.
        recent_themes.append(theme)
        opp_index += 1

        if len(opportunities) >= MAX_OPPS_PER_DAY:
            break

    # --- Merge with any existing file for today, de-duping by id ---
    existing_opps = []
    if out_path.exists():
        try:
            existing_data = json.loads(out_path.read_text(encoding="utf-8"))
            existing_opps = existing_data.get("opportunities", [])
        except Exception:
            existing_opps = []  # corrupt file: start fresh rather than crash

    existing_ids = {o["id"] for o in existing_opps if "id" in o}
    new_unique = [o for o in opportunities if o.get("id") not in existing_ids]
    merged = existing_opps + new_unique

    # --- Write back ---
    data = {
        "date": today_str(),
        "opportunities": merged,
    }
    out_path.write_text(
        json.dumps(data, indent=2, ensure_ascii=False) + "\n",
        encoding="utf-8",
    )

    print(f"Wrote {len(new_unique)} new opportunities (total: {len(merged)}): {out_path}")

if __name__ == "__main__":  # script entry point
    main()
