#!/usr/bin/env python3
"""
Aggregator — Combine signals from all sources and rank by relevance.
Deduplicates similar titles and picks top 10 signals across niches.
"""

import re
from typing import List, Dict
from difflib import SequenceMatcher


def deduplicate_signals(signals: List[Dict], threshold: float = 0.75) -> List[Dict]:
    """
    Remove duplicate or very similar signals.

    Titles are compared case-insensitively with difflib's SequenceMatcher;
    any pair whose similarity ratio exceeds ``threshold`` counts as a
    duplicate.  When a duplicate is dropped, the kept signal's engagement
    is raised to the higher of the two values (the kept dict is mutated
    in place).

    Args:
        signals: Signal dicts; each must have a "title" key and may have
            an "engagement" key (treated as 0 when absent).
        threshold: Similarity ratio above which two titles are considered
            the same signal.  Defaults to 0.75 (the original behavior).

    Returns:
        A new list holding the first occurrence of each unique signal,
        in original order.
    """
    unique: List[Dict] = []
    for signal in signals:
        # Hoisted: the current title is invariant across the inner loop.
        title = signal["title"].lower()
        is_dup = False
        for existing in unique:
            similarity = SequenceMatcher(
                None,
                title,
                existing["title"].lower()
            ).ratio()
            if similarity > threshold:
                is_dup = True
                # Keep the best engagement seen across the duplicate group.
                if signal.get("engagement", 0) > existing.get("engagement", 0):
                    existing["engagement"] = signal["engagement"]
                break

        if not is_dup:
            unique.append(signal)

    return unique


def rank_signals(signals: List[Dict]) -> List[Dict]:
    """
    Score each signal and return them ordered from highest to lowest.

    The score is the signal's engagement (floored at 1) multiplied by a
    per-source credibility weight.  Each dict gains a "score" key as a
    side effect; the input list itself is not reordered.
    """
    # Credibility multiplier per source; unrecognized sources get 0.5.
    weights = {
        "reddit": 1.0,      # Reddit engagement scores are reliable
        "brave": 0.7,       # Brave results ranked by algorithm
        "youtube": 0.6      # YouTube RSS doesn't expose view counts
    }

    for item in signals:
        weight = weights.get(item.get("source", ""), 0.5)
        # Floor at 1 so zero-engagement signals still receive a score.
        base = item.get("engagement", 0)
        if base < 1:
            base = 1
        item["score"] = base * weight

    return sorted(signals, key=lambda s: s["score"], reverse=True)


def aggregate_signals(
    reddit_signals: List[Dict],
    brave_signals: List[Dict],
    youtube_signals: List[Dict]
) -> List[Dict]:
    """
    Merge signals from every source into one ranked shortlist.

    Near-duplicate titles are collapsed first, the survivors are scored
    by engagement and source credibility, and the ten highest-scoring
    signals are returned.
    """
    combined = [*reddit_signals, *brave_signals, *youtube_signals]

    # Collapse near-duplicates, then score and keep the top 10.
    deduped = deduplicate_signals(combined)
    return rank_signals(deduped)[:10]


if __name__ == "__main__":
    # Smoke test: one mock signal per source, then rank and print them.
    test_signals = [
        {"source": "reddit", "niche": "USMC / Military",
         "title": "Marine Corps recruitment trending", "engagement": 150},
        {"source": "brave", "niche": "Reformed Christian",
         "title": "Reformed theology gaining popularity", "engagement": 0},
        {"source": "youtube", "niche": "Print on Demand",
         "title": "POD design tutorial", "engagement": 0},
    ]

    print("Ranked signals:")
    for sig in rank_signals(test_signals):
        print(f"  {sig['niche']}: {sig['title']} (score: {sig['score']:.1f})")
