#!/usr/bin/env python3
"""
Omi Transcript Processor (Phase 1 - Local BLE Mode)

Processes transcripts from both:
1. Cloud webhook (existing format)
2. Local BLE direct pipeline (new format)

Extracts actionable insights, action items, and summaries.

Usage:
    python3 transcript_processor.py --mode local  # Process local BLE transcripts
    python3 transcript_processor.py --mode cloud  # Process cloud transcripts
    python3 transcript_processor.py --mode both   # Process both sources
"""

import json
import logging
import sys
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, List, Optional, Set

# Module-level logger; basicConfig here makes the script usable standalone.
logger = logging.getLogger(__name__)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(message)s'
)

# Root of the OpenClaw workspace; every data path below lives under it.
WORKSPACE = Path.home() / ".openclaw/workspace"
TRANSCRIPTS_DIR = WORKSPACE / "omi-data/transcripts"  # cloud-webhook transcripts (date-named subfolders)
INSIGHTS_DIR = WORKSPACE / "omi-data/insights"        # insights extracted from cloud transcripts
PROCESSED_LOG = WORKSPACE / "omi-data/processed.json" # persisted set of already-processed transcript IDs

# Phase 1: Local pipeline directories (BLE direct capture lives in a project subtree)
LOCAL_TRANSCRIPTS_DIR = WORKSPACE / "projects/active/omi-direct-pipeline/omi-data/transcripts"
LOCAL_INSIGHTS_DIR = WORKSPACE / "projects/active/omi-direct-pipeline/omi-data/insights"


class TranscriptProcessor:
    """Process transcripts and extract insights.

    Supports two transcript formats:
      * local BLE pipeline: Whisper-style output with a top-level "text" field
      * cloud webhook: a "transcript_segments" list of segment dicts

    Processed transcript IDs are persisted in PROCESSED_LOG so repeated runs
    skip files that were already handled.
    """

    # Keywords for topic detection (substring match, case-insensitive).
    TOPIC_KEYWORDS = [
        "meeting", "appointment", "doctor", "call", "email",
        "buy", "purchase", "fix", "repair", "reminder",
        "tomorrow", "deadline", "important", "urgent",
        "project", "task", "work", "home", "family"
    ]

    # Phrases indicating action items.
    # NOTE(review): "need" is a substring of "need to", so both fire on the
    # same text and produce two entries; kept for backward compatibility.
    ACTION_PHRASES = [
        "need to", "have to", "should", "must", "don't forget",
        "remind me", "remember to", "make sure", "need",
        "follow up", "check", "call", "email", "send"
    ]

    # Phrases for commitments.
    COMMITMENT_PHRASES = [
        "will", "going to", "plan to", "promise to",
        "agreed to", "commit to", "committed"
    ]

    def __init__(self):
        # IDs handled by previous runs; prevents double-processing.
        self.processed_ids = self.load_processed()

    def load_processed(self) -> Set[str]:
        """Load the set of already-processed transcript IDs.

        Returns an empty set when PROCESSED_LOG is missing, unreadable, or
        corrupt (worst case: some transcripts get re-processed).
        """
        if PROCESSED_LOG.exists():
            try:
                with open(PROCESSED_LOG) as f:
                    data = json.load(f)
                return set(data.get("processed_ids", []))
            except (OSError, json.JSONDecodeError) as e:
                # Don't abort the whole run over a bad log file.
                logger.warning(f"Could not read processed log ({e}); starting fresh")
        return set()

    def save_processed(self, transcript_id: str):
        """Mark *transcript_id* as processed and persist the full set to disk."""
        self.processed_ids.add(transcript_id)

        PROCESSED_LOG.parent.mkdir(parents=True, exist_ok=True)
        data = {
            "processed_ids": sorted(self.processed_ids),
            "last_run": datetime.now().isoformat()
        }

        with open(PROCESSED_LOG, 'w') as f:
            json.dump(data, f, indent=2)

    def extract_insights(self, transcript: Dict) -> Dict:
        """Extract topics, action items, commitments and a summary.

        *transcript* may be in either supported format (see class docstring).
        Returns an insights dict; "summary" becomes a stats dict when any
        text was found, otherwise it stays an empty string.
        """
        insights = {
            "id": transcript.get("id", "unknown"),
            # Prefer the pipeline's own timestamps; fall back to "now".
            "timestamp": transcript.get("processed_at") or transcript.get("created_at") or datetime.now().isoformat(),
            "source": transcript.get("source", "unknown"),
            "duration_seconds": transcript.get("duration", 0),
            "language": transcript.get("language", "unknown"),
            "speakers": [],
            "topics": [],
            "action_items": [],
            "commitments": [],
            "summary": ""
        }

        # Flatten the transcript to a single text string.
        text = ""
        if "text" in transcript:
            # Local BLE format: "text" field in Whisper output.
            text = transcript["text"]
        elif "transcript_segments" in transcript:
            # Cloud format: transcript_segments array.
            segments = transcript.get("transcript_segments", [])
            text = " ".join(s.get("text", "") for s in segments)

            # Collect the distinct speaker labels present in the segments.
            speakers = {seg["speaker"] for seg in segments if seg.get("speaker")}
            insights["speakers"] = list(speakers)

        # Analyze text
        if text:
            text_lower = text.lower()

            # Topic detection: any keyword appearing anywhere in the text.
            insights["topics"] = list({kw for kw in self.TOPIC_KEYWORDS if kw in text_lower})

            # Action item detection: flag each trigger phrase that appears.
            for phrase in self.ACTION_PHRASES:
                if phrase in text_lower:
                    insights["action_items"].append({
                        "type": "detected",
                        "phrase": phrase,
                        "note": "Review transcript for specific action items"
                    })

            # Commitment detection (deduplicated; empty list when none found).
            insights["commitments"] = list({p for p in self.COMMITMENT_PHRASES if p in text_lower})

            # Build summary stats. The local format carries no segments, so
            # count the whole text blob as a single segment for reporting.
            segment_count = (
                len(transcript["transcript_segments"])
                if "transcript_segments" in transcript else 1
            )
            insights["summary"] = {
                "word_count": len(text.split()),
                "segment_count": segment_count,
                "has_action_items": len(insights["action_items"]) > 0,
                "has_commitments": len(insights["commitments"]) > 0,
                "topics_detected": len(insights["topics"]),
                "first_100_chars": text[:100]
            }

        return insights

    def _find_unprocessed(self, base_dir: Path, days: int) -> List[Path]:
        """Scan *base_dir*'s YYYY-MM-DD folders for the last *days* days and
        return unprocessed ``*.json`` transcript files, newest path first."""
        unprocessed = []

        if not base_dir.exists():
            return unprocessed

        for offset in range(days):
            date = datetime.now() - timedelta(days=offset)
            date_folder = base_dir / date.strftime("%Y-%m-%d")

            if date_folder.exists():
                for transcript_file in date_folder.glob("*.json"):
                    # The file stem doubles as the transcript ID.
                    if transcript_file.stem not in self.processed_ids:
                        unprocessed.append(transcript_file)

        return sorted(unprocessed, reverse=True)

    def get_local_unprocessed_transcripts(self, days: int = 7) -> List[Path]:
        """Find unprocessed transcripts from the local BLE pipeline."""
        return self._find_unprocessed(LOCAL_TRANSCRIPTS_DIR, days)

    def get_cloud_unprocessed_transcripts(self, days: int = 7) -> List[Path]:
        """Find unprocessed transcripts from the cloud webhook."""
        return self._find_unprocessed(TRANSCRIPTS_DIR, days)

    def process_transcripts(self, mode: str = "both", days: int = 7) -> Dict:
        """Process unprocessed transcripts for *mode* ("local", "cloud", "both").

        Returns a dict with per-source counts, the extracted insights, and
        the paths of any files that failed to parse or process.
        """
        results = {
            "local_processed": 0,
            "cloud_processed": 0,
            "total_insights": [],
            "errors": []
        }

        # Collect files based on mode
        files_to_process = []
        if mode in ["local", "both"]:
            files_to_process.extend(self.get_local_unprocessed_transcripts(days))
        if mode in ["cloud", "both"]:
            files_to_process.extend(self.get_cloud_unprocessed_transcripts(days))

        if not files_to_process:
            logger.info("No new transcripts to process")
            return results

        logger.info(f"Processing {len(files_to_process)} transcripts...")

        for transcript_path in files_to_process:
            try:
                with open(transcript_path) as f:
                    transcript = json.load(f)

                # Extract insights
                insights = self.extract_insights(transcript)
                results["total_insights"].append(insights)

                # Mark as processed
                self.save_processed(transcript_path.stem)

                # Attribute the file to its source by directory ancestry.
                # BUG FIX: the old substring test ("local" in path) never
                # matched either directory layout, so every file was counted
                # as cloud.
                if LOCAL_TRANSCRIPTS_DIR in transcript_path.parents:
                    results["local_processed"] += 1
                else:
                    results["cloud_processed"] += 1

                logger.info(f"✓ Processed: {transcript_path.name}")

                # Flag if contains action items
                if insights["action_items"]:
                    logger.info("  ⚠️ Contains potential action items")

            except Exception as e:
                # Keep going on a bad file; record it for the exit status.
                logger.error(f"✗ Error processing {transcript_path.name}: {e}")
                results["errors"].append(str(transcript_path))

        # Save insights
        if results["total_insights"]:
            self.save_insights(results["total_insights"])
            logger.info(f"\n=== Summary ===")
            logger.info(f"Local: {results['local_processed']} | Cloud: {results['cloud_processed']} | Total: {len(results['total_insights'])}")

        return results

    def save_insights(self, insights: List[Dict]):
        """Save *insights* to both insight directories.

        Each run writes a timestamped snapshot file and also appends to a
        daily rollup file (insights-YYYY-MM-DD.json).
        """
        LOCAL_INSIGHTS_DIR.mkdir(parents=True, exist_ok=True)
        INSIGHTS_DIR.mkdir(parents=True, exist_ok=True)

        today = datetime.now().strftime("%Y-%m-%d")
        timestamp = datetime.now().strftime("%H%M%S")

        # Save to both local and main insights directories
        for insights_dir in [LOCAL_INSIGHTS_DIR, INSIGHTS_DIR]:
            insights_file = insights_dir / f"insights-{today}-{timestamp}.json"
            base_file = insights_dir / f"insights-{today}.json"

            # Merge with whatever was already rolled up today; a corrupt
            # rollup file is rewritten rather than aborting the whole run.
            existing = []
            if base_file.exists():
                try:
                    with open(base_file) as f:
                        existing = json.load(f)
                except (OSError, json.JSONDecodeError) as e:
                    logger.warning(f"Could not read {base_file.name} ({e}); rewriting it")

            existing.extend(insights)

            with open(insights_file, 'w') as f:
                json.dump(insights, f, indent=2)

            with open(base_file, 'w') as f:
                json.dump(existing, f, indent=2)

        logger.info(f"Saved {len(insights)} insights")


def main():
    """CLI entry point: parse arguments, run the processor, report results."""
    import argparse

    parser = argparse.ArgumentParser(description="Omi Transcript Processor")
    parser.add_argument("--mode", choices=["local", "cloud", "both"], default="both",
                        help="Which transcripts to process")
    parser.add_argument("--days", type=int, default=7,
                        help="Look back this many days for new transcripts")
    args = parser.parse_args()

    results = TranscriptProcessor().process_transcripts(mode=args.mode, days=args.days)

    # Human-readable wrap-up for the terminal.
    total = results["local_processed"] + results["cloud_processed"]
    print(f"\n✅ Processed {total} transcripts")
    error_count = len(results["errors"])
    if error_count:
        print(f"⚠️ Errors: {error_count}")

    # Non-zero exit when anything failed, so cron/CI can notice.
    sys.exit(1 if error_count else 0)


if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        # Ctrl-C is a normal way to stop; note it and fall through (exit 0).
        logger.info("Interrupted by user")
    except Exception as e:
        # Last-resort handler so unexpected failures produce a clean log line.
        # NOTE(review): only the message is logged, no traceback —
        # logger.exception(...) would preserve it; confirm before changing.
        logger.error(f"Fatal error: {e}")
        sys.exit(1)
