#!/usr/bin/env python3
"""
voice.py - Voice cloning and TTS for macOS
Three-tier system: macOS native TTS, Coqui XTTS v2 (Docker), ElevenLabs API
"""

import argparse
import json
import os
import subprocess
import sys
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Optional

# Workspace layout: everything lives under ~/.openclaw/workspace/voice-clone
HOME = Path.home()
WORKSPACE = HOME.joinpath(".openclaw", "workspace", "voice-clone")
MODELS_DIR = WORKSPACE.joinpath("models")
SAMPLES_DIR = WORKSPACE.joinpath("samples")
OUTPUT_DIR = WORKSPACE.joinpath("output")
LOGS_DIR = WORKSPACE.joinpath("logs")

# Create the whole tree at import time so later code can assume it exists
for directory in (WORKSPACE, MODELS_DIR, SAMPLES_DIR, OUTPUT_DIR, LOGS_DIR):
    directory.mkdir(parents=True, exist_ok=True)


def log_operation(operation: str, details: Dict):
    """Append one JSON line describing a voice operation to today's audit log.

    Args:
        operation: Short identifier of the operation (e.g. "add_sample").
        details: JSON-serializable mapping with operation-specific fields.

    The entry is appended to ``LOGS_DIR/<YYYY-MM-DD>.log``.
    """
    # Read the clock once so the file name and the entry timestamp always
    # agree, even when the call straddles midnight.
    now = datetime.now()
    log_file = LOGS_DIR / f"{now.strftime('%Y-%m-%d')}.log"
    log_entry = {
        "timestamp": now.isoformat(),
        "operation": operation,
        "details": details
    }
    # Explicit encoding keeps the log format independent of the user's locale
    with open(log_file, 'a', encoding="utf-8") as f:
        f.write(json.dumps(log_entry) + "\n")


def check_command(command: str) -> bool:
    """Return True if *command* resolves to an executable file.

    Args:
        command: Program name looked up on PATH, or a path to an executable.

    Returns:
        True when an executable is found, False otherwise.
    """
    # Look the program up instead of running it with "--version": probing by
    # execution starts an arbitrary binary (which may not understand the flag
    # or may block) and raised PermissionError for a non-executable match.
    import shutil

    return shutil.which(command) is not None


def check_docker() -> bool:
    """Check if Docker is available and running.

    Returns:
        True when the ``docker`` command is available and ``docker ps`` exits
        with status 0; False otherwise.
    """
    if not check_command("docker"):
        return False
    try:
        result = subprocess.run(["docker", "ps"], capture_output=True, check=False)
    except OSError:
        # Only launch failures mean "not available"; a bare except here would
        # also swallow KeyboardInterrupt and SystemExit.
        return False
    return result.returncode == 0


def check_elevenlabs() -> bool:
    """Report whether a non-empty ELEVENLABS_API_KEY is set in the environment"""
    api_key = os.environ.get("ELEVENLABS_API_KEY")
    if api_key:
        return True
    return False


def get_system_voices() -> List[Dict[str, str]]:
    """Get list of macOS system voices by parsing ``say -v ?``.

    Returns:
        One dict per voice with keys "name", "language" and "tier" (always
        "macos"). An empty list is returned when ``say`` cannot be run, exits
        with an error, or its output cannot be decoded.
    """
    try:
        result = subprocess.run(
            ["say", "-v", "?"], capture_output=True, text=True, check=True
        )
    except (OSError, subprocess.SubprocessError, UnicodeDecodeError):
        # Best effort: no usable `say` simply means no system voices.
        return []

    voices = []
    for line in result.stdout.splitlines():
        # Lines look like "Bad News   en_US   # sample sentence". Drop the
        # sample text, then split off the LAST token as the language so that
        # voice names containing spaces stay intact.
        fields = line.split("#", 1)[0].rsplit(None, 1)
        if len(fields) == 2:
            name, lang = fields
            voices.append({"name": name.strip(), "language": lang, "tier": "macos"})
    return voices


def get_cloned_voices() -> List[Dict[str, str]]:
    """Describe every ``*.wav`` sample stored in the samples directory.

    Each entry carries the sample's stem as "name", its path as "file", and
    the fixed tier label "coqui". A missing directory yields an empty list.
    """
    if not SAMPLES_DIR.exists():
        return []
    return [
        {"name": wav.stem, "file": str(wav), "tier": "coqui"}
        for wav in SAMPLES_DIR.glob("*.wav")
    ]


def list_voices():
    """Print an overview of all voice sources: system, cloned, and ElevenLabs"""
    preview_limit = 10  # only this many system voices are printed by name

    print("🎤 Available Voices")
    print("=" * 50)

    # Tier 1: voices reported by the system
    print("\n📱 macOS System Voices (Tier 1)")
    system_voices = get_system_voices()
    total_system = len(system_voices)
    if not system_voices:
        print("  ⚠️  No system voices found")
    else:
        for entry in system_voices[:preview_limit]:
            print(f"  • {entry['name']} ({entry['language']})")
        hidden = total_system - preview_limit
        if hidden > 0:
            print(f"  ... and {hidden} more")
        print(f"\n  Total: {total_system} system voices")

    # Tier 2: samples available for cloning
    print("\n🔬 Cloned Voices (Tier 2)")
    cloned_voices = get_cloned_voices()
    if not cloned_voices:
        print("  No cloned voices yet. Use 'add-sample' to create one.")
    else:
        for entry in cloned_voices:
            print(f"  • {entry['name']}")
        print(f"\n  Total: {len(cloned_voices)} cloned voices")

    # Tier 3: cloud API, usable only when a key is configured
    print("\n☁️  ElevenLabs API (Tier 3)")
    if not check_elevenlabs():
        print("  ⚠️  API key not set (export ELEVENLABS_API_KEY='...')")
    else:
        print("  ✓ API key configured")


def list_samples():
    """Print name, file name, size and modification time of each voice sample"""
    print("📦 Voice Samples")
    print("=" * 50)

    samples = get_cloned_voices()
    if samples:
        for entry in samples:
            sample_path = Path(entry['file'])
            info = sample_path.stat()
            size_kb = info.st_size / 1024
            stamp = datetime.fromtimestamp(info.st_mtime).strftime('%Y-%m-%d %H:%M')
            print(f"\n• {entry['name']}")
            print(f"  File: {sample_path.name}")
            print(f"  Size: {size_kb:.1f} KB")
            print(f"  Modified: {stamp}")
        return

    # Nothing stored yet: explain how to add the first sample
    print("No voice samples found.")
    print("\nAdd a sample with:")
    print("  python3 scripts/voice.py add-sample <name> <audio-file>")


def add_sample(name: str, audio_file: str):
    """Add a voice sample for cloning.

    The sample is stored as ``SAMPLES_DIR/<name>.wav``. WAV input is copied
    as-is; any other format is converted with ffmpeg to 22050 Hz mono WAV.
    An existing sample with the same name is replaced in both cases.

    Args:
        name: Name under which the sample is stored.
        audio_file: Path to the source audio file (``~`` is expanded).

    Exits with status 1 when the source file is missing, when ffmpeg is
    required but unavailable, or when the conversion fails.
    """
    import shutil

    source = Path(audio_file).expanduser().resolve()

    if not source.exists():
        print(f"❌ Error: Audio file not found: {source}")
        sys.exit(1)

    # Check file extension
    if source.suffix.lower() not in ['.wav', '.mp3', '.m4a', '.aiff', '.flac']:
        print(f"⚠️  Warning: {source.suffix} might not be supported. WAV recommended.")

    dest = SAMPLES_DIR / f"{name}.wav"

    if source.suffix.lower() != '.wav':
        # Convert to WAV using ffmpeg
        if not check_command("ffmpeg"):
            print("❌ Error: ffmpeg required for format conversion")
            print("   Install with: brew install ffmpeg")
            sys.exit(1)

        print(f"Converting {source.suffix} to WAV...")
        try:
            # -y: overwrite an existing sample like the copy branch does.
            # Without it ffmpeg asks for confirmation on a prompt hidden by
            # capture_output. -nostdin: never wait for keyboard input.
            subprocess.run([
                "ffmpeg", "-nostdin", "-y", "-i", str(source),
                "-ar", "22050", "-ac", "1", str(dest)
            ], check=True, capture_output=True)
        except subprocess.CalledProcessError as e:
            print(f"❌ Error converting audio: {e}")
            # Surface ffmpeg's own diagnostics; the exit status alone says little
            if e.stderr:
                print(f"   {e.stderr.decode(errors='replace').strip()}")
            sys.exit(1)
    else:
        # Just copy the WAV file
        try:
            shutil.copy2(source, dest)
        except shutil.SameFileError:
            # The source already is the stored sample; nothing to copy
            pass

    log_operation("add_sample", {
        "name": name,
        "source": str(source),
        "destination": str(dest)
    })

    print(f"✅ Voice sample '{name}' added successfully")
    print(f"   Location: {dest}")
    print(f"\n💡 Usage: python3 scripts/voice.py speak \"text\" --clone {name} --output out.wav")


def remove_sample(name: str):
    """Remove a voice sample by moving it to the trash.

    Uses the ``trash`` command when available; otherwise the file is moved
    into ``~/.Trash`` directly.

    Args:
        name: Name of the stored sample (``SAMPLES_DIR/<name>.wav``).

    Exits with status 1 when the sample does not exist or cannot be moved.
    """
    import shutil

    sample_file = SAMPLES_DIR / f"{name}.wav"

    if not sample_file.exists():
        print(f"❌ Error: Voice sample '{name}' not found")
        sys.exit(1)

    try:
        if check_command("trash"):
            subprocess.run(["trash", str(sample_file)], check=True)
        else:
            # Fallback: move to user's Trash
            trash_dir = HOME / ".Trash"
            trash_path = trash_dir / sample_file.name
            if trash_path.exists():
                # Never clobber an item already in the Trash: a plain rename
                # would silently replace it. Add a timestamp to the name.
                stamp = datetime.now().strftime('%Y%m%d-%H%M%S')
                trash_path = trash_dir / f"{sample_file.stem}-{stamp}{sample_file.suffix}"
            # shutil.move also works when the Trash is on another filesystem,
            # where Path.rename fails.
            shutil.move(str(sample_file), str(trash_path))

        log_operation("remove_sample", {"name": name, "file": str(sample_file)})
        print(f"✅ Voice sample '{name}' moved to trash")
    except Exception as e:
        print(f"❌ Error removing sample: {e}")
        sys.exit(1)


def speak_macos(text: str, voice: Optional[str], output: str):
    """Generate speech using macOS say command (Tier 1).

    ``say`` renders to a temporary AIFF file in OUTPUT_DIR. The result is
    moved to *output* when that has an ``.aiff`` suffix, otherwise converted
    with ffmpeg. If ffmpeg is unavailable the AIFF is kept under the
    requested name with its suffix replaced by ``.aiff``.

    Args:
        text: Text to speak.
        voice: System voice name, or None for the default voice.
        output: Destination file path (``~`` is expanded).

    Exits with status 1 when ``say`` or the ffmpeg conversion fails.
    """
    import shutil

    print("🎙️  Generating speech with macOS TTS...")

    output_path = Path(output).expanduser().resolve()
    output_ext = output_path.suffix.lower()

    # A per-invocation temp name: the old fixed "temp.aiff" let two
    # concurrent runs overwrite each other's audio.
    temp_aiff = OUTPUT_DIR / f"temp-{os.getpid()}-{datetime.now().strftime('%Y%m%d%H%M%S%f')}.aiff"

    cmd = ["say", "-o", str(temp_aiff)]
    if voice:
        cmd.extend(["-v", voice])
    cmd.append(text)

    try:
        try:
            subprocess.run(cmd, check=True, capture_output=True)
        except (OSError, subprocess.CalledProcessError) as e:
            # OSError covers a missing `say` binary
            print(f"❌ Error: macOS TTS failed: {e}")
            sys.exit(1)

        if output_ext == '.aiff':
            # shutil.move also works across filesystems, unlike Path.rename
            shutil.move(str(temp_aiff), str(output_path))
        elif not check_command("ffmpeg"):
            print("⚠️  Warning: ffmpeg not found, output will be AIFF format")
            output_path = output_path.with_suffix('.aiff')
            shutil.move(str(temp_aiff), str(output_path))
        else:
            # Convert with ffmpeg
            try:
                subprocess.run([
                    "ffmpeg", "-i", str(temp_aiff), "-y", str(output_path)
                ], check=True, capture_output=True)
            except subprocess.CalledProcessError as e:
                print(f"❌ Error converting audio: {e}")
                sys.exit(1)
    finally:
        # Runs on every path, including sys.exit, so no temp file is leaked
        if temp_aiff.exists():
            temp_aiff.unlink()

    # Every successful path is logged, including the AIFF fallback
    log_operation("speak_macos", {
        "text": text[:100],
        "voice": voice,
        "output": str(output_path)
    })

    print(f"✅ Speech generated: {output_path}")


def speak_coqui(text: str, clone_voice: str, output: str):
    """Generate speech using Coqui XTTS v2 via Docker (Tier 2).

    Args:
        text: Text to speak.
        clone_voice: Name of a stored sample (``SAMPLES_DIR/<name>.wav``)
            used as the speaker reference.
        output: Destination file path (``~`` is expanded); its directory is
            bind-mounted into the container.

    Exits with status 1 when Docker is unavailable, the sample is missing,
    or the container run fails.
    """
    if not check_docker():
        print("❌ Error: Docker is not available or not running")
        print("   Start Docker Desktop or install from: https://docs.docker.com/desktop/install/mac-install/")
        sys.exit(1)

    sample_file = SAMPLES_DIR / f"{clone_voice}.wav"
    if not sample_file.exists():
        print(f"❌ Error: Voice sample '{clone_voice}' not found")
        print("   Add a sample with: python3 scripts/voice.py add-sample <name> <audio-file>")
        sys.exit(1)

    print(f"🔬 Generating speech with Coqui XTTS v2 (voice: {clone_voice})...")
    print("   This may take a moment on first run (downloading model)...")

    output_path = Path(output).expanduser().resolve()
    # Create the output directory ourselves before bind-mounting it, so the
    # mount never depends on Docker creating a missing host path.
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Run Coqui TTS in Docker
    # Mount volumes: models cache, sample file, output directory
    docker_cmd = [
        "docker", "run", "--rm",
        # Select the program explicitly instead of passing "tts" as the first
        # argument: if the image already defines `tts` as its entrypoint, that
        # extra word would reach the CLI as a stray positional argument.
        "--entrypoint", "tts",
        # Forward the host's COQUI_TOS_AGREED value when it is set.
        # NOTE(review): XTTS v2 appears to require license confirmation
        # before its first download - confirm against the Coqui docs.
        "-e", "COQUI_TOS_AGREED",
        "-v", f"{MODELS_DIR}:/root/.local/share/tts",
        "-v", f"{sample_file}:/tmp/speaker.wav:ro",
        "-v", f"{output_path.parent}:/tmp/output",
        "ghcr.io/coqui-ai/tts:latest",
        "--model_name", "tts_models/multilingual/multi-dataset/xtts_v2",
        "--text", text,
        "--speaker_wav", "/tmp/speaker.wav",
        "--language_idx", "en",
        "--out_path", f"/tmp/output/{output_path.name}"
    ]

    try:
        subprocess.run(docker_cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print("❌ Error: Coqui TTS failed")
        print(f"   {e.stderr}")
        sys.exit(1)

    log_operation("speak_coqui", {
        "text": text[:100],
        "clone_voice": clone_voice,
        "output": str(output_path)
    })
    print(f"✅ Speech generated: {output_path}")


def speak_elevenlabs(text: str, output: str):
    """Generate speech using ElevenLabs API (Tier 3).

    Sends *text* to the text-to-speech endpoint with curl and stores the
    response body at *output*.

    Args:
        text: Text to speak.
        output: Destination file path (``~`` is expanded).

    Exits with status 1 when ELEVENLABS_API_KEY is not set, curl cannot be
    run, or the request fails (including HTTP error responses).
    """
    api_key = os.environ.get("ELEVENLABS_API_KEY")
    if not api_key:
        print("❌ Error: ELEVENLABS_API_KEY not set")
        print("   Set with: export ELEVENLABS_API_KEY='your-key'")
        sys.exit(1)

    print("☁️  Generating speech with ElevenLabs API...")
    print("⚠️  Note: This uses cloud processing and may incur costs")

    output_path = Path(output).expanduser().resolve()

    # Use a default voice ID (you can make this configurable)
    voice_id = "21m00Tcm4TlvDq8ikWAM"  # Rachel voice

    curl_cmd = [
        "curl",
        # --fail: exit non-zero on HTTP 4xx/5xx. Without it curl exits 0 and
        # the JSON error body is saved as the "audio" and reported as success.
        "--fail", "--silent", "--show-error",
        # Read extra options (the auth header) from stdin, so the API key
        # never appears in the process argument list.
        "--config", "-",
        "-X", "POST",
        f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}",
        "-H", "Content-Type: application/json",
        "-d", json.dumps({"text": text, "model_id": "eleven_monolingual_v1"}),
        "-o", str(output_path)
    ]
    # curl config values are double-quoted; escape backslashes and quotes
    escaped_key = api_key.replace("\\", "\\\\").replace('"', '\\"')
    curl_config = f'header = "xi-api-key: {escaped_key}"\n'

    try:
        subprocess.run(curl_cmd, input=curl_config, check=True,
                       capture_output=True, text=True)
    except (OSError, subprocess.CalledProcessError) as e:
        print(f"❌ Error: ElevenLabs API call failed: {e}")
        # e.g. "curl: (22) The requested URL returned error: 401"
        stderr = getattr(e, "stderr", None)
        if stderr:
            print(f"   {stderr.strip()}")
        sys.exit(1)

    log_operation("speak_elevenlabs", {
        "text": text[:100],
        "output": str(output_path)
    })
    print(f"✅ Speech generated: {output_path}")


def speak(text: str, voice: Optional[str], clone: Optional[str],
          elevenlabs: bool, output: str):
    """Dispatch a speech request to one tier.

    Precedence: ElevenLabs when requested, then a cloned voice when one is
    named, otherwise the macOS system voice.
    """
    if elevenlabs:
        # Tier 3: cloud API
        speak_elevenlabs(text, output)
        return

    if clone:
        # Tier 2: Coqui voice cloning
        speak_coqui(text, clone, output)
        return

    # Tier 1: macOS native TTS
    speak_macos(text, voice, output)


def main():
    """Parse the command line and run the selected sub-command.

    Prints the help text and exits with status 1 when no sub-command is given.
    """
    usage_examples = """
Examples:
  # List available voices
  %(prog)s voices
  
  # Basic TTS with system voice
  %(prog)s speak "Hello world" --voice Alex --output hello.wav
  
  # Add voice sample for cloning
  %(prog)s add-sample my-voice ~/audio/sample.wav
  
  # Generate speech with cloned voice
  %(prog)s speak "Hello" --clone my-voice --output out.wav
  
  # List voice samples
  %(prog)s samples
  
  # Remove voice sample
  %(prog)s remove-sample my-voice
        """
    parser = argparse.ArgumentParser(
        description="Voice cloning and TTS for macOS",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )
    commands = parser.add_subparsers(dest="command", help="Command to run")

    # Listing commands take no arguments
    commands.add_parser("voices", help="List available voices")
    commands.add_parser("samples", help="List voice samples")

    # speak: text plus tier selection and the mandatory output path
    speak_cmd = commands.add_parser("speak", help="Generate speech")
    speak_cmd.add_argument("text", help="Text to convert to speech")
    speak_cmd.add_argument("--voice", help="System voice name (Tier 1)")
    speak_cmd.add_argument("--clone", help="Cloned voice name (Tier 2)")
    speak_cmd.add_argument("--elevenlabs", action="store_true",
                           help="Use ElevenLabs API (Tier 3)")
    speak_cmd.add_argument("--output", required=True,
                           help="Output file path (WAV, MP3, OGG)")

    # add-sample: name and source audio file
    add_cmd = commands.add_parser("add-sample", help="Add voice sample for cloning")
    add_cmd.add_argument("name", help="Name for the voice sample")
    add_cmd.add_argument("audio_file", help="Path to audio file (6-30 seconds)")

    # remove-sample: name only
    remove_cmd = commands.add_parser("remove-sample", help="Remove voice sample")
    remove_cmd.add_argument("name", help="Name of voice sample to remove")

    args = parser.parse_args()

    if not args.command:
        parser.print_help()
        sys.exit(1)

    # One entry per sub-command; lambdas defer attribute access to the
    # chosen command, since each sub-parser defines different attributes.
    handlers = {
        "voices": lambda: list_voices(),
        "samples": lambda: list_samples(),
        "speak": lambda: speak(args.text, args.voice, args.clone,
                               args.elevenlabs, args.output),
        "add-sample": lambda: add_sample(args.name, args.audio_file),
        "remove-sample": lambda: remove_sample(args.name),
    }
    handler = handlers.get(args.command)
    if handler is not None:
        handler()


# Run the CLI only when this file is executed as a script, not when imported
if __name__ == "__main__":
    main()
