#!/usr/bin/env python3
"""
generate_mfm_audio.py — TTS generator for Multi-Family Millions
Generates OpenAI TTS audio files for any chapter of the book.

Usage:
  python3 generate_mfm_audio.py --chapter 2          # Generate Ch 2 audio
  python3 generate_mfm_audio.py --chapter 1 --regen  # Regenerate Ch 1
  python3 generate_mfm_audio.py --chapter 2 --dry-run  # Preview chunks only
  python3 generate_mfm_audio.py --chapter 2 --voice shimmer
  python3 generate_mfm_audio.py --list               # Show all chapter info

Fix applied: Proper paragraph-boundary chunking (no mid-sentence cuts).
Previous issue: Ch 1 was generated without clean chunk boundaries; Ch 2 was never generated.

Generated audio uses OpenAI TTS "echo" voice (natural, warm tone).
Cost: ~$0.015 per 1,000 characters (tts-1 pricing: $15 per 1M characters).
"""

import os
import sys
import json
import re
import subprocess
import argparse
from pathlib import Path

try:
    from openai import OpenAI
except ImportError:
    print("ERROR: openai package not installed. Run: pip3 install openai")
    sys.exit(1)

# ─────────────────────────────────────────────────────────────────────────────
# CONFIG
# ─────────────────────────────────────────────────────────────────────────────

# Where the OpenAI API token is stored (read by load_api_key);
# expected shape: {"openai:manual": {"token": "..."}}.
AUTH_PROFILE = os.path.expanduser("~/.openclaw/auth-profiles.json")
# Source book PDF (read via pdftotext) and base folder for generated audio.
PDF_PATH = os.path.expanduser("~/Documents/MF Investing/Multi-Family Millions.pdf")
OUTPUT_BASE = os.path.expanduser("~/Documents/MF Investing")

# Line ranges for each chapter in the extracted PDF text (1-indexed, inclusive;
# get_chapter_text slices lines[start-1:end]).
# Determined by grepping body text (after TOC)
CHAPTER_LINES = {
    1:  (377,   570),
    2:  (571,  1105),
    3:  (1106, 2120),
    4:  (2121, 2754),
    5:  (2755, 3459),
    6:  (3460, 3935),
    7:  (3936, 4532),
    8:  (4533, 5047),
    9:  (5048, 5513),
    10: (5514, 6026),
    11: (6027, 6658),
    12: (6659, 7009),
    13: (7010, 7542),
    14: (7543, 8012),
    15: (8013, 9999),  # Last chapter — runs to end
}

# Output folder mapping per chapter (folders follow the reading-plan weeks;
# several chapters deliberately share a folder).
CHAPTER_FOLDERS = {
    1:  "Phase 1 Week 1 Reading",
    2:  "Phase 1 Week 1 Reading",
    3:  "Phase 1 Week 2 Reading",
    4:  "Phase 1 Week 2 Reading",
    5:  "Phase 2 Week 5 Reading",
    6:  "Phase 2 Week 6 Reading",
    7:  "Phase 2 Week 7 Reading",
    8:  "Phase 2 Week 8 Reading",
    9:  "Phase 3 Week 9 Reading",
    10: "Phase 3 Week 10 Reading",
    11: "Phase 3 Week 11 Reading",
    12: "Phase 3 Week 12 Reading",
    13: "Phase 3 Week 9 Reading",   # Adjust if needed
    14: "Phase 3 Week 10 Reading",  # Adjust if needed
    15: "Phase 3 Week 11 Reading",  # Adjust if needed
}

# Human-readable chapter titles, used for --list output and progress messages.
CHAPTER_TITLES = {
    1:  "A Different Approach that Creates Huge Real Estate Profits",
    2:  "Why Invest in Apartments?",
    3:  "An Overview of How to Get Your First Deal",
    4:  "Where to Find Enormous Profits from Repositioning",
    5:  "How to Attract Deals to You with a Minimum of Time and Money",
    6:  "Separating the Gold Mines from the Land Mines",
    7:  "How to Analyze a Property Using the Fewest Numbers for the Most Profit",
    8:  "Where to Get the Money for All Your Deals",
    9:  "Twelve Negotiating Secrets of the Pros",
    10: "The 80/20 Rule of Rehabbing",
    11: "How to Avoid Being a Landlord: Secrets to Hiring Great Property Managers",
    12: "Reselling for Huge Profits",
    13: "The 10 Biggest Mistakes Repositioners Make, and How to Avoid Them",
    14: "Creating Your Success Team",
    15: "Next Steps on Your Road to Wealth",
}

MAX_CHUNK_CHARS = 4000   # OpenAI TTS limit is 4096; stay safe
DEFAULT_VOICE   = "echo" # Natural, warm voice (per TOOLS.md)
TTS_MODEL       = "tts-1" # Standard quality; use tts-1-hd for higher quality

# ─────────────────────────────────────────────────────────────────────────────
# HELPERS
# ─────────────────────────────────────────────────────────────────────────────

def load_api_key(path=None):
    """Read the OpenAI API token from the openclaw auth-profiles JSON.

    Args:
        path: Optional override for the auth-profiles file location.
              Defaults to ``AUTH_PROFILE``.

    Returns:
        The token string stored under ``profiles["openai:manual"]["token"]``.

    Raises:
        ValueError: If the profile exists but contains no token.
        FileNotFoundError / json.JSONDecodeError: If the file is missing
            or malformed (propagated to the caller).
    """
    profile_path = AUTH_PROFILE if path is None else path
    with open(profile_path, encoding="utf-8") as f:
        profiles = json.load(f)
    key = profiles.get("openai:manual", {}).get("token")
    if not key:
        raise ValueError("No OpenAI token found in auth-profiles.json")
    return key


def extract_pdf_text(pdf_path=None):
    """Extract full text from a PDF using the ``pdftotext`` CLI.

    Args:
        pdf_path: Optional path to a PDF file; defaults to ``PDF_PATH``.

    Returns:
        The extracted plain text (pdftotext writes to stdout via the
        ``-`` output argument).

    Raises:
        subprocess.CalledProcessError: If pdftotext exits non-zero.
        FileNotFoundError: If the pdftotext binary is not installed.
    """
    source = PDF_PATH if pdf_path is None else pdf_path
    result = subprocess.run(
        ["pdftotext", source, "-"],
        capture_output=True, text=True, check=True
    )
    return result.stdout


def get_chapter_text(full_text, chapter_num, line_ranges=None):
    """Extract the text of one chapter using known 1-indexed line ranges.

    Args:
        full_text: The full extracted PDF text.
        chapter_num: Chapter number to extract.
        line_ranges: Optional mapping of chapter -> (start, end) line numbers
            (1-indexed, inclusive). Defaults to ``CHAPTER_LINES``.

    Returns:
        The chapter's lines rejoined with newlines.

    Raises:
        KeyError: If ``chapter_num`` is not in the range mapping.
    """
    ranges = CHAPTER_LINES if line_ranges is None else line_ranges
    lines = full_text.split('\n')
    start, end = ranges[chapter_num]
    # Convert 1-indexed inclusive range to a 0-indexed slice.
    return '\n'.join(lines[start - 1 : end])


def clean_text(text):
    """Normalize raw pdftotext output so it reads smoothly as TTS input.

    Transformations, applied in order:
      - Form feeds (page breaks) become paragraph breaks.
      - Words hyphenated across a line break are rejoined, hyphen dropped
        (e.g. "real-\\nestate" -> "realestate").
      - A lowercase letter at a line end followed by a lowercase line start
        is rejoined with a space (soft-wrapped lines).
      - Lines containing only a 1-3 digit number (page numbers) are blanked.
      - Runs of 3+ newlines collapse to a single blank line.
      - Trailing whitespace is removed from every line and the whole text.
    """
    # Page breaks -> paragraph breaks.
    text = text.replace('\x0c', '\n\n')

    # (pattern, replacement, flags) triples, applied sequentially.
    substitutions = (
        (r'(\w)-\n(\w)', r'\1\2', 0),                 # de-hyphenate line breaks
        (r'([a-z])\n([a-z])', r'\1 \2', 0),           # rejoin soft-wrapped lines
        (r'^\s*\d{1,3}\s*$', '', re.MULTILINE),       # drop lone page numbers
        (r'\n{3,}', '\n\n', 0),                       # collapse blank-line runs
    )
    for pattern, replacement, flags in substitutions:
        text = re.sub(pattern, replacement, text, flags=flags)

    # Trim trailing whitespace per line, then around the whole text.
    trimmed = [line.rstrip() for line in text.split('\n')]
    return '\n'.join(trimmed).strip()


def chunk_text(text, max_chars=None):
    """Split text into TTS-safe chunks at natural boundaries.

    Prefers paragraph boundaries; a paragraph longer than the limit is split
    at sentence boundaries (., !, ?); a single sentence longer than the limit
    is hard-split at word boundaries as a last resort, so no chunk ever
    exceeds ``max_chars`` (the OpenAI TTS API rejects inputs over 4096 chars).

    Args:
        text: Cleaned chapter text, paragraphs separated by blank lines.
        max_chars: Per-chunk character cap; defaults to ``MAX_CHUNK_CHARS``.
            (Resolved lazily via a None sentinel so the module constant is
            only looked up when actually needed.)

    Returns:
        list[str]: Ordered chunk texts.  (Previous docstring incorrectly
        claimed tuples were returned.)
    """
    if max_chars is None:
        max_chars = MAX_CHUNK_CHARS

    paragraphs = re.split(r'\n\n+', text)

    chunks = []
    current_chunk = []
    current_len = 0

    for para in paragraphs:
        para = para.strip()
        if not para:
            continue

        if len(para) > max_chars:
            # Oversized paragraph: flush what we have, then split it down.
            if current_chunk:
                chunks.append('\n\n'.join(current_chunk))
                current_chunk = []
                current_len = 0
            chunks.extend(_split_long_paragraph(para, max_chars))

        elif current_len + len(para) + 2 > max_chars:
            # Adding this paragraph would exceed the limit — flush current.
            if current_chunk:
                chunks.append('\n\n'.join(current_chunk))
            current_chunk = [para]
            current_len = len(para)

        else:
            current_chunk.append(para)
            current_len += len(para) + 2  # +2 for the '\n\n' joiner

    if current_chunk:
        chunks.append('\n\n'.join(current_chunk))

    return chunks


def _split_long_paragraph(para, max_chars):
    """Split one oversized paragraph at sentence boundaries (word-level fallback)."""
    pieces = []
    sub_chunk = []
    sub_len = 0

    for sentence in re.split(r'(?<=[.!?])\s+', para):
        if len(sentence) > max_chars:
            # Pathological: a single sentence over the limit. Flush, then
            # hard-split on word boundaries so the API call can't fail.
            if sub_chunk:
                pieces.append(' '.join(sub_chunk))
                sub_chunk = []
                sub_len = 0
            pieces.extend(_split_long_sentence(sentence, max_chars))
        elif sub_len + len(sentence) + 1 > max_chars and sub_chunk:
            pieces.append(' '.join(sub_chunk))
            sub_chunk = [sentence]
            sub_len = len(sentence)
        else:
            sub_chunk.append(sentence)
            sub_len += len(sentence) + 1

    if sub_chunk:
        pieces.append(' '.join(sub_chunk))
    return pieces


def _split_long_sentence(sentence, max_chars):
    """Hard-split an over-limit sentence at word boundaries (last resort)."""
    pieces = []
    buf = []
    buf_len = 0
    for word in sentence.split():
        extra = len(word) + (1 if buf else 0)  # +1 for the joining space
        if buf and buf_len + extra > max_chars:
            pieces.append(' '.join(buf))
            buf = [word]
            buf_len = len(word)
        else:
            buf.append(word)
            buf_len += extra
    if buf:
        pieces.append(' '.join(buf))
    return pieces


def estimate_cost(text):
    """Return ``(estimated_usd, char_count)`` for synthesizing *text*.

    Pricing basis: tts-1 at $15 per 1M input characters
    (i.e. $0.000015 per character).
    """
    char_count = len(text)
    estimated_usd = char_count * 0.000015
    return estimated_usd, char_count


def generate_chunk_audio(client, text, output_path, voice, model):
    """Generate audio for a single text chunk and save to file.

    Args:
        client: An ``openai.OpenAI`` client instance.
        text: Chunk text to synthesize (must be within the TTS input limit).
        output_path: Destination file path for the MP3 output.
        voice: OpenAI voice name (e.g. "echo").
        model: TTS model name (e.g. "tts-1").
    """
    response = client.audio.speech.create(
        model=model,
        voice=voice,
        input=text,
        response_format="mp3",
    )
    # NOTE(review): stream_to_file is deprecated in newer openai SDK versions
    # in favor of with_streaming_response — confirm the pinned SDK version.
    response.stream_to_file(output_path)


# ─────────────────────────────────────────────────────────────────────────────
# MAIN
# ─────────────────────────────────────────────────────────────────────────────

def main():
    """CLI entry point: parse args, then list chapters or generate one chapter.

    Bug fixed: the skip/progress messages previously printed a literal
    "(unknown)" placeholder instead of the chunk filename.
    """
    parser = argparse.ArgumentParser(
        description="Generate TTS audio for Multi-Family Millions chapters"
    )
    parser.add_argument("--chapter", "-c", type=int, default=2,
                        help="Chapter number to generate (default: 2)")
    parser.add_argument("--voice", "-v", default=DEFAULT_VOICE,
                        help=f"OpenAI voice (default: {DEFAULT_VOICE})")
    parser.add_argument("--model", "-m", default=TTS_MODEL,
                        help=f"TTS model (default: {TTS_MODEL})")
    parser.add_argument("--dry-run", "-n", action="store_true",
                        help="Show chunk plan without generating audio")
    parser.add_argument("--regen", action="store_true",
                        help="Regenerate even if files already exist")
    parser.add_argument("--list", action="store_true",
                        help="List all chapters with status")
    args = parser.parse_args()

    # Extract PDF text (needed for both --list and generation).
    print(f"📖 Extracting text from: {PDF_PATH}")
    full_text = extract_pdf_text()

    if args.list:
        _print_chapter_status(full_text)
        return

    ch_num = args.chapter
    if ch_num not in CHAPTER_LINES:
        print(f"ERROR: Chapter {ch_num} not found. Valid: {sorted(CHAPTER_LINES.keys())}")
        sys.exit(1)

    _generate_chapter(args, full_text, ch_num)


def _print_chapter_status(full_text):
    """Print one status row per chapter: existing files, char count, cost."""
    print("\n📚 Multi-Family Millions — Chapter Status\n")
    for ch_num in sorted(CHAPTER_LINES.keys()):
        folder = CHAPTER_FOLDERS.get(ch_num, "Unknown")
        folder_path = Path(OUTPUT_BASE) / folder
        existing = (list(folder_path.glob(f"MFM_Ch{ch_num}_*.mp3")) +
                    list(folder_path.glob(f"MFM_Ch{ch_num}_*.opus"))) if folder_path.exists() else []
        ch_text = get_chapter_text(full_text, ch_num)
        cleaned = clean_text(ch_text)
        cost, chars = estimate_cost(cleaned)
        status = f"✅ {len(existing)} files" if existing else "❌ Missing"
        print(f"  Ch {ch_num:2d}: {CHAPTER_TITLES[ch_num][:50]:<50}  {status:<20}  {chars:,} chars  ${cost:.3f}")
    print()


def _generate_chapter(args, full_text, ch_num):
    """Chunk one chapter and synthesize (or dry-run preview) its audio files."""
    folder_name = CHAPTER_FOLDERS.get(ch_num, "Phase 1 Week 1 Reading")
    output_dir = Path(OUTPUT_BASE) / folder_name
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"\n📖 Chapter {ch_num}: {CHAPTER_TITLES[ch_num]}")
    print(f"📁 Output: {output_dir}")
    print(f"🎙️  Voice: {args.voice} ({args.model})")

    # Extract, clean, and chunk the chapter text.
    ch_text = get_chapter_text(full_text, ch_num)
    cleaned = clean_text(ch_text)
    chunks = chunk_text(cleaned, MAX_CHUNK_CHARS)
    total_cost, total_chars = estimate_cost(cleaned)

    print(f"\n📊 Stats:")
    print(f"   Total characters: {total_chars:,}")
    print(f"   Chunks: {len(chunks)}")
    print(f"   Estimated cost: ${total_cost:.4f}")
    print(f"   Chunk sizes: {[len(c) for c in chunks]}\n")

    if args.dry_run:
        print("─" * 60)
        for i, chunk in enumerate(chunks, 1):
            preview = chunk[:120].replace('\n', ' ')
            print(f"  Chunk {i:02d} ({len(chunk):,} chars): {preview}...")
        print(f"\n✅ Dry run complete — no audio generated.")
        return

    # Only load credentials once we know audio will actually be generated.
    api_key = load_api_key()
    client = OpenAI(api_key=api_key)

    num_chunks = len(chunks)
    print(f"🎙️  Generating {num_chunks} audio files...\n")

    generated = 0
    skipped = 0

    for i, chunk in enumerate(chunks, 1):
        filename = f"MFM_Ch{ch_num}_Part{i:02d}_of{num_chunks:02d}.mp3"
        output_path = output_dir / filename

        if output_path.exists() and not args.regen:
            # Fixed: previously printed "(unknown)" instead of the filename.
            print(f"  ⏭️  {filename} already exists — skipping")
            skipped += 1
            continue

        # Fixed: previously printed "(unknown)" instead of the filename.
        print(f"  🔊 Generating {filename} ({len(chunk):,} chars)... ", end="", flush=True)

        try:
            generate_chunk_audio(client, chunk, str(output_path), args.voice, args.model)
            size_kb = output_path.stat().st_size // 1024
            print(f"✅ {size_kb} KB")
            generated += 1
        except Exception as e:
            # Best-effort: report the failure and continue with the
            # remaining chunks rather than aborting the whole chapter.
            print(f"❌ ERROR: {e}")

    print(f"\n✅ Done! Generated: {generated}, Skipped: {skipped}")
    print(f"📁 Files in: {output_dir}")

    if generated > 0:
        print(f"\n💡 Tip: Load all Chapter {ch_num} files into AudioPlayer.html")
        print(f"   Use 'Sort by name' to play in order.")


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

# TONY-APPROVED: 2026-03-01 | sha:eb7fcd74
