#!/usr/bin/env python3
"""
Voice Reference Clip Extractor

Give it a YouTube URL (or any audio/video file) and a speaker name.
It downloads the audio, transcribes it, finds the cleanest solo segments,
and extracts a ready-to-use reference clip.

Usage:
    python extract_reference.py --url "https://youtube.com/watch?v=..." --name mccoy
    python extract_reference.py --file interview.mp3 --name morgan_freeman
    python extract_reference.py --url "https://..." --name mccoy --samples 5

Requirements:
    curl                (used to call the OpenAI Whisper transcription API)
    yt-dlp              (for YouTube downloads; only needed with --url)
    ffmpeg, ffprobe     (for audio extraction/trimming and duration lookup)

Set OPENAI_API_KEY env variable for transcription.
"""

import argparse
import json
import os
import re
import subprocess
import sys
import tempfile

# Saved reference clips go into a "voices" folder located next to this script.
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
VOICES_DIR = os.path.join(_SCRIPT_DIR, "voices")


def run(cmd, check=True):
    """Run a command and return its stripped standard output.

    Args:
        cmd: Either a shell command line (``str``), which is executed through
            the shell, or a sequence of arguments (e.g. a ``list``), which is
            executed directly without a shell so no quoting is required.
        check: When true, a non-zero exit status prints the command and its
            stderr to standard error and terminates the process with exit
            code 1.

    Returns:
        The command's stdout with surrounding whitespace removed.
    """
    # Only plain strings go through the shell. Passing an argument list with
    # shell=True would hand just the first element to the shell as the command.
    use_shell = isinstance(cmd, str)
    result = subprocess.run(cmd, shell=use_shell, capture_output=True, text=True)
    if check and result.returncode != 0:
        print(f"Command failed: {cmd}", file=sys.stderr)
        print(result.stderr, file=sys.stderr)
        sys.exit(1)
    return result.stdout.strip()


def download_audio(url, output_path):
    """Download audio from YouTube or any URL via yt-dlp.

    Args:
        url: Source URL handed to yt-dlp.
        output_path: Desired output file path (passed to yt-dlp's ``-o``).

    Returns:
        ``output_path`` if that file exists after the download; otherwise the
        same path with its extension replaced by ``.wav`` if that file exists;
        otherwise ``output_path`` unchanged.
    """
    import shlex

    print(f"Downloading audio from: {url}")
    # The command is executed through the shell, so both values are quoted with
    # shlex.quote: that keeps quotes, "$" and backticks in a URL or path literal.
    # "--" ends option parsing so a URL starting with "-" is not read as a flag.
    run(
        "yt-dlp -x --audio-format wav --audio-quality 0 "
        f"-o {shlex.quote(output_path)} -- {shlex.quote(url)}"
    )
    if os.path.exists(output_path):
        return output_path
    # yt-dlp may have written the file with a .wav extension instead.
    # splitext only touches the final path component, so a dot in a directory
    # name cannot truncate the path.
    wav_path = os.path.splitext(output_path)[0] + ".wav"
    if os.path.exists(wav_path):
        return wav_path
    return output_path


def get_duration(audio_path):
    """Get audio duration in seconds.

    Args:
        audio_path: Path of the media file to probe with ffprobe.

    Returns:
        The container duration reported by ffprobe, as a float.

    Raises:
        ValueError: If ffprobe's output cannot be parsed as a number.
    """
    import shlex

    # The command goes through the shell, so the path is quoted to survive
    # spaces, quotes and "$" in file names.
    out = run(
        f"ffprobe -i {shlex.quote(audio_path)} "
        '-show_entries format=duration -v quiet -of csv="p=0"'
    )
    try:
        return float(out)
    except ValueError:
        raise ValueError(
            f"ffprobe did not report a numeric duration for {audio_path!r} "
            f"(output: {out!r})"
        ) from None


def extract_segment(source, start_sec, duration_sec, output_path):
    """Extract a segment from an audio file with ffmpeg.

    The output is resampled to 24 kHz mono (``-ar 24000 -ac 1``) and any
    existing file at ``output_path`` is overwritten (``-y``).

    Args:
        source: Path of the input media file.
        start_sec: Start position passed to ffmpeg's ``-ss``.
        duration_sec: Clip length passed to ffmpeg's ``-t``.
        output_path: Path of the file to write.

    Returns:
        ``output_path``.
    """
    import shlex

    # Every interpolated value is shell-quoted because the command is executed
    # through the shell; paths with spaces, quotes or "$" would otherwise break
    # the command or be interpreted by the shell.
    run(
        f"ffmpeg -y -ss {shlex.quote(str(start_sec))} "
        f"-t {shlex.quote(str(duration_sec))} "
        f"-i {shlex.quote(source)} -ar 24000 -ac 1 {shlex.quote(output_path)}"
    )
    return output_path


def transcribe_segment(audio_path, api_key):
    """Transcribe audio using the OpenAI Whisper API.

    The file is uploaded with ``curl`` to the ``/v1/audio/transcriptions``
    endpoint using model ``whisper-1`` and ``response_format=verbose_json``.

    Args:
        audio_path: Path of the audio file to upload.
        api_key: OpenAI API key sent as a bearer token.

    Returns:
        The decoded JSON response as a dict. When the response is not valid
        JSON or is not a JSON object, ``{"text": "", "segments": []}`` is
        returned instead. An error payload from the API is returned as-is
        after a warning is printed to standard error.
    """
    empty = {"text": "", "segments": []}

    # Use curl since it handles multipart reliably.
    # NOTE(review): the API key is passed as a curl argument, so it is visible
    # in the local process list while the request runs.
    result = subprocess.run(
        ["curl", "-s", "https://api.openai.com/v1/audio/transcriptions",
         "-H", f"Authorization: Bearer {api_key}",
         "-F", f"file=@{audio_path}",
         "-F", "model=whisper-1",
         "-F", "response_format=verbose_json"],
        capture_output=True, text=True
    )
    try:
        data = json.loads(result.stdout)
    except json.JSONDecodeError:
        # Typically a transport failure: curl -s prints nothing on error.
        print(
            f"Warning: transcription failed for {audio_path} "
            f"(curl exit code {result.returncode})",
            file=sys.stderr,
        )
        return empty

    # Callers use dict access on the result; guard against other JSON values.
    if not isinstance(data, dict):
        return empty

    if "error" in data:
        # Surface API failures (bad key, quota, ...) instead of letting them
        # look like a silent segment.
        err = data["error"]
        message = err.get("message", err) if isinstance(err, dict) else err
        print(f"Warning: transcription API error: {message}", file=sys.stderr)
    return data


def score_segment(transcript_data, segment_audio_path):
    """Rate a transcribed segment's suitability as a reference clip (0-100).

    The rating starts at 50 and is adjusted using only the transcript:

    - word count (too few words suggests silence),
    - number of transcript segments (fewer means more continuous speech),
    - presence of music/applause/laughter markers,
    - number of question marks and full stops.

    ``segment_audio_path`` is accepted but not read by this function.
    """
    transcript_text = transcript_data.get("text", "")
    chunks = transcript_data.get("segments", [])
    n_words = len(transcript_text.split())

    # Word count: 25-90 words scores best; 10-14 words leaves the score as-is.
    if n_words < 10:
        word_delta = -20  # mostly silence
    elif n_words < 15:
        word_delta = 0
    elif n_words < 25:
        word_delta = 10
    elif n_words <= 90:
        word_delta = 20
    else:
        word_delta = 10  # fast but ok

    # Continuity: only judged when the transcript carries segment data.
    continuity_delta = 0
    if chunks:
        n_chunks = len(chunks)
        if n_chunks <= 3:
            continuity_delta = 15  # nice continuous speech
        elif n_chunks <= 6:
            continuity_delta = 5
        else:
            continuity_delta = -5  # choppy, multiple speakers likely

    # Each distinct non-speech marker found costs 15 points.
    lowered = transcript_text.lower()
    markers = ("[music]", "[applause]", "[laughter]", "♪", "🎵")
    noise_delta = -15 * sum(1 for marker in markers if marker in lowered)

    # Several questions hint at an interviewer rather than the subject.
    question_delta = -10 if transcript_text.count("?") >= 2 else 0

    # Several full stops hint at declarative statements.
    statement_delta = 10 if transcript_text.count(".") >= 2 else 0

    total = (
        50
        + word_delta
        + continuity_delta
        + noise_delta
        + question_delta
        + statement_delta
    )
    return min(100, max(0, total))


def find_best_segments(source_audio, api_key, num_samples=8, segment_duration=22):
    """Sample segments across the audio and score each one.

    Args:
        source_audio: Path of the audio file to scan.
        api_key: OpenAI API key used for transcription.
        num_samples: Number of evenly spaced sample positions (at least 1).
        segment_duration: Length in seconds of each sampled segment.

    Returns:
        A list of candidate dicts with the keys ``index``, ``offset``,
        ``duration``, ``text``, ``score`` and ``path``, in sampling order
        (not sorted by score). For audio shorter than 30 seconds a single
        candidate covering the whole file is returned, with ``path`` set to
        ``source_audio``. Otherwise ``path`` points at an extracted segment
        inside a temporary directory that is left in place for the caller.

    Raises:
        ValueError: If ``num_samples`` is less than 1.
    """
    if num_samples < 1:
        raise ValueError(f"num_samples must be at least 1, got {num_samples}")

    total_duration = get_duration(source_audio)
    print(f"Total audio: {total_duration:.0f}s ({total_duration/60:.1f} min)")

    if total_duration < 30:
        print("Audio is short enough to use directly as reference.")
        # Same dict shape as the sampled candidates so callers can treat both
        # cases uniformly.
        return [{
            "index": 0,
            "offset": 0,
            "duration": total_duration,
            "text": "Full clip",
            "score": 100,
            "path": source_audio,
        }]

    # Sample evenly across the audio, avoiding first/last 10s (intros/outros)
    start = 10
    end = total_duration - 10
    interval = (end - start) / num_samples

    # Latest start position that still yields a full-length clip; on short
    # files the evenly spaced offsets would otherwise run past the end.
    latest_offset = max(0.0, total_duration - segment_duration)

    candidates = []
    seen_offsets = set()
    tmpdir = tempfile.mkdtemp()

    print(f"Scanning {num_samples} segments for clean solo speech...\n")

    for i in range(num_samples):
        offset = min(start + (i * interval), latest_offset)
        # Clamping can collapse several samples onto one position; transcribe
        # each distinct position only once.
        if offset in seen_offsets:
            continue
        seen_offsets.add(offset)

        seg_path = os.path.join(tmpdir, f"seg_{i}.wav")
        extract_segment(source_audio, offset, segment_duration, seg_path)

        # Transcribe
        transcript = transcribe_segment(seg_path, api_key)
        text = transcript.get("text", "").strip()

        # Score
        quality = score_segment(transcript, seg_path)

        candidates.append({
            "index": i,
            "offset": offset,
            "duration": segment_duration,
            "text": text,
            "score": quality,
            "path": seg_path,
        })

        timestamp = f"{int(offset//60)}:{int(offset%60):02d}"
        icon = "🟢" if quality >= 70 else "🟡" if quality >= 50 else "🔴"
        print(f"  {icon} [{timestamp}] Score: {quality}  \"{text[:70]}{'...' if len(text) > 70 else ''}\"")

    return candidates


def main():
    """Command-line entry point.

    Parses the arguments, obtains the source audio (downloading it when
    ``--url`` is given), scores sampled segments, and writes the top-scoring
    clips into ``VOICES_DIR``. Exits with status 1 on missing prerequisites
    or when no clip could be saved; invalid numeric options are reported
    through argparse.
    """
    import shutil

    parser = argparse.ArgumentParser(
        description="Extract a voice reference clip from YouTube or audio file",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python extract_reference.py --url "https://youtube.com/watch?v=abc" --name mccoy
  python extract_reference.py --file podcast.mp3 --name narrator
  python extract_reference.py --url "https://..." --name voice --samples 12 --top 3
        """
    )
    parser.add_argument("--url", "-u", help="YouTube or video URL to download")
    parser.add_argument("--file", "-f", help="Local audio/video file")
    parser.add_argument("--name", "-n", required=True, help="Voice name (used for output filename)")
    parser.add_argument("--samples", "-s", type=int, default=8,
                       help="Number of segments to sample (default: 8)")
    parser.add_argument("--top", "-t", type=int, default=1,
                       help="Save top N candidates (default: 1 best)")
    parser.add_argument("--duration", "-d", type=int, default=22,
                       help="Segment duration in seconds (default: 22)")

    args = parser.parse_args()

    # Reject zero/negative counts up front; they would otherwise cause a
    # division by zero while sampling or silently save nothing.
    for flag, value in (("--samples", args.samples),
                        ("--top", args.top),
                        ("--duration", args.duration)):
        if value < 1:
            parser.error(f"{flag} must be at least 1")

    # Check API key
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        print("ERROR: Set OPENAI_API_KEY environment variable for transcription.")
        print("Get one at https://platform.openai.com/api-keys")
        sys.exit(1)

    # Check tools (shutil.which works without an external "which" binary)
    for tool in ["ffmpeg", "ffprobe"]:
        if shutil.which(tool) is None:
            print(f"ERROR: {tool} not found. Install ffmpeg.")
            sys.exit(1)

    download_dir = None
    try:
        # Get source audio
        if args.url:
            if shutil.which("yt-dlp") is None:
                print("ERROR: yt-dlp not found. Install: pip install yt-dlp")
                sys.exit(1)
            download_dir = tempfile.mkdtemp()
            source = download_audio(args.url, os.path.join(download_dir, "source.wav"))
        elif args.file:
            if not os.path.exists(args.file):
                print(f"ERROR: File not found: {args.file}")
                sys.exit(1)
            source = args.file
        else:
            print("Provide --url or --file")
            sys.exit(1)

        # Find best segments
        candidates = find_best_segments(source, api_key, args.samples, args.duration)

        # Sort by score
        candidates.sort(key=lambda x: x["score"], reverse=True)

        selected = candidates[:args.top]
        if not selected:
            print("ERROR: No candidate segments were found.")
            sys.exit(1)

        # Save top N
        os.makedirs(VOICES_DIR, exist_ok=True)
        print(f"\n{'='*60}")

        for i, seg in enumerate(selected):
            if args.top == 1:
                out_name = f"{args.name}_reference.wav"
            else:
                out_name = f"{args.name}_reference_{i+1}.wav"

            out_path = os.path.join(VOICES_DIR, out_name)
            # Re-extract from the source so the saved clip does not depend on
            # any intermediate files.
            extract_segment(source, seg["offset"], seg["duration"], out_path)

            timestamp = f"{int(seg['offset']//60)}:{int(seg['offset']%60):02d}"
            print(f"\n✅ Saved: voices/{out_name}")
            print(f"   Score: {seg['score']}/100")
            print(f"   From:  {timestamp}")
            print(f"   Text:  \"{seg['text'][:100]}{'...' if len(seg['text']) > 100 else ''}\"")

        print(f"\n{'='*60}")
        print("\nReference clip ready! Generate speech with:")
        best_name = f"{args.name}_reference.wav" if args.top == 1 else f"{args.name}_reference_1.wav"
        print(f"  python gen_voice.py --voice voices/{best_name} \"Your text here\"\n")
    finally:
        # The downloaded source can be large; remove it on every exit path,
        # including sys.exit() calls above.
        if download_dir is not None:
            shutil.rmtree(download_dir, ignore_errors=True)


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
