# Vox/tts_handler.py

import io
import numpy as np
import scipy.io.wavfile as wavfile
from typing import Any
from pocket_tts import TTSModel

from audio_preprocessor import (
    AudioPreprocessor,
    PreprocessingConfig,
    print_audio_analysis,
)


class TTSHandler:
    """Handles text-to-speech generation using Pocket TTS.

    Usage: construct with the path of a reference voice WAV, call
    :meth:`load` once, then call :meth:`generate_wav_bytes` for each
    piece of text to synthesize.
    """

    # Not referenced inside this class; kept as a public constant for callers.
    DISCORD_SAMPLE_RATE = 48000

    def __init__(self, voice_wav_path: str, preprocess_audio: bool = True):
        """Store configuration; nothing is loaded until :meth:`load` runs.

        Args:
            voice_wav_path: Path to the WAV file used as the voice prompt.
            preprocess_audio: When true, :meth:`load` runs the file through
                ``AudioPreprocessor`` before handing it to the model.
        """
        self.voice_wav_path = voice_wav_path
        self.preprocess_audio = preprocess_audio
        self.model: TTSModel | None = None
        self.voice_state: Any = None
        # Path returned by the preprocessor, or None if preprocessing is
        # disabled or load() has not run yet.
        self._preprocessed_path: str | None = None

    def load(self) -> None:
        """Load the TTS model and voice state from the WAV file.

        If preprocessing is enabled, the original file is analyzed (printed
        to stdout) and preprocessed first, and the voice state is built from
        the preprocessed file instead of the original one.
        """
        print("Loading Pocket TTS model...")
        self.model = TTSModel.load_model()

        voice_path = self.voice_wav_path

        # Analyze and preprocess the audio if enabled
        if self.preprocess_audio:
            print("\nAnalyzing original audio...")
            print_audio_analysis(self.voice_wav_path)

            print("Preprocessing audio for optimal voice cloning...")
            config = PreprocessingConfig(
                target_sample_rate=22050,
                normalize=True,
                trim_silence=True,
                trim_top_db=20,
                reduce_noise=True,
                target_length_seconds=15.0,  # Limit to 15 seconds for best results
            )
            preprocessor = AudioPreprocessor(config)
            voice_path = preprocessor.preprocess_file(self.voice_wav_path)
            # Remember where the preprocessed copy lives.
            self._preprocessed_path = voice_path
            print("")

        print(f"Loading voice state from: {voice_path}")
        self.voice_state = self.model.get_state_for_audio_prompt(voice_path)
        print("TTS handler ready!")

    @staticmethod
    def _to_int16_pcm(samples: np.ndarray) -> np.ndarray:
        """Peak-normalize float samples and convert them to int16 PCM.

        A 1-D input is reshaped to a single column. Non-silent audio is
        scaled so its largest absolute sample maps to 32767; all-zero or
        zero-length audio is passed through without scaling.
        """
        if samples.ndim == 1:
            samples = samples.reshape(-1, 1)

        # np.max raises ValueError on a zero-size array, so treat an empty
        # result like silence instead of crashing.
        peak = np.max(np.abs(samples)) if samples.size else 0
        if peak > 0:
            samples = samples / peak
        return (samples * 32767).astype(np.int16)

    def generate_wav_bytes(self, text: str) -> bytes:
        """Generate audio and return as WAV file bytes (for FFmpeg).

        Args:
            text: The text to synthesize.

        Returns:
            A complete in-memory WAV file (16-bit PCM) written at the
            model's sample rate. If the model yields no samples, the WAV
            contains zero frames.

        Raises:
            RuntimeError: If :meth:`load` has not been called successfully.
        """
        if self.model is None or self.voice_state is None:
            raise RuntimeError("TTS handler not loaded. Call load() first.")

        audio = self.model.generate_audio(self.voice_state, text)
        pcm = self._to_int16_pcm(audio.numpy())

        wav_buffer = io.BytesIO()
        wavfile.write(wav_buffer, self.model.sample_rate, pcm)
        # getvalue() returns the whole buffer regardless of stream position.
        return wav_buffer.getvalue()