commit ae1c2a65d30b82c95500ff9704b6030b1a31c8aa Author: Spencer Grimes Date: Sun Jan 18 17:08:37 2026 -0600 Initial commit diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..87ddf19 --- /dev/null +++ b/.env.example @@ -0,0 +1,9 @@ +# Discord Bot Token (from Discord Developer Portal) +DISCORD_TOKEN=your_discord_bot_token_here + +# Channel ID to monitor for TTS messages +# Right-click the channel in Discord and copy ID (enable Developer Mode in settings) +TEXT_CHANNEL_ID=123456789012345678 + +# Path to the voice reference WAV file for voice cloning +VOICE_WAV_PATH=./voice.wav diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ebc9ccb --- /dev/null +++ b/.gitignore @@ -0,0 +1,126 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyderworkspace + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ +/venv +# Gemini files +GEMINI.md +PROGRESS.md \ No newline at end of file diff --git a/Estinien.wav b/Estinien.wav new file mode 100644 index 0000000..1b52cc1 Binary files /dev/null and b/Estinien.wav differ diff --git a/Gaius.wav b/Gaius.wav new file mode 100644 index 0000000..9674d9b Binary files /dev/null and b/Gaius.wav differ diff --git a/Gibralter_funny.wav b/Gibralter_funny.wav new file mode 100644 index 0000000..8b7c8d0 Binary files /dev/null and b/Gibralter_funny.wav differ diff --git a/Gibralter_good.wav b/Gibralter_good.wav new file mode 100644 index 0000000..9ebc7ea Binary files /dev/null and b/Gibralter_good.wav differ diff --git a/HankHill.wav b/HankHill.wav new file mode 100644 index 0000000..f2a6fb1 Binary files /dev/null and b/HankHill.wav differ diff --git a/Johnny.wav b/Johnny.wav new file mode 100644 index 0000000..a791073 Binary files /dev/null and b/Johnny.wav differ diff --git a/MasterChief.wav b/MasterChief.wav new file 
mode 100644 index 0000000..cf2830e Binary files /dev/null and b/MasterChief.wav differ diff --git a/README.md b/README.md new file mode 100644 index 0000000..a139090 --- /dev/null +++ b/README.md @@ -0,0 +1,138 @@ +# Pocket TTS Discord Bot + +A Discord bot that reads messages aloud using [Pocket TTS](https://github.com/kyutai-labs/pocket-tts) with voice cloning from a reference WAV file. + +## Features + +- 🎤 **Voice Cloning**: Uses a reference WAV file to clone a voice +- 📝 **Auto-read Messages**: Automatically reads all messages from a configured text channel +- 🔊 **Voice Channel Streaming**: Streams generated audio to the voice channel where the message author is +- 📋 **Message Queue**: Messages are queued and spoken in order + +## Prerequisites + +- Python 3.10+ +- FFmpeg installed and available in PATH +- A Discord bot token +- A reference voice WAV file (3-10 seconds of clear speech recommended) + +## Installation + +1. **Clone the repository**: + ```bash + git clone + cd PocketTTSBot + ``` + +2. **Create a virtual environment**: + ```bash + python -m venv venv + + # Windows + venv\Scripts\activate + + # Linux/macOS + source venv/bin/activate + ``` + +3. **Install dependencies**: + ```bash + pip install -r requirements.txt + ``` + +4. **Install FFmpeg**: + - **Windows**: Download from [ffmpeg.org](https://ffmpeg.org/download.html) and add to PATH + - **Linux**: `sudo apt install ffmpeg` + - **macOS**: `brew install ffmpeg` + +## Configuration + +1. **Create a Discord Bot**: + - Go to [Discord Developer Portal](https://discord.com/developers/applications) + - Create a new application + - Go to the "Bot" section and create a bot + - Copy the bot token + - Enable these Privileged Gateway Intents: + - Message Content Intent + - Server Members Intent (optional) + +2. 
**Invite the Bot to your server**: + - Go to OAuth2 > URL Generator + - Select scopes: `bot` + - Select permissions: `Connect`, `Speak`, `Send Messages`, `Read Message History` + - Use the generated URL to invite the bot + +3. **Get Channel ID**: + - Enable Developer Mode in Discord (Settings > Advanced > Developer Mode) + - Right-click the text channel you want to monitor and click "Copy ID" + +4. **Create `.env` file**: + ```bash + cp .env.example .env + ``` + + Edit `.env` with your values: + ```env + DISCORD_TOKEN=your_bot_token_here + TEXT_CHANNEL_ID=123456789012345678 + VOICE_WAV_PATH=./voice.wav + ``` + +5. **Add a voice reference file**: + - Place a WAV file named `voice.wav` in the project directory + - The file should contain 3-10 seconds of clear speech + - Higher quality audio = better voice cloning results + +## Usage + +1. **Start the bot**: + ```bash + python bot.py + ``` + +2. **Using the bot**: + - Join a voice channel in your Discord server + - Type a message in the configured text channel + - The bot will join your voice channel and read your message aloud + - Messages are queued if the bot is already speaking + +## How It Works + +``` +┌─────────────────┐ ┌──────────────────┐ ┌─────────────────┐ +│ Text Channel │ --> │ Pocket TTS │ --> │ Voice Channel │ +│ (configured) │ │ (generate) │ │ (user's VC) │ +└─────────────────┘ └──────────────────┘ └─────────────────┘ + ▲ + │ + ┌─────┴─────┐ + │ voice.wav │ + │ (speaker) │ + └───────────┘ +``` + +1. Bot monitors the configured text channel for new messages +2. When a message is received, it's added to the queue +3. The bot generates speech using Pocket TTS with the cloned voice +4. 
# --- file: audio_preprocessor.py ---
"""Audio preprocessing utilities for improving voice cloning quality."""

import os
import tempfile
from dataclasses import dataclass

import librosa
import noisereduce as nr
import numpy as np
import soundfile as sf


@dataclass
class PreprocessingConfig:
    """Configuration for audio preprocessing.

    Each flag toggles one preprocessing step; disabling a flag skips
    that step entirely.
    """

    target_sample_rate: int = 22050
    normalize: bool = True
    trim_silence: bool = True
    trim_top_db: int = 20
    reduce_noise: bool = True
    target_length_seconds: float | None = None  # None means keep original length


class AudioPreprocessor:
    """Preprocesses audio files for optimal voice cloning."""

    def __init__(self, config: PreprocessingConfig | None = None):
        self.config = config or PreprocessingConfig()

    def preprocess_file(self, input_path: str, output_path: str | None = None) -> str:
        """
        Preprocess an audio file for voice cloning.

        Args:
            input_path: Path to the input audio file
            output_path: Optional path for the output file. If None, creates a temp file.

        Returns:
            Path to the preprocessed audio file
        """
        print(f"Preprocessing audio: {input_path}")

        # Load audio with librosa (automatically converts to mono and resamples)
        audio, sr = librosa.load(
            input_path,
            sr=self.config.target_sample_rate,
            mono=True
        )
        print(f" Loaded audio: {len(audio) / sr:.2f}s at {sr}Hz")

        # Fixed order: normalize first so the silence threshold sees a
        # consistent level, then trim, denoise, and finally cap the length.
        audio = self._normalize(audio)
        audio = self._trim_silence(audio, sr)
        audio = self._reduce_noise(audio, sr)
        audio = self._limit_length(audio, sr)

        # Ensure we have valid audio
        if len(audio) < sr * 0.5:  # Less than 0.5 seconds
            print(" Warning: Audio is very short after preprocessing!")

        # Save to output path
        if output_path is None:
            fd, output_path = tempfile.mkstemp(suffix=".wav")
            os.close(fd)

        sf.write(output_path, audio, sr, subtype="PCM_16")
        print(f" Saved preprocessed audio: {output_path} ({len(audio) / sr:.2f}s)")

        return output_path

    def preprocess_to_array(self, input_path: str) -> tuple[np.ndarray, int]:
        """
        Preprocess an audio file and return as numpy array.

        Args:
            input_path: Path to the input audio file

        Returns:
            Tuple of (audio array, sample rate)
        """
        # Use temporary file approach for consistency
        temp_path = self.preprocess_file(input_path)
        try:
            audio, sr = librosa.load(temp_path, sr=None, mono=True)
        finally:
            # Fix: delete the temp file even when loading raises, so
            # repeated calls cannot leak files in the temp directory.
            os.unlink(temp_path)
        return audio, sr

    def _normalize(self, audio: np.ndarray) -> np.ndarray:
        """Normalize audio to a consistent volume level."""
        if not self.config.normalize:
            return audio

        max_val = np.max(np.abs(audio))
        if max_val > 0:
            # Normalize to 95% of max to avoid clipping
            audio = audio / max_val * 0.95
            print(" Applied volume normalization")
        return audio

    def _trim_silence(self, audio: np.ndarray, sr: int) -> np.ndarray:
        """Trim silence from the beginning and end of audio."""
        if not self.config.trim_silence:
            return audio

        trimmed, _ = librosa.effects.trim(
            audio,
            top_db=self.config.trim_top_db
        )
        trimmed_duration = len(audio) - len(trimmed)
        if trimmed_duration > 0:
            print(f" Trimmed {trimmed_duration / sr:.2f}s of silence")
        return trimmed

    def _reduce_noise(self, audio: np.ndarray, sr: int) -> np.ndarray:
        """Apply noise reduction to the audio."""
        if not self.config.reduce_noise:
            return audio

        try:
            reduced = nr.reduce_noise(
                y=audio,
                sr=sr,
                stationary=True,
                prop_decrease=0.75
            )
            print(" Applied noise reduction")
            return reduced
        except Exception as e:
            # Best effort: noise reduction is optional polish, never fatal.
            print(f" Warning: Noise reduction failed: {e}")
            return audio

    def _limit_length(self, audio: np.ndarray, sr: int) -> np.ndarray:
        """Limit audio to target length if specified."""
        if self.config.target_length_seconds is None:
            return audio

        max_samples = int(self.config.target_length_seconds * sr)
        if len(audio) > max_samples:
            audio = audio[:max_samples]
            print(f" Trimmed to {self.config.target_length_seconds}s")
        return audio


def analyze_audio(file_path: str) -> dict:
    """
    Analyze an audio file and return its properties.
    Useful for debugging voice cloning issues.
    """
    audio, sr = librosa.load(file_path, sr=None, mono=False)

    is_stereo = audio.ndim > 1
    if is_stereo:
        audio_mono = librosa.to_mono(audio)
    else:
        audio_mono = audio

    duration = len(audio_mono) / sr
    max_amplitude = np.max(np.abs(audio_mono))
    rms = np.sqrt(np.mean(audio_mono**2))

    # Estimate noise level from quietest parts (10th percentile of
    # short-frame RMS values approximates the noise floor).
    frame_length = int(sr * 0.025)
    hop_length = int(sr * 0.010)
    rms_frames = librosa.feature.rms(
        y=audio_mono,
        frame_length=frame_length,
        hop_length=hop_length
    )[0]
    noise_floor = np.percentile(rms_frames, 10)

    return {
        "path": file_path,
        "sample_rate": sr,
        "duration_seconds": duration,
        "is_stereo": is_stereo,
        "max_amplitude": float(max_amplitude),
        "rms_level": float(rms),
        "estimated_noise_floor": float(noise_floor),
        "is_normalized": max_amplitude > 0.8,
        "is_too_short": duration < 3,
        "is_too_long": duration > 30,
        "needs_resampling": sr != 22050,
    }


def print_audio_analysis(file_path: str) -> None:
    """Print a formatted analysis of an audio file."""
    info = analyze_audio(file_path)

    print(f"\n{'=' * 50}")
    print(f"Audio Analysis: {info['path']}")
    print(f"{'=' * 50}")
    print(f" Sample Rate: {info['sample_rate']} Hz {'⚠️ (should be 22050)' if info['needs_resampling'] else '✓'}")
    print(f" Duration: {info['duration_seconds']:.2f}s", end="")
    if info['is_too_short']:
        print(" ⚠️ (too short, aim for 5-15s)")
    elif info['is_too_long']:
        print(" ⚠️ (quite long, 5-15s is ideal)")
    else:
        print(" ✓")
    print(f" Channels: {'Stereo' if info['is_stereo'] else 'Mono'} {'⚠️ (will convert to mono)' if info['is_stereo'] else '✓'}")
    print(f" Max Amplitude: {info['max_amplitude']:.3f} {'✓' if info['is_normalized'] else '⚠️ (low volume)'}")
    print(f" RMS Level: {info['rms_level']:.4f}")
    print(f" Noise Floor: {info['estimated_noise_floor']:.4f}")
    print(f"{'=' * 50}\n")
# --- file: bot.py ---
import asyncio
import io

import discord
from discord.ext import commands

from config import Config
from tts_handler import TTSHandler


class TTSBot(commands.Bot):
    """Discord bot that reads messages aloud using Pocket TTS."""

    def __init__(self):
        intents = discord.Intents.default()
        intents.message_content = True
        intents.voice_states = True
        super().__init__(command_prefix="!", intents=intents)

        self.tts_handler = TTSHandler(Config.VOICE_WAV_PATH)
        # FIFO of (triggering message, text to speak); drained by process_queue.
        self.message_queue: asyncio.Queue[tuple[discord.Message, str]] = asyncio.Queue()

    async def setup_hook(self) -> None:
        """Called when the bot is starting up."""
        print("Initializing TTS...")
        # Model loading is CPU-heavy and blocking; keep it off the event loop.
        await asyncio.to_thread(self.tts_handler.load)
        self.loop.create_task(self.process_queue())

    async def on_ready(self) -> None:
        print(f"Logged in as {self.user}")
        print(f"Monitoring channel ID: {Config.TEXT_CHANNEL_ID}")
        print("Bot is ready!")

    async def on_message(self, message: discord.Message) -> None:
        """Queue non-empty messages from the configured channel for TTS."""
        if message.author.bot:
            return

        if message.channel.id != Config.TEXT_CHANNEL_ID:
            return

        if not message.content.strip():
            return

        # Fix: message.author can be a plain discord.User (webhook/system
        # message, DM) which has no .voice attribute; getattr avoids an
        # AttributeError instead of crashing the event handler.
        voice_state = getattr(message.author, "voice", None)
        if voice_state is None:
            await message.channel.send(
                f"{message.author.mention}, you need to be in a voice channel for me to speak!",
                delete_after=5
            )
            return

        await self.message_queue.put((message, message.content))
        print(f"Queued message from {message.author}: {message.content[:50]}...")

        await self.process_commands(message)

    async def process_queue(self) -> None:
        """Process messages from the queue one at a time."""
        while True:
            message, text = await self.message_queue.get()

            try:
                await self.speak_message(message, text)
            except Exception as e:
                # Never let one bad message kill the consumer task.
                print(f"Error processing message: {e}")
            finally:
                self.message_queue.task_done()

    async def speak_message(self, message: discord.Message, text: str) -> None:
        """Generate TTS and play it in the user's voice channel."""
        # Re-check voice presence: the author may have left voice chat
        # while the message sat in the queue.  Fix: also guard against a
        # voice state whose channel is None.
        voice_state = getattr(message.author, "voice", None)
        if voice_state is None or voice_state.channel is None:
            return

        voice_channel = voice_state.channel

        voice_client = await self.ensure_voice_connection(voice_channel)
        if voice_client is None:
            return

        print(f"Generating TTS for: {text[:50]}...")
        # TTS generation is blocking; run it in a worker thread.
        wav_bytes = await asyncio.to_thread(self.tts_handler.generate_wav_bytes, text)

        audio_source = discord.FFmpegPCMAudio(
            io.BytesIO(wav_bytes),
            pipe=True,
            options="-loglevel panic"
        )

        if voice_client.is_playing():
            voice_client.stop()

        play_complete = asyncio.Event()

        def after_playing(error):
            # Runs on the voice thread: marshal completion back to the loop.
            if error:
                print(f"Playback error: {error}")
            self.loop.call_soon_threadsafe(play_complete.set)

        voice_client.play(audio_source, after=after_playing)
        print(f"Playing audio in {voice_channel.name}")

        # Block queue processing until playback finishes so messages stay in order.
        await play_complete.wait()

    async def ensure_voice_connection(self, channel: discord.VoiceChannel) -> discord.VoiceClient | None:
        """Ensure we're connected to the specified voice channel.

        Reuses or moves an existing connection; returns None on failure.
        """
        guild = channel.guild

        if guild.voice_client is not None:
            if guild.voice_client.channel.id == channel.id:
                return guild.voice_client
            await guild.voice_client.move_to(channel)
            return guild.voice_client

        try:
            voice_client = await channel.connect(timeout=10.0)
            return voice_client
        except Exception as e:
            print(f"Failed to connect to voice channel: {e}")
            return None


def main():
    """Validate configuration, then construct and run the bot."""
    errors = Config.validate()
    if errors:
        print("Configuration errors:")
        for error in errors:
            print(f" - {error}")
        print("\nPlease create a .env file based on .env.example")
        return

    bot = TTSBot()
    bot.run(Config.DISCORD_TOKEN)


if __name__ == "__main__":
    main()


# --- file: config.py ---
import os

from dotenv import load_dotenv

load_dotenv()


def _env_int(name: str, default: int = 0) -> int:
    """Parse an integer environment variable, returning default on bad input."""
    try:
        return int(os.getenv(name, str(default)))
    except ValueError:
        return default


class Config:
    """Process-wide configuration loaded from environment variables (.env supported)."""

    DISCORD_TOKEN: str = os.getenv("DISCORD_TOKEN", "")
    # Fix: a malformed TEXT_CHANNEL_ID previously raised ValueError at import
    # time; it now falls back to 0 so validate() can report it cleanly.
    TEXT_CHANNEL_ID: int = _env_int("TEXT_CHANNEL_ID")
    VOICE_WAV_PATH: str = os.getenv("VOICE_WAV_PATH", "./voice.wav")

    @classmethod
    def validate(cls) -> list[str]:
        """Validate configuration and return list of errors."""
        errors = []
        if not cls.DISCORD_TOKEN:
            errors.append("DISCORD_TOKEN is not set")
        if cls.TEXT_CHANNEL_ID == 0:
            errors.append("TEXT_CHANNEL_ID is not set")
        if not os.path.exists(cls.VOICE_WAV_PATH):
            errors.append(f"Voice WAV file not found: {cls.VOICE_WAV_PATH}")
        return errors
# --- file: tts_handler.py ---
import io
from typing import Any

import numpy as np
import scipy.io.wavfile as wavfile

from pocket_tts import TTSModel

from audio_preprocessor import (
    AudioPreprocessor,
    PreprocessingConfig,
    print_audio_analysis,
)


class TTSHandler:
    """Handles text-to-speech generation using Pocket TTS.

    Lifecycle: construct, call load() once (blocking, heavy), then call
    generate_wav_bytes() per utterance.
    """

    # Discord's opus pipeline resamples to this rate downstream.
    DISCORD_SAMPLE_RATE = 48000

    def __init__(self, voice_wav_path: str, preprocess_audio: bool = True):
        self.voice_wav_path = voice_wav_path
        self.preprocess_audio = preprocess_audio
        self.model: TTSModel | None = None
        self.voice_state: Any = None
        self._preprocessed_path: str | None = None

    def load(self) -> None:
        """Load the TTS model and voice state from the WAV file."""
        print("Loading Pocket TTS model...")
        self.model = TTSModel.load_model()

        voice_path = self._prepare_voice_prompt()

        print(f"Loading voice state from: {voice_path}")
        self.voice_state = self.model.get_state_for_audio_prompt(voice_path)
        print("TTS handler ready!")

    def _prepare_voice_prompt(self) -> str:
        """Return the reference WAV path, preprocessing a copy when enabled."""
        if not self.preprocess_audio:
            return self.voice_wav_path

        print("\nAnalyzing original audio...")
        print_audio_analysis(self.voice_wav_path)

        print("Preprocessing audio for optimal voice cloning...")
        cleaned_path = AudioPreprocessor(
            PreprocessingConfig(
                target_sample_rate=22050,
                normalize=True,
                trim_silence=True,
                trim_top_db=20,
                reduce_noise=True,
                target_length_seconds=15.0,  # Limit to 15 seconds for best results
            )
        ).preprocess_file(self.voice_wav_path)
        self._preprocessed_path = cleaned_path
        print("")
        return cleaned_path

    def generate_wav_bytes(self, text: str) -> bytes:
        """Generate audio and return as WAV file bytes (for FFmpeg)."""
        if self.model is None or self.voice_state is None:
            raise RuntimeError("TTS handler not loaded. Call load() first.")

        samples = self.model.generate_audio(self.voice_state, text).numpy()

        # wavfile.write wants a channel axis; promote mono to one column.
        if samples.ndim == 1:
            samples = samples.reshape(-1, 1)

        # Peak-normalize, then quantize to signed 16-bit PCM.
        peak = np.abs(samples).max()
        if peak > 0:
            samples = samples / peak
        pcm = (samples * 32767).astype(np.int16)

        buf = io.BytesIO()
        wavfile.write(buf, self.model.sample_rate, pcm)
        return buf.getvalue()