Initial commit

2026-01-18 17:08:37 -06:00
commit ae1c2a65d3
28 changed files with 719 additions and 0 deletions

9
.env.example Normal file

@@ -0,0 +1,9 @@
# Discord Bot Token (from Discord Developer Portal)
DISCORD_TOKEN=your_discord_bot_token_here
# Channel ID to monitor for TTS messages
# Right-click the channel in Discord and copy ID (enable Developer Mode in settings)
TEXT_CHANNEL_ID=123456789012345678
# Path to the voice reference WAV file for voice cloning
VOICE_WAV_PATH=./voice.wav

126
.gitignore vendored Normal file

@@ -0,0 +1,126 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyderworkspace
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
/venv
# Gemini files
GEMINI.md
PROGRESS.md

BIN
Estinien.wav Normal file

Binary file not shown.

BIN
Gaius.wav Normal file

Binary file not shown.

BIN
Gibralter_funny.wav Normal file

Binary file not shown.

BIN
Gibralter_good.wav Normal file

Binary file not shown.

BIN
HankHill.wav Normal file

Binary file not shown.

BIN
Johnny.wav Normal file

Binary file not shown.

BIN
MasterChief.wav Normal file

Binary file not shown.

138
README.md Normal file

@@ -0,0 +1,138 @@
# Pocket TTS Discord Bot
A Discord bot that reads messages aloud using [Pocket TTS](https://github.com/kyutai-labs/pocket-tts) with voice cloning from a reference WAV file.
## Features
- 🎤 **Voice Cloning**: Uses a reference WAV file to clone a voice
- 📝 **Auto-read Messages**: Automatically reads all messages from a configured text channel
- 🔊 **Voice Channel Streaming**: Streams generated audio to the voice channel where the message author is
- 📋 **Message Queue**: Messages are queued and spoken in order
## Prerequisites
- Python 3.10+
- FFmpeg installed and available in PATH
- A Discord bot token
- A reference voice WAV file (3-10 seconds of clear speech recommended)
## Installation
1. **Clone the repository**:
```bash
git clone <repository-url>
cd PocketTTSBot
```
2. **Create a virtual environment**:
```bash
python -m venv venv
# Windows
venv\Scripts\activate
# Linux/macOS
source venv/bin/activate
```
3. **Install dependencies**:
```bash
pip install -r requirements.txt
```
4. **Install FFmpeg**:
- **Windows**: Download from [ffmpeg.org](https://ffmpeg.org/download.html) and add to PATH
- **Linux**: `sudo apt install ffmpeg`
- **macOS**: `brew install ffmpeg`
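Before starting the bot, it can save debugging time to confirm FFmpeg is actually reachable. A minimal stdlib check (the `ffmpeg_available` helper is illustrative, not part of the repo):

```python
import shutil

def ffmpeg_available() -> bool:
    """Return True if an `ffmpeg` executable is on the PATH."""
    return shutil.which("ffmpeg") is not None

if __name__ == "__main__":
    if ffmpeg_available():
        print("FFmpeg found")
    else:
        print("FFmpeg missing: install it and add it to PATH")
```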
## Configuration
1. **Create a Discord Bot**:
- Go to [Discord Developer Portal](https://discord.com/developers/applications)
- Create a new application
- Go to the "Bot" section and create a bot
- Copy the bot token
- Enable these Privileged Gateway Intents:
- Message Content Intent
- Server Members Intent (optional)
2. **Invite the Bot to your server**:
- Go to OAuth2 > URL Generator
- Select scopes: `bot`
- Select permissions: `Connect`, `Speak`, `Send Messages`, `Read Message History`
- Use the generated URL to invite the bot
3. **Get Channel ID**:
- Enable Developer Mode in Discord (Settings > Advanced > Developer Mode)
- Right-click the text channel you want to monitor and click "Copy ID"
4. **Create `.env` file**:
```bash
cp .env.example .env
```
Edit `.env` with your values:
```env
DISCORD_TOKEN=your_bot_token_here
TEXT_CHANNEL_ID=123456789012345678
VOICE_WAV_PATH=./voice.wav
```
5. **Add a voice reference file**:
- Place a WAV file named `voice.wav` in the project directory
- The file should contain 3-10 seconds of clear speech
- Higher quality audio = better voice cloning results
## Usage
1. **Start the bot**:
```bash
python bot.py
```
2. **Using the bot**:
- Join a voice channel in your Discord server
- Type a message in the configured text channel
- The bot will join your voice channel and read your message aloud
- Messages are queued if the bot is already speaking
## How It Works
```
┌───────────────┐     ┌───────────────┐     ┌───────────────┐
│ Text Channel  │ --> │  Pocket TTS   │ --> │ Voice Channel │
│ (configured)  │     │  (generate)   │     │  (user's VC)  │
└───────────────┘     └───────┬───────┘     └───────────────┘
                              │
                        ┌─────┴─────┐
                        │ voice.wav │
                        │ (speaker) │
                        └───────────┘
```
1. Bot monitors the configured text channel for new messages
2. When a message is received, it's added to the queue
3. The bot generates speech using Pocket TTS with the cloned voice
4. Audio is streamed to the voice channel where the message author is
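The ordering guarantee in steps 2-4 comes from a single worker task draining an `asyncio.Queue`, which can be sketched in isolation (TTS generation and playback replaced with a no-op):

```python
import asyncio

async def demo() -> list[str]:
    """One worker drains the queue, so messages are spoken strictly in order."""
    queue: asyncio.Queue[str] = asyncio.Queue()
    spoken: list[str] = []

    async def worker() -> None:
        while True:
            text = await queue.get()
            await asyncio.sleep(0)  # stand-in for generate-TTS-then-play
            spoken.append(text)
            queue.task_done()

    task = asyncio.create_task(worker())
    for msg in ("first", "second", "third"):
        await queue.put(msg)
    await queue.join()  # wait until every queued message has been "spoken"
    task.cancel()
    return spoken
```

Running `asyncio.run(demo())` returns `["first", "second", "third"]`: messages come out in the order they were queued, never interleaved.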
## Troubleshooting
### Bot doesn't respond to messages
- Ensure Message Content Intent is enabled in Discord Developer Portal
- Check that the TEXT_CHANNEL_ID is correct
- Verify the bot has permissions to read the channel
### No audio in voice channel
- Ensure FFmpeg is installed and in PATH
- Check that the bot has Connect and Speak permissions
- Verify your voice.wav file is valid
### Voice quality issues
- Use a higher quality reference WAV file
- Ensure the reference audio is clear with minimal background noise
- Try a longer reference clip (5-10 seconds)
## License
MIT License

BIN
Trump.wav Normal file

Binary file not shown.

205
audio_preprocessor.py Normal file

@@ -0,0 +1,205 @@
"""Audio preprocessing utilities for improving voice cloning quality."""
import os
import tempfile
from dataclasses import dataclass
import librosa
import noisereduce as nr
import numpy as np
import soundfile as sf
@dataclass
class PreprocessingConfig:
"""Configuration for audio preprocessing."""
target_sample_rate: int = 22050
normalize: bool = True
trim_silence: bool = True
trim_top_db: int = 20
reduce_noise: bool = True
target_length_seconds: float | None = None # None means keep original length
class AudioPreprocessor:
"""Preprocesses audio files for optimal voice cloning."""
def __init__(self, config: PreprocessingConfig | None = None):
self.config = config or PreprocessingConfig()
def preprocess_file(self, input_path: str, output_path: str | None = None) -> str:
"""
Preprocess an audio file for voice cloning.
Args:
input_path: Path to the input audio file
output_path: Optional path for the output file. If None, creates a temp file.
Returns:
Path to the preprocessed audio file
"""
print(f"Preprocessing audio: {input_path}")
# Load audio with librosa (automatically converts to mono and resamples)
audio, sr = librosa.load(
input_path,
sr=self.config.target_sample_rate,
mono=True
)
print(f" Loaded audio: {len(audio) / sr:.2f}s at {sr}Hz")
# Apply preprocessing steps
audio = self._normalize(audio)
audio = self._trim_silence(audio, sr)
audio = self._reduce_noise(audio, sr)
audio = self._limit_length(audio, sr)
# Ensure we have valid audio
if len(audio) < sr * 0.5: # Less than 0.5 seconds
print(" Warning: Audio is very short after preprocessing!")
# Save to output path
if output_path is None:
fd, output_path = tempfile.mkstemp(suffix=".wav")
os.close(fd)
sf.write(output_path, audio, sr, subtype="PCM_16")
print(f" Saved preprocessed audio: {output_path} ({len(audio) / sr:.2f}s)")
return output_path
def preprocess_to_array(self, input_path: str) -> tuple[np.ndarray, int]:
"""
Preprocess an audio file and return as numpy array.
Args:
input_path: Path to the input audio file
Returns:
Tuple of (audio array, sample rate)
"""
# Use temporary file approach for consistency
temp_path = self.preprocess_file(input_path)
audio, sr = librosa.load(temp_path, sr=None, mono=True)
os.unlink(temp_path)
return audio, sr
def _normalize(self, audio: np.ndarray) -> np.ndarray:
"""Normalize audio to a consistent volume level."""
if not self.config.normalize:
return audio
max_val = np.max(np.abs(audio))
if max_val > 0:
# Normalize to 95% of max to avoid clipping
audio = audio / max_val * 0.95
print(" Applied volume normalization")
return audio
def _trim_silence(self, audio: np.ndarray, sr: int) -> np.ndarray:
"""Trim silence from the beginning and end of audio."""
if not self.config.trim_silence:
return audio
trimmed, _ = librosa.effects.trim(
audio,
top_db=self.config.trim_top_db
)
trimmed_duration = len(audio) - len(trimmed)
if trimmed_duration > 0:
print(f" Trimmed {trimmed_duration / sr:.2f}s of silence")
return trimmed
def _reduce_noise(self, audio: np.ndarray, sr: int) -> np.ndarray:
"""Apply noise reduction to the audio."""
if not self.config.reduce_noise:
return audio
try:
reduced = nr.reduce_noise(
y=audio,
sr=sr,
stationary=True,
prop_decrease=0.75
)
print(" Applied noise reduction")
return reduced
except Exception as e:
print(f" Warning: Noise reduction failed: {e}")
return audio
def _limit_length(self, audio: np.ndarray, sr: int) -> np.ndarray:
"""Limit audio to target length if specified."""
if self.config.target_length_seconds is None:
return audio
max_samples = int(self.config.target_length_seconds * sr)
if len(audio) > max_samples:
audio = audio[:max_samples]
print(f" Trimmed to {self.config.target_length_seconds}s")
return audio
def analyze_audio(file_path: str) -> dict:
"""
Analyze an audio file and return its properties.
Useful for debugging voice cloning issues.
"""
audio, sr = librosa.load(file_path, sr=None, mono=False)
is_stereo = audio.ndim > 1
if is_stereo:
audio_mono = librosa.to_mono(audio)
else:
audio_mono = audio
duration = len(audio_mono) / sr
max_amplitude = np.max(np.abs(audio_mono))
rms = np.sqrt(np.mean(audio_mono**2))
# Estimate noise level from quietest parts
frame_length = int(sr * 0.025)
hop_length = int(sr * 0.010)
rms_frames = librosa.feature.rms(
y=audio_mono,
frame_length=frame_length,
hop_length=hop_length
)[0]
noise_floor = np.percentile(rms_frames, 10)
return {
"path": file_path,
"sample_rate": sr,
"duration_seconds": duration,
"is_stereo": is_stereo,
"max_amplitude": float(max_amplitude),
"rms_level": float(rms),
"estimated_noise_floor": float(noise_floor),
"is_normalized": max_amplitude > 0.8,
"is_too_short": duration < 3,
"is_too_long": duration > 30,
"needs_resampling": sr != 22050,
}
def print_audio_analysis(file_path: str) -> None:
"""Print a formatted analysis of an audio file."""
info = analyze_audio(file_path)
print(f"\n{'=' * 50}")
print(f"Audio Analysis: {info['path']}")
print(f"{'=' * 50}")
print(f" Sample Rate: {info['sample_rate']} Hz {'⚠️ (should be 22050)' if info['needs_resampling'] else ''}")
print(f" Duration: {info['duration_seconds']:.2f}s", end="")
if info['is_too_short']:
print(" ⚠️ (too short, aim for 5-15s)")
elif info['is_too_long']:
print(" ⚠️ (quite long, 5-15s is ideal)")
else:
print("")
print(f" Channels: {'Stereo' if info['is_stereo'] else 'Mono'} {'⚠️ (will convert to mono)' if info['is_stereo'] else ''}")
print(f" Max Amplitude: {info['max_amplitude']:.3f} {'' if info['is_normalized'] else '⚠️ (low volume)'}")
print(f" RMS Level: {info['rms_level']:.4f}")
print(f" Noise Floor: {info['estimated_noise_floor']:.4f}")
print(f"{'=' * 50}\n")

133
bot.py Normal file

@@ -0,0 +1,133 @@
import asyncio
import io
import discord
from discord.ext import commands
from config import Config
from tts_handler import TTSHandler
class TTSBot(commands.Bot):
"""Discord bot that reads messages aloud using Pocket TTS."""
def __init__(self):
intents = discord.Intents.default()
intents.message_content = True
intents.voice_states = True
super().__init__(command_prefix="!", intents=intents)
self.tts_handler = TTSHandler(Config.VOICE_WAV_PATH)
self.message_queue: asyncio.Queue[tuple[discord.Message, str]] = asyncio.Queue()
async def setup_hook(self) -> None:
"""Called when the bot is starting up."""
print("Initializing TTS...")
await asyncio.to_thread(self.tts_handler.load)
self.loop.create_task(self.process_queue())
async def on_ready(self) -> None:
print(f"Logged in as {self.user}")
print(f"Monitoring channel ID: {Config.TEXT_CHANNEL_ID}")
print("Bot is ready!")
async def on_message(self, message: discord.Message) -> None:
if message.author.bot:
return
if message.channel.id != Config.TEXT_CHANNEL_ID:
return
if not message.content.strip():
return
if message.author.voice is None:
await message.channel.send(
f"{message.author.mention}, you need to be in a voice channel for me to speak!",
delete_after=5
)
return
await self.message_queue.put((message, message.content))
print(f"Queued message from {message.author}: {message.content[:50]}...")
await self.process_commands(message)
async def process_queue(self) -> None:
"""Process messages from the queue one at a time."""
while True:
message, text = await self.message_queue.get()
try:
await self.speak_message(message, text)
except Exception as e:
print(f"Error processing message: {e}")
finally:
self.message_queue.task_done()
async def speak_message(self, message: discord.Message, text: str) -> None:
"""Generate TTS and play it in the user's voice channel."""
if message.author.voice is None:
return
voice_channel = message.author.voice.channel
voice_client = await self.ensure_voice_connection(voice_channel)
if voice_client is None:
return
print(f"Generating TTS for: {text[:50]}...")
wav_bytes = await asyncio.to_thread(self.tts_handler.generate_wav_bytes, text)
audio_source = discord.FFmpegPCMAudio(
io.BytesIO(wav_bytes),
pipe=True,
options="-loglevel panic"
)
if voice_client.is_playing():
voice_client.stop()
play_complete = asyncio.Event()
def after_playing(error):
if error:
print(f"Playback error: {error}")
self.loop.call_soon_threadsafe(play_complete.set)
voice_client.play(audio_source, after=after_playing)
print(f"Playing audio in {voice_channel.name}")
await play_complete.wait()
async def ensure_voice_connection(self, channel: discord.VoiceChannel) -> discord.VoiceClient | None:
"""Ensure we're connected to the specified voice channel."""
guild = channel.guild
if guild.voice_client is not None:
if guild.voice_client.channel.id == channel.id:
return guild.voice_client
await guild.voice_client.move_to(channel)
return guild.voice_client
try:
voice_client = await channel.connect(timeout=10.0)
return voice_client
except Exception as e:
print(f"Failed to connect to voice channel: {e}")
return None
def main():
errors = Config.validate()
if errors:
print("Configuration errors:")
for error in errors:
print(f" - {error}")
print("\nPlease create a .env file based on .env.example")
return
bot = TTSBot()
bot.run(Config.DISCORD_TOKEN)
if __name__ == "__main__":
main()

22
config.py Normal file

@@ -0,0 +1,22 @@
import os
from dotenv import load_dotenv
load_dotenv()
class Config:
DISCORD_TOKEN: str = os.getenv("DISCORD_TOKEN", "")
TEXT_CHANNEL_ID: int = int(os.getenv("TEXT_CHANNEL_ID", "0") or "0")  # blank env value falls back to 0
VOICE_WAV_PATH: str = os.getenv("VOICE_WAV_PATH", "./voice.wav")
@classmethod
def validate(cls) -> list[str]:
"""Validate configuration and return list of errors."""
errors = []
if not cls.DISCORD_TOKEN:
errors.append("DISCORD_TOKEN is not set")
if cls.TEXT_CHANNEL_ID == 0:
errors.append("TEXT_CHANNEL_ID is not set")
if not os.path.exists(cls.VOICE_WAV_PATH):
errors.append(f"Voice WAV file not found: {cls.VOICE_WAV_PATH}")
return errors

9
requirements.txt Normal file

@@ -0,0 +1,9 @@
discord.py[voice]>=2.3.0
pocket-tts>=0.1.0
scipy>=1.10.0
PyNaCl>=1.5.0
python-dotenv>=1.0.0
numpy>=1.24.0
librosa>=0.10.0
noisereduce>=3.0.0
soundfile>=0.12.0

77
tts_handler.py Normal file

@@ -0,0 +1,77 @@
import io
import numpy as np
import scipy.io.wavfile as wavfile
from typing import Any
from pocket_tts import TTSModel
from audio_preprocessor import (
AudioPreprocessor,
PreprocessingConfig,
print_audio_analysis,
)
class TTSHandler:
"""Handles text-to-speech generation using Pocket TTS."""
DISCORD_SAMPLE_RATE = 48000
def __init__(self, voice_wav_path: str, preprocess_audio: bool = True):
self.voice_wav_path = voice_wav_path
self.preprocess_audio = preprocess_audio
self.model: TTSModel | None = None
self.voice_state: Any = None
self._preprocessed_path: str | None = None
def load(self) -> None:
"""Load the TTS model and voice state from the WAV file."""
print("Loading Pocket TTS model...")
self.model = TTSModel.load_model()
voice_path = self.voice_wav_path
# Analyze and preprocess the audio if enabled
if self.preprocess_audio:
print("\nAnalyzing original audio...")
print_audio_analysis(self.voice_wav_path)
print("Preprocessing audio for optimal voice cloning...")
config = PreprocessingConfig(
target_sample_rate=22050,
normalize=True,
trim_silence=True,
trim_top_db=20,
reduce_noise=True,
target_length_seconds=15.0, # Limit to 15 seconds for best results
)
preprocessor = AudioPreprocessor(config)
voice_path = preprocessor.preprocess_file(self.voice_wav_path)
self._preprocessed_path = voice_path
print("")
print(f"Loading voice state from: {voice_path}")
self.voice_state = self.model.get_state_for_audio_prompt(voice_path)
print("TTS handler ready!")
def generate_wav_bytes(self, text: str) -> bytes:
"""Generate audio and return as WAV file bytes (for FFmpeg)."""
if self.model is None or self.voice_state is None:
raise RuntimeError("TTS handler not loaded. Call load() first.")
audio = self.model.generate_audio(self.voice_state, text)
audio_np = audio.numpy()
if audio_np.ndim == 1:
audio_np = audio_np.reshape(-1, 1)
max_val = np.max(np.abs(audio_np))
if max_val > 0:
audio_np = audio_np / max_val
audio_int16 = (audio_np * 32767).astype(np.int16)
wav_buffer = io.BytesIO()
wavfile.write(wav_buffer, self.model.sample_rate, audio_int16)
wav_buffer.seek(0)
return wav_buffer.read()