Initial commit
This commit is contained in:
9
.env.example
Normal file
9
.env.example
Normal file
@@ -0,0 +1,9 @@
|
||||
# Discord Bot Token (from Discord Developer Portal)
|
||||
DISCORD_TOKEN=your_discord_bot_token_here
|
||||
|
||||
# Channel ID to monitor for TTS messages
|
||||
# Right-click the channel in Discord and copy ID (enable Developer Mode in settings)
|
||||
TEXT_CHANNEL_ID=123456789012345678
|
||||
|
||||
# Path to the voice reference WAV file for voice cloning
|
||||
VOICE_WAV_PATH=./voice.wav
|
||||
126
.gitignore
vendored
Normal file
126
.gitignore
vendored
Normal file
@@ -0,0 +1,126 @@
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
|
||||
# C extensions
|
||||
*.so
|
||||
|
||||
# Distribution / packaging
|
||||
.Python
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
wheels/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
MANIFEST
|
||||
|
||||
# PyInstaller
|
||||
# Usually these files are written by a python script from a template
|
||||
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||
*.manifest
|
||||
*.spec
|
||||
|
||||
# Installer logs
|
||||
pip-log.txt
|
||||
pip-delete-this-directory.txt
|
||||
|
||||
# Unit test / coverage reports
|
||||
htmlcov/
|
||||
.tox/
|
||||
.nox/
|
||||
.coverage
|
||||
.coverage.*
|
||||
.cache
|
||||
nosetests.xml
|
||||
coverage.xml
|
||||
*.cover
|
||||
*.py,cover
|
||||
.hypothesis/
|
||||
.pytest_cache/
|
||||
|
||||
# Translations
|
||||
*.mo
|
||||
*.pot
|
||||
|
||||
# Django stuff:
|
||||
*.log
|
||||
local_settings.py
|
||||
db.sqlite3
|
||||
db.sqlite3-journal
|
||||
|
||||
# Flask stuff:
|
||||
instance/
|
||||
.webassets-cache
|
||||
|
||||
# Scrapy stuff:
|
||||
.scrapy
|
||||
|
||||
# Sphinx documentation
|
||||
docs/_build/
|
||||
|
||||
# PyBuilder
|
||||
target/
|
||||
|
||||
# Jupyter Notebook
|
||||
.ipynb_checkpoints
|
||||
|
||||
# IPython
|
||||
profile_default/
|
||||
ipython_config.py
|
||||
|
||||
# pyenv
|
||||
.python-version
|
||||
|
||||
# celery beat schedule file
|
||||
celerybeat-schedule
|
||||
|
||||
# SageMath parsed files
|
||||
*.sage.py
|
||||
|
||||
# Environments
|
||||
.env
|
||||
.venv
|
||||
env/
|
||||
venv/
|
||||
ENV/
|
||||
env.bak/
|
||||
venv.bak/
|
||||
|
||||
# Spyder project settings
|
||||
.spyderproject
|
||||
.spyderworkspace
|
||||
|
||||
# Rope project settings
|
||||
.ropeproject
|
||||
|
||||
# mkdocs documentation
|
||||
/site
|
||||
|
||||
# mypy
|
||||
.mypy_cache/
|
||||
.dmypy.json
|
||||
dmypy.json
|
||||
|
||||
# Environments
|
||||
.env
|
||||
.venv
|
||||
env/
|
||||
venv/
|
||||
ENV/
|
||||
env.bak/
|
||||
venv.bak/
|
||||
/venv
|
||||
# Gemini files
|
||||
GEMINI.md
|
||||
PROGRESS.md
|
||||
BIN
Estinien.wav
Normal file
BIN
Estinien.wav
Normal file
Binary file not shown.
BIN
Gibralter_funny.wav
Normal file
BIN
Gibralter_funny.wav
Normal file
Binary file not shown.
BIN
Gibralter_good.wav
Normal file
BIN
Gibralter_good.wav
Normal file
Binary file not shown.
BIN
HankHill.wav
Normal file
BIN
HankHill.wav
Normal file
Binary file not shown.
BIN
Johnny.wav
Normal file
BIN
Johnny.wav
Normal file
Binary file not shown.
BIN
MasterChief.wav
Normal file
BIN
MasterChief.wav
Normal file
Binary file not shown.
138
README.md
Normal file
138
README.md
Normal file
@@ -0,0 +1,138 @@
|
||||
# Pocket TTS Discord Bot
|
||||
|
||||
A Discord bot that reads messages aloud using [Pocket TTS](https://github.com/kyutai-labs/pocket-tts) with voice cloning from a reference WAV file.
|
||||
|
||||
## Features
|
||||
|
||||
- 🎤 **Voice Cloning**: Uses a reference WAV file to clone a voice
|
||||
- 📝 **Auto-read Messages**: Automatically reads all messages from a configured text channel
|
||||
- 🔊 **Voice Channel Streaming**: Streams generated audio to the voice channel where the message author is
|
||||
- 📋 **Message Queue**: Messages are queued and spoken in order
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- Python 3.10+
|
||||
- FFmpeg installed and available in PATH
|
||||
- A Discord bot token
|
||||
- A reference voice WAV file (5-15 seconds of clear speech recommended)
|
||||
|
||||
## Installation
|
||||
|
||||
1. **Clone the repository**:
|
||||
```bash
|
||||
git clone <repository-url>
|
||||
cd PocketTTSBot
|
||||
```
|
||||
|
||||
2. **Create a virtual environment**:
|
||||
```bash
|
||||
python -m venv venv
|
||||
|
||||
# Windows
|
||||
venv\Scripts\activate
|
||||
|
||||
# Linux/macOS
|
||||
source venv/bin/activate
|
||||
```
|
||||
|
||||
3. **Install dependencies**:
|
||||
```bash
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
4. **Install FFmpeg**:
|
||||
- **Windows**: Download from [ffmpeg.org](https://ffmpeg.org/download.html) and add to PATH
|
||||
- **Linux**: `sudo apt install ffmpeg`
|
||||
- **macOS**: `brew install ffmpeg`
|
||||
|
||||
## Configuration
|
||||
|
||||
1. **Create a Discord Bot**:
|
||||
- Go to [Discord Developer Portal](https://discord.com/developers/applications)
|
||||
- Create a new application
|
||||
- Go to the "Bot" section and create a bot
|
||||
- Copy the bot token
|
||||
- Enable these Privileged Gateway Intents:
|
||||
- Message Content Intent
|
||||
- Server Members Intent (optional)
|
||||
|
||||
2. **Invite the Bot to your server**:
|
||||
- Go to OAuth2 > URL Generator
|
||||
- Select scopes: `bot`
|
||||
- Select permissions: `Connect`, `Speak`, `Send Messages`, `Read Message History`
|
||||
- Use the generated URL to invite the bot
|
||||
|
||||
3. **Get Channel ID**:
|
||||
- Enable Developer Mode in Discord (Settings > Advanced > Developer Mode)
|
||||
- Right-click the text channel you want to monitor and click "Copy ID"
|
||||
|
||||
4. **Create `.env` file**:
|
||||
```bash
|
||||
cp .env.example .env
|
||||
```
|
||||
|
||||
Edit `.env` with your values:
|
||||
```env
|
||||
DISCORD_TOKEN=your_bot_token_here
|
||||
TEXT_CHANNEL_ID=123456789012345678
|
||||
VOICE_WAV_PATH=./voice.wav
|
||||
```
|
||||
|
||||
5. **Add a voice reference file**:
|
||||
- Place a WAV file named `voice.wav` in the project directory
|
||||
- The file should contain 5-15 seconds of clear speech
|
||||
- Higher quality audio = better voice cloning results
|
||||
|
||||
## Usage
|
||||
|
||||
1. **Start the bot**:
|
||||
```bash
|
||||
python bot.py
|
||||
```
|
||||
|
||||
2. **Using the bot**:
|
||||
- Join a voice channel in your Discord server
|
||||
- Type a message in the configured text channel
|
||||
- The bot will join your voice channel and read your message aloud
|
||||
- Messages are queued if the bot is already speaking
|
||||
|
||||
## How It Works
|
||||
|
||||
```
|
||||
┌─────────────────┐ ┌──────────────────┐ ┌─────────────────┐
|
||||
│ Text Channel │ --> │ Pocket TTS │ --> │ Voice Channel │
|
||||
│ (configured) │ │ (generate) │ │ (user's VC) │
|
||||
└─────────────────┘ └──────────────────┘ └─────────────────┘
|
||||
▲
|
||||
│
|
||||
┌─────┴─────┐
|
||||
│ voice.wav │
|
||||
│ (speaker) │
|
||||
└───────────┘
|
||||
```
|
||||
|
||||
1. Bot monitors the configured text channel for new messages
|
||||
2. When a message is received, it's added to the queue
|
||||
3. The bot generates speech using Pocket TTS with the cloned voice
|
||||
4. Audio is streamed to the voice channel where the message author is
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Bot doesn't respond to messages
|
||||
- Ensure Message Content Intent is enabled in Discord Developer Portal
|
||||
- Check that the TEXT_CHANNEL_ID is correct
|
||||
- Verify the bot has permissions to read the channel
|
||||
|
||||
### No audio in voice channel
|
||||
- Ensure FFmpeg is installed and in PATH
|
||||
- Check that the bot has Connect and Speak permissions
|
||||
- Verify your voice.wav file is valid
|
||||
|
||||
### Voice quality issues
|
||||
- Use a higher quality reference WAV file
|
||||
- Ensure the reference audio is clear with minimal background noise
|
||||
- Try a longer reference clip (5-10 seconds)
|
||||
|
||||
## License
|
||||
|
||||
MIT License
|
||||
205
audio_preprocessor.py
Normal file
205
audio_preprocessor.py
Normal file
@@ -0,0 +1,205 @@
|
||||
"""Audio preprocessing utilities for improving voice cloning quality."""
|
||||
|
||||
import os
|
||||
import tempfile
|
||||
from dataclasses import dataclass
|
||||
|
||||
import librosa
|
||||
import noisereduce as nr
|
||||
import numpy as np
|
||||
import soundfile as sf
|
||||
|
||||
|
||||
@dataclass
class PreprocessingConfig:
    """Configuration for audio preprocessing.

    Each flag toggles one stage applied by AudioPreprocessor; the numeric
    values trade cleaning aggressiveness against fidelity of the voice
    reference.
    """

    # Sample rate (Hz) the audio is resampled to when loaded.
    target_sample_rate: int = 22050
    # Scale the peak amplitude to a consistent level.
    normalize: bool = True
    # Strip leading/trailing silence (librosa.effects.trim).
    trim_silence: bool = True
    # Threshold in dB below peak under which audio counts as silence.
    trim_top_db: int = 20
    # Apply stationary spectral-gating noise reduction (noisereduce).
    reduce_noise: bool = True
    target_length_seconds: float | None = None  # None means keep original length
|
||||
|
||||
|
||||
class AudioPreprocessor:
    """Preprocesses audio files for optimal voice cloning.

    Pipeline (in order): load/resample to mono -> normalize -> trim
    silence -> reduce noise -> limit length. Stages are individually
    toggled via PreprocessingConfig.
    """

    def __init__(self, config: PreprocessingConfig | None = None):
        # Fall back to the default configuration when none is supplied.
        self.config = config or PreprocessingConfig()

    def preprocess_file(self, input_path: str, output_path: str | None = None) -> str:
        """
        Preprocess an audio file for voice cloning.

        Args:
            input_path: Path to the input audio file
            output_path: Optional path for the output file. If None, creates a temp file.

        Returns:
            Path to the preprocessed audio file
        """
        print(f"Preprocessing audio: {input_path}")

        # Load audio with librosa (automatically converts to mono and resamples)
        audio, sr = librosa.load(
            input_path,
            sr=self.config.target_sample_rate,
            mono=True
        )
        print(f" Loaded audio: {len(audio) / sr:.2f}s at {sr}Hz")

        # Apply preprocessing steps — order matters: normalize first so the
        # silence-trim threshold (relative dB) sees consistent levels.
        audio = self._normalize(audio)
        audio = self._trim_silence(audio, sr)
        audio = self._reduce_noise(audio, sr)
        audio = self._limit_length(audio, sr)

        # Ensure we have valid audio
        if len(audio) < sr * 0.5:  # Less than 0.5 seconds
            print(" Warning: Audio is very short after preprocessing!")

        # Save to output path; mkstemp so concurrent callers never collide.
        if output_path is None:
            fd, output_path = tempfile.mkstemp(suffix=".wav")
            os.close(fd)  # soundfile reopens by path; release the raw fd

        sf.write(output_path, audio, sr, subtype="PCM_16")
        print(f" Saved preprocessed audio: {output_path} ({len(audio) / sr:.2f}s)")

        return output_path

    def preprocess_to_array(self, input_path: str) -> tuple[np.ndarray, int]:
        """
        Preprocess an audio file and return as numpy array.

        Args:
            input_path: Path to the input audio file

        Returns:
            Tuple of (audio array, sample rate)
        """
        # Use temporary file approach for consistency
        temp_path = self.preprocess_file(input_path)
        # sr=None preserves the sample rate written by preprocess_file.
        audio, sr = librosa.load(temp_path, sr=None, mono=True)
        os.unlink(temp_path)
        return audio, sr

    def _normalize(self, audio: np.ndarray) -> np.ndarray:
        """Normalize audio to a consistent volume level."""
        if not self.config.normalize:
            return audio

        max_val = np.max(np.abs(audio))
        if max_val > 0:  # guard against an all-silence (zero) signal
            # Normalize to 95% of max to avoid clipping
            audio = audio / max_val * 0.95
            print(" Applied volume normalization")
        return audio

    def _trim_silence(self, audio: np.ndarray, sr: int) -> np.ndarray:
        """Trim silence from the beginning and end of audio."""
        if not self.config.trim_silence:
            return audio

        trimmed, _ = librosa.effects.trim(
            audio,
            top_db=self.config.trim_top_db
        )
        # Report how much was removed (in samples -> seconds).
        trimmed_duration = len(audio) - len(trimmed)
        if trimmed_duration > 0:
            print(f" Trimmed {trimmed_duration / sr:.2f}s of silence")
        return trimmed

    def _reduce_noise(self, audio: np.ndarray, sr: int) -> np.ndarray:
        """Apply noise reduction to the audio."""
        if not self.config.reduce_noise:
            return audio

        try:
            reduced = nr.reduce_noise(
                y=audio,
                sr=sr,
                stationary=True,
                prop_decrease=0.75  # partial reduction to limit artifacts
            )
            print(" Applied noise reduction")
            return reduced
        except Exception as e:
            # Noise reduction is best-effort; fall back to the raw audio.
            print(f" Warning: Noise reduction failed: {e}")
            return audio

    def _limit_length(self, audio: np.ndarray, sr: int) -> np.ndarray:
        """Limit audio to target length if specified."""
        if self.config.target_length_seconds is None:
            return audio

        max_samples = int(self.config.target_length_seconds * sr)
        if len(audio) > max_samples:
            audio = audio[:max_samples]
            print(f" Trimmed to {self.config.target_length_seconds}s")
        return audio
|
||||
|
||||
|
||||
def analyze_audio(file_path: str) -> dict:
    """
    Analyze an audio file and return its properties.
    Useful for debugging voice cloning issues.
    """
    # sr=None keeps the native sample rate; mono=False keeps channel layout.
    audio, sr = librosa.load(file_path, sr=None, mono=False)

    stereo = audio.ndim > 1
    mono = librosa.to_mono(audio) if stereo else audio

    duration = len(mono) / sr
    peak = np.max(np.abs(mono))
    rms_overall = np.sqrt(np.mean(mono**2))

    # Noise-floor estimate: short-time RMS (25 ms windows, 10 ms hop),
    # taking the 10th percentile as "the quietest parts".
    frames = librosa.feature.rms(
        y=mono,
        frame_length=int(sr * 0.025),
        hop_length=int(sr * 0.010),
    )[0]
    floor = np.percentile(frames, 10)

    return {
        "path": file_path,
        "sample_rate": sr,
        "duration_seconds": duration,
        "is_stereo": stereo,
        "max_amplitude": float(peak),
        "rms_level": float(rms_overall),
        "estimated_noise_floor": float(floor),
        "is_normalized": peak > 0.8,
        "is_too_short": duration < 3,
        "is_too_long": duration > 30,
        "needs_resampling": sr != 22050,
    }
|
||||
|
||||
|
||||
def print_audio_analysis(file_path: str) -> None:
    """Print a formatted analysis of an audio file.

    Thin presentation layer over analyze_audio(): flags sample rate,
    duration, channel count, and level problems with warning markers.
    """
    info = analyze_audio(file_path)

    print(f"\n{'=' * 50}")
    print(f"Audio Analysis: {info['path']}")
    print(f"{'=' * 50}")
    print(f" Sample Rate: {info['sample_rate']} Hz {'⚠️ (should be 22050)' if info['needs_resampling'] else '✓'}")
    # end="" so the duration verdict below lands on the same line.
    print(f" Duration: {info['duration_seconds']:.2f}s", end="")
    if info['is_too_short']:
        print(" ⚠️ (too short, aim for 5-15s)")
    elif info['is_too_long']:
        print(" ⚠️ (quite long, 5-15s is ideal)")
    else:
        print(" ✓")
    print(f" Channels: {'Stereo' if info['is_stereo'] else 'Mono'} {'⚠️ (will convert to mono)' if info['is_stereo'] else '✓'}")
    print(f" Max Amplitude: {info['max_amplitude']:.3f} {'✓' if info['is_normalized'] else '⚠️ (low volume)'}")
    print(f" RMS Level: {info['rms_level']:.4f}")
    print(f" Noise Floor: {info['estimated_noise_floor']:.4f}")
    print(f"{'=' * 50}\n")
|
||||
133
bot.py
Normal file
133
bot.py
Normal file
@@ -0,0 +1,133 @@
|
||||
import asyncio
|
||||
import io
|
||||
import discord
|
||||
from discord.ext import commands
|
||||
from config import Config
|
||||
from tts_handler import TTSHandler
|
||||
|
||||
|
||||
class TTSBot(commands.Bot):
    """Discord bot that reads messages aloud using Pocket TTS.

    Messages posted in the configured text channel are queued and spoken
    one at a time in the voice channel of each message's author.
    """

    def __init__(self):
        # message_content is a privileged intent — required to read the text
        # to speak; voice_states is needed to locate the author's channel.
        intents = discord.Intents.default()
        intents.message_content = True
        intents.voice_states = True
        super().__init__(command_prefix="!", intents=intents)

        self.tts_handler = TTSHandler(Config.VOICE_WAV_PATH)
        # FIFO of (originating message, text to speak); drained by process_queue.
        self.message_queue: asyncio.Queue[tuple[discord.Message, str]] = asyncio.Queue()

    async def setup_hook(self) -> None:
        """Called when the bot is starting up."""
        print("Initializing TTS...")
        # Model loading is blocking/CPU-heavy; keep it off the event loop.
        await asyncio.to_thread(self.tts_handler.load)
        # Long-lived background worker that serializes playback.
        self.loop.create_task(self.process_queue())

    async def on_ready(self) -> None:
        # Informational only; discord.py handles connection bookkeeping.
        print(f"Logged in as {self.user}")
        print(f"Monitoring channel ID: {Config.TEXT_CHANNEL_ID}")
        print("Bot is ready!")

    async def on_message(self, message: discord.Message) -> None:
        """Queue speakable messages arriving in the monitored channel."""
        if message.author.bot:
            return

        if message.channel.id != Config.TEXT_CHANNEL_ID:
            return

        if not message.content.strip():
            return

        # Playback needs a target: the author must currently be in voice.
        if message.author.voice is None:
            await message.channel.send(
                f"{message.author.mention}, you need to be in a voice channel for me to speak!",
                delete_after=5
            )
            return

        await self.message_queue.put((message, message.content))
        print(f"Queued message from {message.author}: {message.content[:50]}...")

        # Still forward the message to prefix-command processing ("!").
        await self.process_commands(message)

    async def process_queue(self) -> None:
        """Process messages from the queue one at a time."""
        while True:
            message, text = await self.message_queue.get()

            try:
                await self.speak_message(message, text)
            except Exception as e:
                # Keep the worker alive: one bad message must not kill the loop.
                print(f"Error processing message: {e}")
            finally:
                self.message_queue.task_done()

    async def speak_message(self, message: discord.Message, text: str) -> None:
        """Generate TTS and play it in the user's voice channel."""
        # The author may have left voice while the message sat in the queue.
        if message.author.voice is None:
            return

        voice_channel = message.author.voice.channel

        voice_client = await self.ensure_voice_connection(voice_channel)
        if voice_client is None:
            return

        print(f"Generating TTS for: {text[:50]}...")
        # TTS generation is CPU-bound; run it in a worker thread.
        wav_bytes = await asyncio.to_thread(self.tts_handler.generate_wav_bytes, text)

        # pipe=True: the in-memory WAV is fed to FFmpeg over stdin.
        audio_source = discord.FFmpegPCMAudio(
            io.BytesIO(wav_bytes),
            pipe=True,
            options="-loglevel panic"
        )

        if voice_client.is_playing():
            voice_client.stop()

        play_complete = asyncio.Event()

        def after_playing(error):
            # Runs in the player thread — hop back onto the event loop to
            # set the asyncio.Event safely.
            if error:
                print(f"Playback error: {error}")
            self.loop.call_soon_threadsafe(play_complete.set)

        voice_client.play(audio_source, after=after_playing)
        print(f"Playing audio in {voice_channel.name}")

        # Block the queue worker until playback finishes (preserves order).
        await play_complete.wait()

    async def ensure_voice_connection(self, channel: discord.VoiceChannel) -> discord.VoiceClient | None:
        """Ensure we're connected to the specified voice channel."""
        guild = channel.guild

        if guild.voice_client is not None:
            # Reuse or move the existing connection instead of reconnecting.
            if guild.voice_client.channel.id == channel.id:
                return guild.voice_client
            await guild.voice_client.move_to(channel)
            return guild.voice_client

        try:
            voice_client = await channel.connect(timeout=10.0)
            return voice_client
        except Exception as e:
            print(f"Failed to connect to voice channel: {e}")
            return None
|
||||
|
||||
|
||||
def main():
    """Entry point: validate configuration, then start the bot."""
    problems = Config.validate()
    if problems:
        # Refuse to start with a broken configuration; list every issue.
        print("Configuration errors:")
        for problem in problems:
            print(f"  - {problem}")
        print("\nPlease create a .env file based on .env.example")
        return

    TTSBot().run(Config.DISCORD_TOKEN)


if __name__ == "__main__":
    main()
|
||||
22
config.py
Normal file
22
config.py
Normal file
@@ -0,0 +1,22 @@
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv()
|
||||
|
||||
|
||||
def _int_from_env(name: str, default: int = 0) -> int:
    """Read environment variable *name* as an int, falling back to *default*.

    A malformed value (e.g. "abc" or "") previously crashed at import time
    with ValueError before Config.validate() could report anything; now it
    degrades to *default* so validate() surfaces the problem instead.
    """
    try:
        return int(os.getenv(name, ""))
    except ValueError:
        return default


class Config:
    """Application settings sourced from environment variables (.env)."""

    # Bot token from the Discord Developer Portal.
    DISCORD_TOKEN: str = os.getenv("DISCORD_TOKEN", "")
    # Text channel whose messages are read aloud; 0 means "not configured".
    TEXT_CHANNEL_ID: int = _int_from_env("TEXT_CHANNEL_ID", 0)
    # Reference WAV file used for voice cloning.
    VOICE_WAV_PATH: str = os.getenv("VOICE_WAV_PATH", "./voice.wav")

    @classmethod
    def validate(cls) -> list[str]:
        """Validate configuration and return list of errors."""
        errors = []
        if not cls.DISCORD_TOKEN:
            errors.append("DISCORD_TOKEN is not set")
        if cls.TEXT_CHANNEL_ID == 0:
            errors.append("TEXT_CHANNEL_ID is not set")
        if not os.path.exists(cls.VOICE_WAV_PATH):
            errors.append(f"Voice WAV file not found: {cls.VOICE_WAV_PATH}")
        return errors
|
||||
BIN
media/Subnautica/CyclopsEngineOff.oga
Normal file
BIN
media/Subnautica/CyclopsEngineOff.oga
Normal file
Binary file not shown.
BIN
media/Subnautica/CyclopsEngineOn.oga
Normal file
BIN
media/Subnautica/CyclopsEngineOn.oga
Normal file
Binary file not shown.
BIN
media/Subnautica/CyclopsOverheat.oga
Normal file
BIN
media/Subnautica/CyclopsOverheat.oga
Normal file
Binary file not shown.
BIN
media/Subnautica/Cyclops_Welcome.oga
Normal file
BIN
media/Subnautica/Cyclops_Welcome.oga
Normal file
Binary file not shown.
BIN
media/Subnautica/Cyclops_Welcome2.oga
Normal file
BIN
media/Subnautica/Cyclops_Welcome2.oga
Normal file
Binary file not shown.
BIN
media/TF2/Ronin/diag_gs_titanRonin_embark_03.wav
Normal file
BIN
media/TF2/Ronin/diag_gs_titanRonin_embark_03.wav
Normal file
Binary file not shown.
BIN
media/TF2/Ronin/diag_gs_titanRonin_embark_05.wav
Normal file
BIN
media/TF2/Ronin/diag_gs_titanRonin_embark_05.wav
Normal file
Binary file not shown.
BIN
media/TF2/Ronin/diag_gs_titanRonin_embark_06.wav
Normal file
BIN
media/TF2/Ronin/diag_gs_titanRonin_embark_06.wav
Normal file
Binary file not shown.
BIN
media/TF2/Ronin/diag_gs_titanRonin_embark_08.wav
Normal file
BIN
media/TF2/Ronin/diag_gs_titanRonin_embark_08.wav
Normal file
Binary file not shown.
BIN
media/TF2/Ronin/diag_gs_titanRonin_embark_09.wav
Normal file
BIN
media/TF2/Ronin/diag_gs_titanRonin_embark_09.wav
Normal file
Binary file not shown.
BIN
media/TF2/Ronin/diag_gs_titanRonin_embark_10.wav
Normal file
BIN
media/TF2/Ronin/diag_gs_titanRonin_embark_10.wav
Normal file
Binary file not shown.
BIN
media/TF2/Ronin/diag_gs_titanRonin_embark_11.wav
Normal file
BIN
media/TF2/Ronin/diag_gs_titanRonin_embark_11.wav
Normal file
Binary file not shown.
9
requirements.txt
Normal file
9
requirements.txt
Normal file
@@ -0,0 +1,9 @@
|
||||
discord.py[voice]>=2.3.0
|
||||
pocket-tts>=0.1.0
|
||||
scipy>=1.10.0
|
||||
PyNaCl>=1.5.0
|
||||
python-dotenv>=1.0.0
|
||||
numpy>=1.24.0
|
||||
librosa>=0.10.0
|
||||
noisereduce>=3.0.0
|
||||
soundfile>=0.12.0
|
||||
77
tts_handler.py
Normal file
77
tts_handler.py
Normal file
@@ -0,0 +1,77 @@
|
||||
import io
|
||||
import numpy as np
|
||||
import scipy.io.wavfile as wavfile
|
||||
from typing import Any
|
||||
from pocket_tts import TTSModel
|
||||
|
||||
from audio_preprocessor import (
|
||||
AudioPreprocessor,
|
||||
PreprocessingConfig,
|
||||
print_audio_analysis,
|
||||
)
|
||||
|
||||
|
||||
class TTSHandler:
    """Handles text-to-speech generation using Pocket TTS.

    Usage: construct, call load() once (blocking, heavy), then call
    generate_wav_bytes() per utterance.
    """

    # Discord expects 48 kHz PCM; FFmpeg resamples our WAV to this on playback.
    DISCORD_SAMPLE_RATE = 48000

    def __init__(self, voice_wav_path: str, preprocess_audio: bool = True):
        """
        Args:
            voice_wav_path: Reference WAV used for voice cloning.
            preprocess_audio: Clean/normalize the reference before cloning.
        """
        self.voice_wav_path = voice_wav_path
        self.preprocess_audio = preprocess_audio
        # Annotation quoted so it is never evaluated at runtime.
        self.model: "TTSModel | None" = None
        self.voice_state: Any = None
        # Path of the mkstemp()-created preprocessed WAV, if any; removed
        # in load() once the voice state has been captured.
        self._preprocessed_path: str | None = None

    def load(self) -> None:
        """Load the TTS model and voice state from the WAV file."""
        import os  # local import: only needed for temp-file cleanup below

        print("Loading Pocket TTS model...")
        self.model = TTSModel.load_model()

        voice_path = self.voice_wav_path

        # Analyze and preprocess the audio if enabled
        if self.preprocess_audio:
            print("\nAnalyzing original audio...")
            print_audio_analysis(self.voice_wav_path)

            print("Preprocessing audio for optimal voice cloning...")
            config = PreprocessingConfig(
                target_sample_rate=22050,
                normalize=True,
                trim_silence=True,
                trim_top_db=20,
                reduce_noise=True,
                target_length_seconds=15.0,  # Limit to 15 seconds for best results
            )
            preprocessor = AudioPreprocessor(config)
            voice_path = preprocessor.preprocess_file(self.voice_wav_path)
            self._preprocessed_path = voice_path
            print("")

        print(f"Loading voice state from: {voice_path}")
        self.voice_state = self.model.get_state_for_audio_prompt(voice_path)

        # BUGFIX: the preprocessed reference is a mkstemp() temp file that was
        # never deleted; remove it now that the voice state is captured.
        if self._preprocessed_path is not None:
            try:
                os.unlink(self._preprocessed_path)
            except OSError:
                pass  # best-effort cleanup; a stale temp file is not fatal
            self._preprocessed_path = None

        print("TTS handler ready!")

    def generate_wav_bytes(self, text: str) -> bytes:
        """Generate audio and return as WAV file bytes (for FFmpeg).

        Raises:
            RuntimeError: if load() has not been called first.
        """
        if self.model is None or self.voice_state is None:
            raise RuntimeError("TTS handler not loaded. Call load() first.")

        audio = self.model.generate_audio(self.voice_state, text)
        audio_np = audio.numpy()

        # WAV writer wants (n_samples, n_channels); mono comes back 1-D.
        if audio_np.ndim == 1:
            audio_np = audio_np.reshape(-1, 1)

        # Peak-normalize (guarding against silence), then float -> 16-bit PCM.
        max_val = np.max(np.abs(audio_np))
        if max_val > 0:
            audio_np = audio_np / max_val
        audio_int16 = (audio_np * 32767).astype(np.int16)

        wav_buffer = io.BytesIO()
        wavfile.write(wav_buffer, self.model.sample_rate, audio_int16)
        wav_buffer.seek(0)
        return wav_buffer.read()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user