feat: add audio effects (pitch and speed control)

- Added new audio_effects.py module with pitch shift and speed change
- Pitch range: -12 to +12 semitones (higher = chipmunk, lower = deeper)
- Speed range: 0.5 to 2.0x (higher = faster, lower = slower)
- Maximum 2 active effects per user (performance optimization)
- Added /effects command group:
  - /effects list - Shows current effects with descriptions
  - /effects set pitch|speed <value> - Apply effects
  - /effects reset - Confirmation UI to clear all effects
- Effects persist across restarts in preferences.json
- Updated /voice preview to support optional pitch/speed parameters
- Effects applied in _generate_wav_bytes using librosa
- Added performance warnings when processing takes >1 second
- Updated README with effects documentation
This commit is contained in:
2026-01-31 15:43:29 -06:00
parent 4a2d72517f
commit 9f14e8c745
4 changed files with 527 additions and 29 deletions

292
bot.py
View File

@@ -24,6 +24,7 @@ import scipy.io.wavfile as wavfile
from discord import app_commands
from discord.ext import commands
from audio_effects import AudioEffects
from config import Config
from voice_manager import VoiceManager
@@ -56,8 +57,9 @@ class TTSBot(commands.Bot):
self.voice_manager = VoiceManager(Config.VOICES_DIR, Config.DEFAULT_VOICE)
self.message_queue: asyncio.Queue[tuple[discord.Message, str] | tuple[discord.Message, str, str]] = asyncio.Queue()
self.last_activity: float = 0.0
self._setup_slash_commands()
self._setup_effects_commands()
def _setup_slash_commands(self) -> None:
"""Set up slash commands for voice management."""
@@ -65,7 +67,9 @@ class TTSBot(commands.Bot):
@self.tree.command(name="voice", description="Manage your TTS voice")
@app_commands.describe(
action="What to do",
voice_name="Name of the voice (for 'set' action)"
voice_name="Name of the voice (for 'set' or 'preview' action)",
preview_pitch="Optional pitch for preview (-12 to 12, default: use your settings)",
preview_speed="Optional speed for preview (0.5 to 2.0, default: use your settings)",
)
@app_commands.choices(action=[
app_commands.Choice(name="list", value="list"),
@@ -77,7 +81,9 @@ class TTSBot(commands.Bot):
async def voice_command(
interaction: discord.Interaction,
action: app_commands.Choice[str],
voice_name: str | None = None
voice_name: str | None = None,
preview_pitch: int | None = None,
preview_speed: float | None = None,
):
if action.value == "list":
await self._handle_voice_list(interaction)
@@ -88,7 +94,7 @@ class TTSBot(commands.Bot):
elif action.value == "refresh":
await self._handle_voice_refresh(interaction)
elif action.value == "preview":
await self._handle_voice_preview(interaction, voice_name)
await self._handle_voice_preview(interaction, voice_name, preview_pitch, preview_speed)
@voice_command.autocomplete("voice_name")
async def voice_name_autocomplete(
@@ -102,6 +108,161 @@ class TTSBot(commands.Bot):
if current.lower() in v.lower()
][:25]
def _setup_effects_commands(self) -> None:
    """Register the ``/effects`` slash command on the bot's command tree.

    The command accepts an ``action`` (``list``, ``set`` or ``reset``) plus an
    optional ``effect_name`` and ``value``; the latter two are only forwarded
    to the handler for the ``set`` action.
    """
    # Choice names and values are identical, so build them from the labels.
    action_choices = [
        app_commands.Choice(name=label, value=label)
        for label in ("list", "set", "reset")
    ]
    effect_choices = [
        app_commands.Choice(name=label, value=label)
        for label in ("pitch", "speed")
    ]

    @self.tree.command(name="effects", description="Manage your TTS audio effects")
    @app_commands.describe(
        action="What to do",
        effect_name="Name of the effect (for 'set' action)",
        value="Value for the effect (for 'set' action)",
    )
    @app_commands.choices(action=action_choices)
    @app_commands.choices(effect_name=effect_choices)
    async def effects_command(
        interaction: discord.Interaction,
        action: app_commands.Choice[str],
        effect_name: app_commands.Choice[str] | None = None,
        value: str | None = None,
    ):
        # Route the selected action to its handler; any other value is ignored.
        selected = action.value
        if selected == "list":
            await self._handle_effects_list(interaction)
            return
        if selected == "set":
            await self._handle_effects_set(interaction, effect_name, value)
            return
        if selected == "reset":
            await self._handle_effects_reset(interaction)
async def _handle_effects_list(self, interaction: discord.Interaction) -> None:
    """Reply to ``/effects list`` with the caller's current effect settings.

    The ephemeral reply shows the formatted pitch and speed values with their
    descriptions, the number of active effects against the allowed maximum,
    and short usage hints.
    """
    user_id = interaction.user.id
    effects = self.voice_manager.get_user_effects(user_id)
    active_count = self.voice_manager.count_active_effects(user_id)
    limit = AudioEffects.MAX_ACTIVE_EFFECTS

    report = ["**Your Audio Effects:**\n"]

    # One two-line entry per effect: formatted value, then its description.
    for icon, title, key in (("🎵", "Pitch", "pitch"), ("⚡", "Speed", "speed")):
        description = AudioEffects.get_effect_description(key)
        shown = AudioEffects.format_effect_value(key, effects[key])
        report.append(f"{icon} **{title}**: {shown}")
        report.append(f" {description}\n")

    # Active-effect counter, with a warning at the cap or the remaining budget.
    report.append(f"**Active Effects**: {active_count}/{limit}")
    if active_count >= limit:
        report.append("⚠️ Max effects reached. More effects = slower processing time.")
    elif active_count > 0:
        report.append(f" You can add {limit - active_count} more effect(s).")

    report.append("\n*Use `/effects set <effect> <value>` to change settings*")
    report.append("*Use `/effects reset` to clear all effects*")

    await interaction.response.send_message("\n".join(report), ephemeral=True)
async def _handle_effects_set(
    self,
    interaction: discord.Interaction,
    effect_name: app_commands.Choice[str] | None,
    value: str | None
) -> None:
    """Handle the ``/effects set`` command.

    Args:
        interaction: The interaction that invoked the command. Every reply is
            sent ephemerally through ``interaction.response``.
        effect_name: The effect choice picked by the user, or ``None`` when
            it was omitted.
        value: The raw value string supplied by the user, or ``None`` when it
            was omitted. It is forwarded to the voice manager unparsed.
    """
    if effect_name is None or value is None:
        await interaction.response.send_message(
            "❌ Please provide both effect name and value. Example: `/effects set pitch 3`",
            ephemeral=True
        )
        return

    # set_user_effect returns a (success, message) pair. The message is sent
    # to the user unchanged on both success and failure, so no branch on the
    # success flag is needed here.
    _success, message = self.voice_manager.set_user_effect(
        interaction.user.id,
        effect_name.value,
        value
    )
    await interaction.response.send_message(
        f"{message}",
        ephemeral=True
    )
async def _handle_effects_reset(self, interaction: discord.Interaction) -> None:
    """Handle ``/effects reset`` by asking the user to confirm first.

    When the caller has no active effects, an ephemeral notice is sent and
    nothing else happens. Otherwise an ephemeral prompt with confirm/cancel
    buttons is shown; the effects are only reset when the confirm button is
    pressed by the user who invoked the command.
    """
    requester_id = interaction.user.id

    # Nothing to confirm when no effect is active.
    active_count = self.voice_manager.count_active_effects(requester_id)
    if active_count == 0:
        await interaction.response.send_message(
            " You don't have any active effects to reset.",
            ephemeral=True
        )
        return

    class ConfirmResetView(discord.ui.View):
        """Two-button confirmation prompt; the view times out after 30 seconds."""

        def __init__(self, voice_manager, user_id):
            super().__init__(timeout=30)
            self.voice_manager = voice_manager
            self.user_id = user_id
            self.confirmed = False

        async def _deny_if_not_owner(self, interaction: discord.Interaction) -> bool:
            """Tell any other user the button is not theirs; return True if denied."""
            if interaction.user.id == self.user_id:
                return False
            await interaction.response.send_message("This button is not for you!", ephemeral=True)
            return True

        @discord.ui.button(label="✅ Yes, Reset All", style=discord.ButtonStyle.danger)
        async def confirm_button(self, interaction: discord.Interaction, button: discord.ui.Button):
            if await self._deny_if_not_owner(interaction):
                return
            self.voice_manager.reset_user_effects(self.user_id)
            self.confirmed = True
            await interaction.response.edit_message(
                content="✅ All audio effects have been reset to defaults!",
                view=None
            )
            self.stop()

        @discord.ui.button(label="❌ Cancel", style=discord.ButtonStyle.secondary)
        async def cancel_button(self, interaction: discord.Interaction, button: discord.ui.Button):
            if await self._deny_if_not_owner(interaction):
                return
            await interaction.response.edit_message(
                content="❌ Reset cancelled. Your effects remain unchanged.",
                view=None
            )
            self.stop()

    prompt = (
        "⚠️ **Reset Confirmation**\n\n"
        f"You have {active_count} active effect(s).\n"
        "This will reset **all** your audio effects to defaults:\n"
        "• Pitch: 0 (normal)\n"
        "• Speed: 1.0x (normal)\n\n"
        "Are you sure you want to continue?"
    )
    await interaction.response.send_message(
        prompt,
        view=ConfirmResetView(self.voice_manager, requester_id),
        ephemeral=True
    )
async def _handle_voice_list(self, interaction: discord.Interaction) -> None:
"""Handle /voice list command."""
voices = self.voice_manager.get_available_voices()
@@ -222,7 +383,13 @@ class TTSBot(commands.Bot):
ephemeral=True
)
async def _handle_voice_preview(self, interaction: discord.Interaction, voice_name: str | None) -> None:
async def _handle_voice_preview(
self,
interaction: discord.Interaction,
voice_name: str | None,
preview_pitch: int | None = None,
preview_speed: float | None = None,
) -> None:
"""Handle /voice preview command."""
if not voice_name:
await interaction.response.send_message(
@@ -230,7 +397,7 @@ class TTSBot(commands.Bot):
ephemeral=True
)
return
# Check if user is in a voice channel
if interaction.user.voice is None:
await interaction.response.send_message(
@@ -238,9 +405,9 @@ class TTSBot(commands.Bot):
ephemeral=True
)
return
voice_name = voice_name.lower()
# Validate voice exists
if not self.voice_manager.is_voice_available(voice_name):
voices = self.voice_manager.get_available_voices()
@@ -250,35 +417,69 @@ class TTSBot(commands.Bot):
ephemeral=True
)
return
# Validate pitch if provided
if preview_pitch is not None:
is_valid, error_msg = AudioEffects.validate_effect("pitch", preview_pitch)
if not is_valid:
await interaction.response.send_message(
f"❌ Invalid pitch value: {error_msg}",
ephemeral=True
)
return
# Validate speed if provided
if preview_speed is not None:
is_valid, error_msg = AudioEffects.validate_effect("speed", preview_speed)
if not is_valid:
await interaction.response.send_message(
f"❌ Invalid speed value: {error_msg}",
ephemeral=True
)
return
# Select a random preview line
preview_text = random.choice(PREVIEW_LINES)
# Create a preview message object with all necessary attributes
class PreviewMessage:
def __init__(self, user, channel, voice_channel):
self.author = user
self.channel = channel
self._voice_channel = voice_channel
@property
def voice(self):
class VoiceState:
def __init__(self, channel):
self.channel = channel
return VoiceState(self._voice_channel)
preview_message = PreviewMessage(
interaction.user,
interaction.channel,
interaction.user.voice.channel
)
# Queue the preview with voice override
await self.message_queue.put((preview_message, preview_text, voice_name))
# Use user's current effects if not overridden
user_effects = self.voice_manager.get_user_effects(interaction.user.id)
final_pitch = preview_pitch if preview_pitch is not None else user_effects["pitch"]
final_speed = preview_speed if preview_speed is not None else user_effects["speed"]
# Queue the preview with voice override and effects
await self.message_queue.put((preview_message, preview_text, voice_name, final_pitch, final_speed))
# Build effect description
effect_desc = []
if final_pitch != 0:
effect_desc.append(f"pitch: {final_pitch:+d}")
if final_speed != 1.0:
effect_desc.append(f"speed: {final_speed:.1f}x")
effect_str = f" (with {', '.join(effect_desc)})" if effect_desc else ""
await interaction.response.send_message(
f"⏳ Queued preview for `{voice_name}`. Sample: \"{preview_text[:50]}{'...' if len(preview_text) > 50 else ''}\"",
f"⏳ Queued preview for `{voice_name}`{effect_str}. Sample: \"{preview_text[:50]}{'...' if len(preview_text) > 50 else ''}\"",
ephemeral=True
)
@@ -335,22 +536,38 @@ class TTSBot(commands.Bot):
"""Process messages from the queue one at a time."""
while True:
queue_item = await self.message_queue.get()
# Handle both regular messages (message, text) and previews (message, text, voice_name)
if len(queue_item) == 3:
# Handle queue items of different lengths:
# - (message, text) - regular message
# - (message, text, voice_name) - preview with voice override
# - (message, text, voice_name, pitch, speed) - preview with effects
if len(queue_item) == 5:
message, text, voice_override, pitch, speed = queue_item
elif len(queue_item) == 3:
message, text, voice_override = queue_item
pitch = None
speed = None
else:
message, text = queue_item
voice_override = None
pitch = None
speed = None
try:
await self.speak_message(message, text, voice_override)
await self.speak_message(message, text, voice_override, pitch, speed)
except Exception as e:
print(f"Error processing message: {e}")
finally:
self.message_queue.task_done()
async def speak_message(self, message: discord.Message, text: str, voice_override: str | None = None) -> None:
async def speak_message(
self,
message: discord.Message,
text: str,
voice_override: str | None = None,
pitch: int | None = None,
speed: float | None = None,
) -> None:
"""Generate TTS and play it in the user's voice channel."""
if message.author.voice is None:
return
@@ -382,9 +599,17 @@ class TTSBot(commands.Bot):
delete_after=5
)
return
# Get user's effects if not overridden
if pitch is None or speed is None:
user_effects = self.voice_manager.get_user_effects(message.author.id)
if pitch is None:
pitch = user_effects["pitch"]
if speed is None:
speed = user_effects["speed"]
wav_bytes = await asyncio.to_thread(
self._generate_wav_bytes, voice_state, text
self._generate_wav_bytes, voice_state, text, pitch, speed
)
audio_source = discord.FFmpegPCMAudio(
@@ -409,18 +634,33 @@ class TTSBot(commands.Bot):
await play_complete.wait()
def _generate_wav_bytes(self, voice_state: Any, text: str) -> bytes:
def _generate_wav_bytes(
self,
voice_state: Any,
text: str,
pitch: int = 0,
speed: float = 1.0,
) -> bytes:
"""Generate audio and return as WAV file bytes."""
model = self.voice_manager.model
if model is None:
raise RuntimeError("Model not loaded")
audio = model.generate_audio(voice_state, text)
audio_np = audio.numpy()
if audio_np.ndim == 1:
audio_np = audio_np.reshape(-1, 1)
# Apply audio effects if any are active
if pitch != 0 or speed != 1.0:
print(f"Applying effects - Pitch: {pitch:+d}, Speed: {speed:.1f}x")
audio_np, show_processing = AudioEffects.apply_effects(
audio_np, model.sample_rate, pitch, speed
)
if show_processing:
print("⚠️ Audio processing took longer than expected due to effects")
max_val = np.max(np.abs(audio_np))
if max_val > 0:
audio_np = audio_np / max_val