feat: add audio effects (pitch and speed control)

- Added new audio_effects.py module with pitch shift and speed change
- Pitch range: -12 to +12 semitones (higher = chipmunk, lower = deeper)
- Speed range: 0.5 to 2.0x (higher = faster, lower = slower)
- Maximum 2 active effects per user (performance optimization)
- Added /effects command group:
  - /effects list - Shows current effects with descriptions
  - /effects set pitch|speed <value> - Apply effects
  - /effects reset - Confirmation UI to clear all effects
- Effects persist across restarts in preferences.json
- Updated /voice preview to support optional pitch/speed parameters
- Effects applied in _generate_wav_bytes using librosa
- Added performance warnings when processing takes >1 second
- Updated README with effects documentation
2026-01-31 15:43:29 -06:00
parent 4a2d72517f
commit 9f14e8c745
4 changed files with 527 additions and 29 deletions
--- a/bot.py
+++ b/bot.py
@@ -24,6 +24,7 @@ import scipy.io.wavfile as wavfile
 from discord import app_commands
 from discord.ext import commands

+from audio_effects import AudioEffects
 from config import Config
 from voice_manager import VoiceManager

@@ -56,8 +57,9 @@ class TTSBot(commands.Bot):
        self.voice_manager = VoiceManager(Config.VOICES_DIR, Config.DEFAULT_VOICE)
        self.message_queue: asyncio.Queue[tuple[discord.Message, str] | tuple[discord.Message, str, str]] = asyncio.Queue()
        self.last_activity: float = 0.0
-        
+
        self._setup_slash_commands()
+        self._setup_effects_commands()

    def _setup_slash_commands(self) -> None:
        """Set up slash commands for voice management."""
@@ -65,7 +67,9 @@ class TTSBot(commands.Bot):
        @self.tree.command(name="voice", description="Manage your TTS voice")
        @app_commands.describe(
            action="What to do",
-            voice_name="Name of the voice (for 'set' action)"
+            voice_name="Name of the voice (for 'set' or 'preview' action)",
+            preview_pitch="Optional pitch for preview (-12 to 12, default: use your settings)",
+            preview_speed="Optional speed for preview (0.5 to 2.0, default: use your settings)",
        )
        @app_commands.choices(action=[
            app_commands.Choice(name="list", value="list"),
@@ -77,7 +81,9 @@ class TTSBot(commands.Bot):
        async def voice_command(
            interaction: discord.Interaction,
            action: app_commands.Choice[str],
-            voice_name: str | None = None
+            voice_name: str | None = None,
+            preview_pitch: int | None = None,
+            preview_speed: float | None = None,
        ):
            if action.value == "list":
                await self._handle_voice_list(interaction)
@@ -88,7 +94,7 @@ class TTSBot(commands.Bot):
            elif action.value == "refresh":
                await self._handle_voice_refresh(interaction)
            elif action.value == "preview":
-                await self._handle_voice_preview(interaction, voice_name)
+                await self._handle_voice_preview(interaction, voice_name, preview_pitch, preview_speed)
        
        @voice_command.autocomplete("voice_name")
        async def voice_name_autocomplete(
@@ -102,6 +108,161 @@ class TTSBot(commands.Bot):
                if current.lower() in v.lower()
            ][:25]

+    def _setup_effects_commands(self) -> None:
+        """Set up slash commands for audio effects management."""
+
+        @self.tree.command(name="effects", description="Manage your TTS audio effects")
+        @app_commands.describe(
+            action="What to do",
+            effect_name="Name of the effect (for 'set' action)",
+            value="Value for the effect (for 'set' action)"
+        )
+        @app_commands.choices(action=[
+            app_commands.Choice(name="list", value="list"),
+            app_commands.Choice(name="set", value="set"),
+            app_commands.Choice(name="reset", value="reset"),
+        ])
+        @app_commands.choices(effect_name=[
+            app_commands.Choice(name="pitch", value="pitch"),
+            app_commands.Choice(name="speed", value="speed"),
+        ])
+        async def effects_command(
+            interaction: discord.Interaction,
+            action: app_commands.Choice[str],
+            effect_name: app_commands.Choice[str] | None = None,
+            value: str | None = None
+        ):
+            if action.value == "list":
+                await self._handle_effects_list(interaction)
+            elif action.value == "set":
+                await self._handle_effects_set(interaction, effect_name, value)
+            elif action.value == "reset":
+                await self._handle_effects_reset(interaction)
+
+    async def _handle_effects_list(self, interaction: discord.Interaction) -> None:
+        """Handle /effects list command."""
+        effects = self.voice_manager.get_user_effects(interaction.user.id)
+        active_count = self.voice_manager.count_active_effects(interaction.user.id)
+
+        lines = ["**Your Audio Effects:**\n"]
+
+        # Pitch
+        pitch_desc = AudioEffects.get_effect_description("pitch")
+        pitch_val = AudioEffects.format_effect_value("pitch", effects["pitch"])
+        lines.append(f"🎵 **Pitch**: {pitch_val}")
+        lines.append(f"   {pitch_desc}\n")
+
+        # Speed
+        speed_desc = AudioEffects.get_effect_description("speed")
+        speed_val = AudioEffects.format_effect_value("speed", effects["speed"])
+        lines.append(f"⚡ **Speed**: {speed_val}")
+        lines.append(f"   {speed_desc}\n")
+
+        # Active count warning
+        lines.append(f"**Active Effects**: {active_count}/{AudioEffects.MAX_ACTIVE_EFFECTS}")
+        if active_count >= AudioEffects.MAX_ACTIVE_EFFECTS:
+            lines.append("⚠️ Max effects reached. More effects = slower processing time.")
+        elif active_count > 0:
+            lines.append(f"ℹ️ You can add {AudioEffects.MAX_ACTIVE_EFFECTS - active_count} more effect(s).")
+
+        lines.append(f"\n*Use `/effects set <effect> <value>` to change settings*")
+        lines.append(f"*Use `/effects reset` to clear all effects*")
+
+        await interaction.response.send_message(
+            "\n".join(lines),
+            ephemeral=True
+        )
+
+    async def _handle_effects_set(
+        self,
+        interaction: discord.Interaction,
+        effect_name: app_commands.Choice[str] | None,
+        value: str | None
+    ) -> None:
+        """Handle /effects set command."""
+        if not effect_name or value is None:
+            await interaction.response.send_message(
+                "❌ Please provide both effect name and value. Example: `/effects set pitch 3`",
+                ephemeral=True
+            )
+            return
+
+        success, message = self.voice_manager.set_user_effect(
+            interaction.user.id,
+            effect_name.value,
+            value
+        )
+
+        if success:
+            await interaction.response.send_message(
+                f"✅ {message}",
+                ephemeral=True
+            )
+        else:
+            await interaction.response.send_message(
+                f"❌ {message}",
+                ephemeral=True
+            )
+
+    async def _handle_effects_reset(self, interaction: discord.Interaction) -> None:
+        """Handle /effects reset command with confirmation UI."""
+        # Check if user has any effects to reset
+        active_count = self.voice_manager.count_active_effects(interaction.user.id)
+
+        if active_count == 0:
+            await interaction.response.send_message(
+                "ℹ️ You don't have any active effects to reset.",
+                ephemeral=True
+            )
+            return
+
+        # Create confirmation buttons
+        class ConfirmResetView(discord.ui.View):
+            def __init__(self, voice_manager, user_id):
+                super().__init__(timeout=30)
+                self.voice_manager = voice_manager
+                self.user_id = user_id
+                self.confirmed = False
+
+            @discord.ui.button(label="✅ Yes, Reset All", style=discord.ButtonStyle.danger)
+            async def confirm_button(self, interaction: discord.Interaction, button: discord.ui.Button):
+                if interaction.user.id != self.user_id:
+                    await interaction.response.send_message("This button is not for you!", ephemeral=True)
+                    return
+
+                self.voice_manager.reset_user_effects(self.user_id)
+                self.confirmed = True
+                await interaction.response.edit_message(
+                    content="✅ All audio effects have been reset to defaults!",
+                    view=None
+                )
+                self.stop()
+
+            @discord.ui.button(label="❌ Cancel", style=discord.ButtonStyle.secondary)
+            async def cancel_button(self, interaction: discord.Interaction, button: discord.ui.Button):
+                if interaction.user.id != self.user_id:
+                    await interaction.response.send_message("This button is not for you!", ephemeral=True)
+                    return
+
+                await interaction.response.edit_message(
+                    content="❌ Reset cancelled. Your effects remain unchanged.",
+                    view=None
+                )
+                self.stop()
+
+        view = ConfirmResetView(self.voice_manager, interaction.user.id)
+
+        await interaction.response.send_message(
+            f"⚠️ **Reset Confirmation**\n\n"
+            f"You have {active_count} active effect(s).\n"
+            f"This will reset **all** your audio effects to defaults:\n"
+            f"• Pitch: 0 (normal)\n"
+            f"• Speed: 1.0x (normal)\n\n"
+            f"Are you sure you want to continue?",
+            view=view,
+            ephemeral=True
+        )
+
    async def _handle_voice_list(self, interaction: discord.Interaction) -> None:
        """Handle /voice list command."""
        voices = self.voice_manager.get_available_voices()
@@ -222,7 +383,13 @@ class TTSBot(commands.Bot):
            ephemeral=True
        )

-    async def _handle_voice_preview(self, interaction: discord.Interaction, voice_name: str | None) -> None:
+    async def _handle_voice_preview(
+        self,
+        interaction: discord.Interaction,
+        voice_name: str | None,
+        preview_pitch: int | None = None,
+        preview_speed: float | None = None,
+    ) -> None:
        """Handle /voice preview command."""
        if not voice_name:
            await interaction.response.send_message(
@@ -230,7 +397,7 @@ class TTSBot(commands.Bot):
                ephemeral=True
            )
            return
-        
+
        # Check if user is in a voice channel
        if interaction.user.voice is None:
            await interaction.response.send_message(
@@ -238,9 +405,9 @@ class TTSBot(commands.Bot):
                ephemeral=True
            )
            return
-        
+
        voice_name = voice_name.lower()
-        
+
        # Validate voice exists
        if not self.voice_manager.is_voice_available(voice_name):
            voices = self.voice_manager.get_available_voices()
@@ -250,35 +417,69 @@ class TTSBot(commands.Bot):
                ephemeral=True
            )
            return
-        
+
+        # Validate pitch if provided
+        if preview_pitch is not None:
+            is_valid, error_msg = AudioEffects.validate_effect("pitch", preview_pitch)
+            if not is_valid:
+                await interaction.response.send_message(
+                    f"❌ Invalid pitch value: {error_msg}",
+                    ephemeral=True
+                )
+                return
+
+        # Validate speed if provided
+        if preview_speed is not None:
+            is_valid, error_msg = AudioEffects.validate_effect("speed", preview_speed)
+            if not is_valid:
+                await interaction.response.send_message(
+                    f"❌ Invalid speed value: {error_msg}",
+                    ephemeral=True
+                )
+                return
+
        # Select a random preview line
        preview_text = random.choice(PREVIEW_LINES)
-        
+
        # Create a preview message object with all necessary attributes
        class PreviewMessage:
            def __init__(self, user, channel, voice_channel):
                self.author = user
                self.channel = channel
                self._voice_channel = voice_channel
-            
+
            @property
            def voice(self):
                class VoiceState:
                    def __init__(self, channel):
                        self.channel = channel
                return VoiceState(self._voice_channel)
-        
+
        preview_message = PreviewMessage(
            interaction.user,
            interaction.channel,
            interaction.user.voice.channel
        )
-        
-        # Queue the preview with voice override
-        await self.message_queue.put((preview_message, preview_text, voice_name))
-        
+
+        # Use user's current effects if not overridden
+        user_effects = self.voice_manager.get_user_effects(interaction.user.id)
+        final_pitch = preview_pitch if preview_pitch is not None else user_effects["pitch"]
+        final_speed = preview_speed if preview_speed is not None else user_effects["speed"]
+
+        # Queue the preview with voice override and effects
+        await self.message_queue.put((preview_message, preview_text, voice_name, final_pitch, final_speed))
+
+        # Build effect description
+        effect_desc = []
+        if final_pitch != 0:
+            effect_desc.append(f"pitch: {final_pitch:+d}")
+        if final_speed != 1.0:
+            effect_desc.append(f"speed: {final_speed:.1f}x")
+
+        effect_str = f" (with {', '.join(effect_desc)})" if effect_desc else ""
+
        await interaction.response.send_message(
-            f"⏳ Queued preview for `{voice_name}`. Sample: \"{preview_text[:50]}{'...' if len(preview_text) > 50 else ''}\"",
+            f"⏳ Queued preview for `{voice_name}`{effect_str}. Sample: \"{preview_text[:50]}{'...' if len(preview_text) > 50 else ''}\"",
            ephemeral=True
        )

@@ -335,22 +536,38 @@ class TTSBot(commands.Bot):
        """Process messages from the queue one at a time."""
        while True:
            queue_item = await self.message_queue.get()
-            
-            # Handle both regular messages (message, text) and previews (message, text, voice_name)
-            if len(queue_item) == 3:
+
+            # Handle queue items of different lengths:
+            # - (message, text) - regular message
+            # - (message, text, voice_name) - preview with voice override
+            # - (message, text, voice_name, pitch, speed) - preview with effects
+            if len(queue_item) == 5:
+                message, text, voice_override, pitch, speed = queue_item
+            elif len(queue_item) == 3:
                message, text, voice_override = queue_item
+                pitch = None
+                speed = None
            else:
                message, text = queue_item
                voice_override = None
+                pitch = None
+                speed = None

            try:
-                await self.speak_message(message, text, voice_override)
+                await self.speak_message(message, text, voice_override, pitch, speed)
            except Exception as e:
                print(f"Error processing message: {e}")
            finally:
                self.message_queue.task_done()

-    async def speak_message(self, message: discord.Message, text: str, voice_override: str | None = None) -> None:
+    async def speak_message(
+        self,
+        message: discord.Message,
+        text: str,
+        voice_override: str | None = None,
+        pitch: int | None = None,
+        speed: float | None = None,
+    ) -> None:
        """Generate TTS and play it in the user's voice channel."""
        if message.author.voice is None:
            return
@@ -382,9 +599,17 @@ class TTSBot(commands.Bot):
                    delete_after=5
                )
            return
-        
+
+        # Get user's effects if not overridden
+        if pitch is None or speed is None:
+            user_effects = self.voice_manager.get_user_effects(message.author.id)
+            if pitch is None:
+                pitch = user_effects["pitch"]
+            if speed is None:
+                speed = user_effects["speed"]
+
        wav_bytes = await asyncio.to_thread(
-            self._generate_wav_bytes, voice_state, text
+            self._generate_wav_bytes, voice_state, text, pitch, speed
        )

        audio_source = discord.FFmpegPCMAudio(
@@ -409,18 +634,33 @@ class TTSBot(commands.Bot):

        await play_complete.wait()

-    def _generate_wav_bytes(self, voice_state: Any, text: str) -> bytes:
+    def _generate_wav_bytes(
+        self,
+        voice_state: Any,
+        text: str,
+        pitch: int = 0,
+        speed: float = 1.0,
+    ) -> bytes:
        """Generate audio and return as WAV file bytes."""
        model = self.voice_manager.model
        if model is None:
            raise RuntimeError("Model not loaded")
-        
+
        audio = model.generate_audio(voice_state, text)
        audio_np = audio.numpy()

        if audio_np.ndim == 1:
            audio_np = audio_np.reshape(-1, 1)

+        # Apply audio effects if any are active
+        if pitch != 0 or speed != 1.0:
+            print(f"Applying effects - Pitch: {pitch:+d}, Speed: {speed:.1f}x")
+            audio_np, show_processing = AudioEffects.apply_effects(
+                audio_np, model.sample_rate, pitch, speed
+            )
+            if show_processing:
+                print("⚠️ Audio processing took longer than expected due to effects")
+
        max_val = np.max(np.abs(audio_np))
        if max_val > 0:
            audio_np = audio_np / max_val