feat: add 4 new voice effects (echo, robot, chorus, tremolo)

- Removed MAX_ACTIVE_EFFECTS limit (effects unlimited)
- Added echo effect (0-100%): spatial delay/reverb
- Added robot effect (0-100%): ring modulation voice
- Added chorus effect (0-100%): multiple voices effect
- Added tremolo depth (0.0-1.0) and rate (0.0-10.0 Hz): amplitude modulation
- Effects apply in order: pitch → speed → echo → chorus → tremolo → robot
- Updated /effects command with all 7 effect choices
- Updated /effects list to display all 7 effects with emojis
- Updated warning system: warns when > 2 effects are active
- Added validation and formatting for all new effects
- Updated voice_manager.py to handle storage/loading of all 7 effects

Note: Cancel button for processing >10s not yet implemented
Note: Queue system needs updating to handle all effect parameters
2026-01-31 17:10:19 -06:00
parent 8d4ac59f73
commit 795d5087e9
3 changed files with 306 additions and 49 deletions
--- a/audio_effects.py
+++ b/audio_effects.py
@@ -10,9 +10,10 @@ import numpy as np
 class AudioEffects:
    """Apply post-processing effects to TTS audio."""

-    MAX_ACTIVE_EFFECTS = 2
+    # No limit on effects, but warnings shown when > 2 active
+    MAX_ACTIVE_EFFECTS = None

-    # Effect ranges
+    # Effect ranges and defaults
    PITCH_MIN = -12
    PITCH_MAX = 12
    PITCH_DEFAULT = 0
@@ -21,6 +22,26 @@ class AudioEffects:
    SPEED_MAX = 2.0
    SPEED_DEFAULT = 1.0

+    ECHO_MIN = 0
+    ECHO_MAX = 100
+    ECHO_DEFAULT = 0
+
+    ROBOT_MIN = 0
+    ROBOT_MAX = 100
+    ROBOT_DEFAULT = 0
+
+    CHORUS_MIN = 0
+    CHORUS_MAX = 100
+    CHORUS_DEFAULT = 0
+
+    TREMOLO_DEPTH_MIN = 0.0
+    TREMOLO_DEPTH_MAX = 1.0
+    TREMOLO_DEPTH_DEFAULT = 0.0
+
+    TREMOLO_RATE_MIN = 0.0
+    TREMOLO_RATE_MAX = 10.0
+    TREMOLO_RATE_DEFAULT = 0.0
+
    @classmethod
    def apply_effects(
        cls,
@@ -28,15 +49,25 @@ class AudioEffects:
        sr: int,
        pitch: int = PITCH_DEFAULT,
        speed: float = SPEED_DEFAULT,
+        echo: int = ECHO_DEFAULT,
+        robot: int = ROBOT_DEFAULT,
+        chorus: int = CHORUS_DEFAULT,
+        tremolo_depth: float = TREMOLO_DEPTH_DEFAULT,
+        tremolo_rate: float = TREMOLO_RATE_DEFAULT,
    ) -> tuple[np.ndarray, bool]:
        """
-        Apply effects to audio.
+        Apply effects to audio in order: pitch → speed → echo → chorus → tremolo → robot

        Args:
-            audio: Input audio array
+            audio: Input audio array (1D)
            sr: Sample rate
            pitch: Pitch shift in semitones (-12 to +12, 0 = no shift)
            speed: Speed multiplier (0.5 to 2.0, 1.0 = normal)
+            echo: Echo intensity (0-100, 0 = no echo)
+            robot: Robot voice intensity (0-100, 0 = no robot)
+            chorus: Chorus intensity (0-100, 0 = no chorus)
+            tremolo_depth: Tremolo depth (0.0-1.0, 0.0 = no tremolo)
+            tremolo_rate: Tremolo rate in Hz (0.0-10.0, 0.0 = no tremolo)

        Returns:
            Tuple of (processed_audio, show_processing_message)
@@ -48,24 +79,43 @@ class AudioEffects:
        # Validate inputs
        pitch = max(cls.PITCH_MIN, min(cls.PITCH_MAX, pitch))
        speed = max(cls.SPEED_MIN, min(cls.SPEED_MAX, speed))
+        echo = max(cls.ECHO_MIN, min(cls.ECHO_MAX, echo))
+        robot = max(cls.ROBOT_MIN, min(cls.ROBOT_MAX, robot))
+        chorus = max(cls.CHORUS_MIN, min(cls.CHORUS_MAX, chorus))
+        tremolo_depth = max(cls.TREMOLO_DEPTH_MIN, min(cls.TREMOLO_DEPTH_MAX, tremolo_depth))
+        tremolo_rate = max(cls.TREMOLO_RATE_MIN, min(cls.TREMOLO_RATE_MAX, tremolo_rate))

-        print(f"Applying effects - Pitch: {pitch:+d}, Speed: {speed:.1f}x")
-
-        # Apply pitch shift first (if not default)
+        # Apply pitch shift first
        if pitch != cls.PITCH_DEFAULT:
            print(f"  Applying pitch shift: {pitch:+d} semitones...")
            audio = librosa.effects.pitch_shift(
                audio, sr=sr, n_steps=pitch, bins_per_octave=12
            )

-        # Apply speed change second (if not default)
+        # Apply speed change second
        if speed != cls.SPEED_DEFAULT:
            print(f"  Applying speed change: {speed:.1f}x...")
            audio = librosa.effects.time_stretch(audio, rate=speed)

-            # Stretching changes length, so we need to resample to maintain duration
-            # Actually, for TTS we want the new speed, so we don't resample back
-            # The audio will be shorter or longer based on speed
+        # Apply echo third
+        if echo > 0:
+            print(f"  Applying echo: {echo}%...")
+            audio = cls._apply_echo(audio, sr, echo)
+
+        # Apply chorus fourth
+        if chorus > 0:
+            print(f"  Applying chorus: {chorus}%...")
+            audio = cls._apply_chorus(audio, sr, chorus)
+
+        # Apply tremolo fifth
+        if tremolo_depth > 0 and tremolo_rate > 0:
+            print(f"  Applying tremolo: depth={tremolo_depth:.1f}, rate={tremolo_rate:.1f}Hz...")
+            audio = cls._apply_tremolo(audio, sr, tremolo_depth, tremolo_rate)
+
+        # Apply robot voice last
+        if robot > 0:
+            print(f"  Applying robot effect: {robot}%...")
+            audio = cls._apply_robot(audio, sr, robot)

        processing_time = time.time() - start_time
        print(f"  Effects applied in {processing_time:.2f}s")
@@ -75,6 +125,110 @@ class AudioEffects:

        return audio, show_message

+    @classmethod
+    def _apply_echo(cls, audio: np.ndarray, sr: int, intensity: int) -> np.ndarray:
+        """Apply a simple echo: mix in one delayed, attenuated copy of the signal."""
+        if intensity == 0:
+            return audio
+
+        # Calculate delay in samples (50-300ms based on intensity)
+        delay_ms = 50 + (intensity / 100) * 250
+        delay_samples = int((delay_ms / 1000) * sr)
+
+        # Create output array
+        output = np.copy(audio)
+
+        # Add delayed copy with decay
+        decay = 0.3 + (intensity / 100) * 0.4  # 0.3-0.7 decay factor
+        if delay_samples < len(audio):
+            output[delay_samples:] += audio[:-delay_samples] * decay
+
+        # Normalize
+        max_val = np.max(np.abs(output))
+        if max_val > 0:
+            output = output / max_val * np.max(np.abs(audio))
+
+        return output
+
+    @classmethod
+    def _apply_chorus(cls, audio: np.ndarray, sr: int, intensity: int) -> np.ndarray:
+        """Apply chorus effect using multiple delayed voices."""
+        if intensity == 0:
+            return audio
+
+        # Number of voices based on intensity (1-3)
+        num_voices = 1 + int((intensity / 100) * 2)
+
+        # Base delay (15-30ms)
+        base_delay_ms = 15 + (intensity / 100) * 15
+        base_delay_samples = int((base_delay_ms / 1000) * sr)
+
+        output = np.copy(audio) * 0.6  # Reduce original to make room for voices
+
+        for i in range(num_voices):
+            # Per-voice stretch rate (0.97, 1.00, 1.03 for i = 0, 1, 2); time_stretch changes duration, not pitch
+            pitch_var = 1.0 + (0.03 * (i - 1))
+            try:
+                voice = librosa.effects.time_stretch(audio, rate=pitch_var)
+
+                # Slight delay variation
+                delay_samples = base_delay_samples + int((i * 5 / 1000) * sr)
+
+                # Mix voice into output
+                voice_len = min(len(voice), len(output) - delay_samples)
+                if voice_len > 0:
+                    output[delay_samples:delay_samples + voice_len] += voice[:voice_len] * 0.2
+            except Exception as e:
+                print(f"    Warning: Chorus voice {i+1} failed: {e}")
+
+        # Normalize
+        max_val = np.max(np.abs(output))
+        if max_val > 0:
+            output = output / max_val * 0.95
+
+        return output
+
+    @classmethod
+    def _apply_tremolo(cls, audio: np.ndarray, sr: int, depth: float, rate: float) -> np.ndarray:
+        """Apply tremolo effect (amplitude modulation)."""
+        if depth == 0 or rate == 0:
+            return audio
+
+        # Create modulation signal
+        duration = len(audio) / sr
+        t = np.linspace(0, duration, len(audio))
+
+        # Sine wave modulation at specified rate
+        modulation = 1.0 - depth * 0.5 * (1 - np.sin(2 * np.pi * rate * t))
+
+        return audio * modulation
+
+    @classmethod
+    def _apply_robot(cls, audio: np.ndarray, sr: int, intensity: int) -> np.ndarray:
+        """Apply robot voice effect using ring modulation."""
+        if intensity == 0:
+            return audio
+
+        # Carrier frequency based on intensity (80-300 Hz)
+        carrier_freq = 80 + (intensity / 100) * 220
+
+        # Create carrier signal
+        duration = len(audio) / sr
+        t = np.linspace(0, duration, len(audio))
+        carrier = np.sin(2 * np.pi * carrier_freq * t)
+
+        # Mix original with ring-modulated version based on intensity
+        mix = intensity / 100
+        robot_signal = audio * carrier
+        output = audio * (1 - mix * 0.7) + robot_signal * mix * 0.7
+
+        # Normalize
+        max_val = np.max(np.abs(output))
+        if max_val > 0:
+            output = output / max_val * 0.95
+
+        return output
+
    @classmethod
    def validate_effect(cls, effect_name: str, value: Any) -> tuple[bool, str]:
        """
@@ -83,40 +237,47 @@ class AudioEffects:
        Returns:
            Tuple of (is_valid, error_message)
        """
-        if effect_name == "pitch":
-            try:
-                pitch = int(value)
-                if cls.PITCH_MIN <= pitch <= cls.PITCH_MAX:
-                    return True, ""
-                return (
-                    False,
-                    f"Pitch must be between {cls.PITCH_MIN} and {cls.PITCH_MAX} semitones",
-                )
-            except (ValueError, TypeError):
-                return False, "Pitch must be a whole number"
+        validators = {
+            "pitch": (int, cls.PITCH_MIN, cls.PITCH_MAX, "Pitch must be a whole number", "semitones"),
+            "speed": (float, cls.SPEED_MIN, cls.SPEED_MAX, "Speed must be a number", "x"),
+            "echo": (int, cls.ECHO_MIN, cls.ECHO_MAX, "Echo must be a whole number", "%"),
+            "robot": (int, cls.ROBOT_MIN, cls.ROBOT_MAX, "Robot must be a whole number", "%"),
+            "chorus": (int, cls.CHORUS_MIN, cls.CHORUS_MAX, "Chorus must be a whole number", "%"),
+            "tremolo_depth": (float, cls.TREMOLO_DEPTH_MIN, cls.TREMOLO_DEPTH_MAX, "Tremolo depth must be a number", ""),
+            "tremolo_rate": (float, cls.TREMOLO_RATE_MIN, cls.TREMOLO_RATE_MAX, "Tremolo rate must be a number", "Hz"),
+        }

-        elif effect_name == "speed":
-            try:
-                speed = float(value)
-                if cls.SPEED_MIN <= speed <= cls.SPEED_MAX:
-                    return True, ""
-                return (
-                    False,
-                    f"Speed must be between {cls.SPEED_MIN} and {cls.SPEED_MAX}",
-                )
-            except (ValueError, TypeError):
-                return False, "Speed must be a number"
+        if effect_name not in validators:
+            return False, f"Unknown effect: {effect_name}"

-        return False, f"Unknown effect: {effect_name}"
+        type_func, min_val, max_val, error_msg, unit = validators[effect_name]
+
+        try:
+            val = type_func(value)
+            if min_val <= val <= max_val:
+                return True, ""
+            unit_str = f" {unit}" if unit else ""
+            return False, f"{effect_name.replace('_', ' ').title()} must be between {min_val} and {max_val}{unit_str}"
+        except (ValueError, TypeError):
+            return False, error_msg

    @classmethod
-    def count_active_effects(cls, pitch: int, speed: float) -> int:
+    def count_active_effects(cls, **effects) -> int:
        """Count how many effects are active (non-default)."""
        count = 0
-        if pitch != cls.PITCH_DEFAULT:
+        if effects.get("pitch", cls.PITCH_DEFAULT) != cls.PITCH_DEFAULT:
            count += 1
-        if speed != cls.SPEED_DEFAULT:
+        if effects.get("speed", cls.SPEED_DEFAULT) != cls.SPEED_DEFAULT:
            count += 1
+        if effects.get("echo", cls.ECHO_DEFAULT) > cls.ECHO_DEFAULT:
+            count += 1
+        if effects.get("robot", cls.ROBOT_DEFAULT) > cls.ROBOT_DEFAULT:
+            count += 1
+        if effects.get("chorus", cls.CHORUS_DEFAULT) > cls.CHORUS_DEFAULT:
+            count += 1
+        if effects.get("tremolo_depth", cls.TREMOLO_DEPTH_DEFAULT) > cls.TREMOLO_DEPTH_DEFAULT:
+            count += 1
+        # tremolo_rate is never counted on its own: tremolo counts once, via tremolo_depth (even if rate is 0)
        return count

    @classmethod
@@ -125,6 +286,11 @@ class AudioEffects:
        descriptions = {
            "pitch": f"Changes voice pitch ({cls.PITCH_MIN} to {cls.PITCH_MAX} semitones). Positive = higher/chipmunk, Negative = lower/deeper.",
            "speed": f"Changes speech speed ({cls.SPEED_MIN} to {cls.SPEED_MAX}x). Higher = faster, Lower = slower.",
+            "echo": f"Adds echo/reverb ({cls.ECHO_MIN} to {cls.ECHO_MAX}%). Higher = more pronounced echo.",
+            "robot": f"Applies robot voice effect ({cls.ROBOT_MIN} to {cls.ROBOT_MAX}%). Higher = more robotic.",
+            "chorus": f"Adds chorus effect ({cls.CHORUS_MIN} to {cls.CHORUS_MAX}%). Higher = more voices/depth.",
+            "tremolo_depth": f"Tremolo amplitude modulation ({cls.TREMOLO_DEPTH_MIN} to {cls.TREMOLO_DEPTH_MAX}). Higher = more warble.",
+            "tremolo_rate": f"Tremolo speed ({cls.TREMOLO_RATE_MIN} to {cls.TREMOLO_RATE_MAX} Hz). Higher = faster warble.",
        }
        return descriptions.get(effect_name, "Unknown effect")

@@ -143,4 +309,29 @@ class AudioEffects:
                return "1.0x (normal)"
            direction = "faster" if speed > 1.0 else "slower"
            return f"{speed:.1f}x ({direction})"
+        elif effect_name == "echo":
+            echo = int(value)
+            if echo == 0:
+                return "0% (off)"
+            return f"{echo}%"
+        elif effect_name == "robot":
+            robot = int(value)
+            if robot == 0:
+                return "0% (off)"
+            return f"{robot}%"
+        elif effect_name == "chorus":
+            chorus = int(value)
+            if chorus == 0:
+                return "0% (off)"
+            return f"{chorus}%"
+        elif effect_name == "tremolo_depth":
+            depth = float(value)
+            if depth == 0.0:
+                return "0.0 (off)"
+            return f"{depth:.1f}"
+        elif effect_name == "tremolo_rate":
+            rate = float(value)
+            if rate == 0.0:
+                return "0.0 Hz (off)"
+            return f"{rate:.1f} Hz"
        return str(value)