"""Audio effects processing for TTS output.""" import time from typing import Any import librosa import numpy as np class AudioEffects: """Apply post-processing effects to TTS audio.""" # No limit on effects, but warnings shown when > 2 active MAX_ACTIVE_EFFECTS = None # Effect ranges and defaults PITCH_MIN = -12 PITCH_MAX = 12 PITCH_DEFAULT = 0 SPEED_MIN = 0.5 SPEED_MAX = 2.0 SPEED_DEFAULT = 1.0 ECHO_MIN = 0 ECHO_MAX = 100 ECHO_DEFAULT = 0 ROBOT_MIN = 0 ROBOT_MAX = 100 ROBOT_DEFAULT = 0 CHORUS_MIN = 0 CHORUS_MAX = 100 CHORUS_DEFAULT = 0 TREMOLO_DEPTH_MIN = 0.0 TREMOLO_DEPTH_MAX = 1.0 TREMOLO_DEPTH_DEFAULT = 0.0 TREMOLO_RATE_MIN = 0.0 TREMOLO_RATE_MAX = 10.0 TREMOLO_RATE_DEFAULT = 0.0 @classmethod def apply_effects( cls, audio: np.ndarray, sr: int, pitch: int = PITCH_DEFAULT, speed: float = SPEED_DEFAULT, echo: int = ECHO_DEFAULT, robot: int = ROBOT_DEFAULT, chorus: int = CHORUS_DEFAULT, tremolo_depth: float = TREMOLO_DEPTH_DEFAULT, tremolo_rate: float = TREMOLO_RATE_DEFAULT, ) -> tuple[np.ndarray, bool]: """ Apply effects to audio in order: pitch → speed → echo → chorus → tremolo → robot Args: audio: Input audio array (1D) sr: Sample rate pitch: Pitch shift in semitones (-12 to +12, 0 = no shift) speed: Speed multiplier (0.5 to 2.0, 1.0 = normal) echo: Echo intensity (0-100, 0 = no echo) robot: Robot voice intensity (0-100, 0 = no robot) chorus: Chorus intensity (0-100, 0 = no chorus) tremolo_depth: Tremolo depth (0.0-1.0, 0.0 = no tremolo) tremolo_rate: Tremolo rate in Hz (0.0-10.0) Returns: Tuple of (processed_audio, show_processing_message) show_processing_message is True if processing took > 1 second """ start_time = time.time() original_length = len(audio) # Validate inputs pitch = max(cls.PITCH_MIN, min(cls.PITCH_MAX, pitch)) speed = max(cls.SPEED_MIN, min(cls.SPEED_MAX, speed)) echo = max(cls.ECHO_MIN, min(cls.ECHO_MAX, echo)) robot = max(cls.ROBOT_MIN, min(cls.ROBOT_MAX, robot)) chorus = max(cls.CHORUS_MIN, min(cls.CHORUS_MAX, chorus)) tremolo_depth = max(cls.TREMOLO_DEPTH_MIN, min(cls.TREMOLO_DEPTH_MAX, tremolo_depth)) tremolo_rate = max(cls.TREMOLO_RATE_MIN, min(cls.TREMOLO_RATE_MAX, tremolo_rate)) # Apply pitch shift first if pitch != cls.PITCH_DEFAULT: print(f" Applying pitch shift: {pitch:+d} semitones...") audio = librosa.effects.pitch_shift( audio, sr=sr, n_steps=pitch, bins_per_octave=12 ) # Apply speed change second if speed != cls.SPEED_DEFAULT: print(f" Applying speed change: {speed:.1f}x...") audio = librosa.effects.time_stretch(audio, rate=speed) # Apply echo third if echo > 0: print(f" Applying echo: {echo}%...") audio = cls._apply_echo(audio, sr, echo) # Apply chorus fourth if chorus > 0: print(f" Applying chorus: {chorus}%...") audio = cls._apply_chorus(audio, sr, chorus) # Apply tremolo fifth if tremolo_depth > 0 and tremolo_rate > 0: print(f" Applying tremolo: depth={tremolo_depth:.1f}, rate={tremolo_rate:.1f}Hz...") audio = cls._apply_tremolo(audio, sr, tremolo_depth, tremolo_rate) # Apply robot voice last if robot > 0: print(f" Applying robot effect: {robot}%...") audio = cls._apply_robot(audio, sr, robot) processing_time = time.time() - start_time print(f" Effects applied in {processing_time:.2f}s") # Show processing message if it took more than 1 second show_message = processing_time > 1.0 return audio, show_message @classmethod def _apply_echo(cls, audio: np.ndarray, sr: int, intensity: int) -> np.ndarray: """Apply simple echo/reverb effect.""" if intensity == 0: return audio # Calculate delay in samples (50-300ms based on intensity) delay_ms = 50 + (intensity / 100) * 250 delay_samples = int((delay_ms / 1000) * sr) # Create output array output = np.copy(audio) # Add delayed copy with decay decay = 0.3 + (intensity / 100) * 0.4 # 0.3-0.7 decay factor if delay_samples < len(audio): output[delay_samples:] += audio[:-delay_samples] * decay # Normalize max_val = np.max(np.abs(output)) if max_val > 0: output = output / max_val * np.max(np.abs(audio)) return output @classmethod def _apply_chorus(cls, audio: np.ndarray, sr: int, intensity: int) -> np.ndarray: """Apply chorus effect using multiple delayed voices.""" if intensity == 0: return audio # Number of voices based on intensity (1-3) num_voices = 1 + int((intensity / 100) * 2) # Base delay (15-30ms) base_delay_ms = 15 + (intensity / 100) * 15 base_delay_samples = int((base_delay_ms / 1000) * sr) output = np.copy(audio) * 0.6 # Reduce original to make room for voices for i in range(num_voices): # Slight pitch variation for each voice (±3%) pitch_var = 1.0 + (0.03 * (i - 1)) try: voice = librosa.effects.time_stretch(audio, rate=pitch_var) # Slight delay variation delay_samples = base_delay_samples + int((i * 5 / 1000) * sr) # Mix voice into output voice_len = min(len(voice), len(output) - delay_samples) if voice_len > 0: output[delay_samples:delay_samples + voice_len] += voice[:voice_len] * 0.2 except Exception as e: print(f" Warning: Chorus voice {i+1} failed: {e}") # Normalize max_val = np.max(np.abs(output)) if max_val > 0: output = output / max_val * 0.95 return output @classmethod def _apply_tremolo(cls, audio: np.ndarray, sr: int, depth: float, rate: float) -> np.ndarray: """Apply tremolo effect (amplitude modulation).""" if depth == 0 or rate == 0: return audio # Create modulation signal duration = len(audio) / sr t = np.linspace(0, duration, len(audio)) # Sine wave modulation at specified rate modulation = 1.0 - depth * 0.5 * (1 - np.sin(2 * np.pi * rate * t)) return audio * modulation @classmethod def _apply_robot(cls, audio: np.ndarray, sr: int, intensity: int) -> np.ndarray: """Apply robot voice effect using ring modulation.""" if intensity == 0: return audio # Carrier frequency based on intensity (80-300 Hz) carrier_freq = 80 + (intensity / 100) * 220 # Create carrier signal duration = len(audio) / sr t = np.linspace(0, duration, len(audio)) carrier = np.sin(2 * np.pi * carrier_freq * t) # Mix original with ring-modulated version based on intensity mix = intensity / 100 robot_signal = audio * carrier output = audio * (1 - mix * 0.7) + robot_signal * mix * 0.7 # Normalize max_val = np.max(np.abs(output)) if max_val > 0: output = output / max_val * 0.95 return output @classmethod def validate_effect(cls, effect_name: str, value: Any) -> tuple[bool, str]: """ Validate an effect value. Returns: Tuple of (is_valid, error_message) """ validators = { "pitch": (int, cls.PITCH_MIN, cls.PITCH_MAX, "Pitch must be a whole number", "semitones"), "speed": (float, cls.SPEED_MIN, cls.SPEED_MAX, "Speed must be a number", "x"), "echo": (int, cls.ECHO_MIN, cls.ECHO_MAX, "Echo must be a whole number", "%"), "robot": (int, cls.ROBOT_MIN, cls.ROBOT_MAX, "Robot must be a whole number", "%"), "chorus": (int, cls.CHORUS_MIN, cls.CHORUS_MAX, "Chorus must be a whole number", "%"), "tremolo_depth": (float, cls.TREMOLO_DEPTH_MIN, cls.TREMOLO_DEPTH_MAX, "Tremolo depth must be a number", ""), "tremolo_rate": (float, cls.TREMOLO_RATE_MIN, cls.TREMOLO_RATE_MAX, "Tremolo rate must be a number", "Hz"), } if effect_name not in validators: return False, f"Unknown effect: {effect_name}" type_func, min_val, max_val, error_msg, unit = validators[effect_name] try: val = type_func(value) if min_val <= val <= max_val: return True, "" unit_str = f" {unit}" if unit else "" return False, f"{effect_name.replace('_', ' ').title()} must be between {min_val} and {max_val}{unit_str}" except (ValueError, TypeError): return False, error_msg @classmethod def count_active_effects(cls, **effects) -> int: """Count how many effects are active (non-default).""" count = 0 # Convert values to proper types (JSON stores them as strings) pitch = int(effects.get("pitch", cls.PITCH_DEFAULT)) speed = float(effects.get("speed", cls.SPEED_DEFAULT)) echo = int(effects.get("echo", cls.ECHO_DEFAULT)) robot = int(effects.get("robot", cls.ROBOT_DEFAULT)) chorus = int(effects.get("chorus", cls.CHORUS_DEFAULT)) tremolo_depth = float(effects.get("tremolo_depth", cls.TREMOLO_DEPTH_DEFAULT)) if pitch != cls.PITCH_DEFAULT: count += 1 if speed != cls.SPEED_DEFAULT: count += 1 if echo > cls.ECHO_DEFAULT: count += 1 if robot > cls.ROBOT_DEFAULT: count += 1 if chorus > cls.CHORUS_DEFAULT: count += 1 if tremolo_depth > cls.TREMOLO_DEPTH_DEFAULT: count += 1 # tremolo_rate only counts if depth is also active return count @classmethod def get_effect_description(cls, effect_name: str) -> str: """Get a human-readable description of what an effect does.""" descriptions = { "pitch": f"Changes voice pitch ({cls.PITCH_MIN} to {cls.PITCH_MAX} semitones). Positive = higher/chipmunk, Negative = lower/deeper.", "speed": f"Changes speech speed ({cls.SPEED_MIN} to {cls.SPEED_MAX}x). Higher = faster, Lower = slower.", "echo": f"Adds echo/reverb ({cls.ECHO_MIN} to {cls.ECHO_MAX}%). Higher = more pronounced echo.", "robot": f"Applies robot voice effect ({cls.ROBOT_MIN} to {cls.ROBOT_MAX}%). Higher = more robotic.", "chorus": f"Adds chorus effect ({cls.CHORUS_MIN} to {cls.CHORUS_MAX}%). Higher = more voices/depth.", "tremolo_depth": f"Tremolo amplitude modulation ({cls.TREMOLO_DEPTH_MIN} to {cls.TREMOLO_DEPTH_MAX}). Higher = more warble.", "tremolo_rate": f"Tremolo speed ({cls.TREMOLO_RATE_MIN} to {cls.TREMOLO_RATE_MAX} Hz). Higher = faster warble.", } return descriptions.get(effect_name, "Unknown effect") @classmethod def format_effect_value(cls, effect_name: str, value: Any) -> str: """Format an effect value for display.""" if effect_name == "pitch": pitch = int(value) if pitch == 0: return "0 (normal)" direction = "higher" if pitch > 0 else "lower" return f"{pitch:+d} ({direction})" elif effect_name == "speed": speed = float(value) if speed == 1.0: return "1.0x (normal)" direction = "faster" if speed > 1.0 else "slower" return f"{speed:.1f}x ({direction})" elif effect_name == "echo": echo = int(value) if echo == 0: return "0% (off)" return f"{echo}%" elif effect_name == "robot": robot = int(value) if robot == 0: return "0% (off)" return f"{robot}%" elif effect_name == "chorus": chorus = int(value) if chorus == 0: return "0% (off)" return f"{chorus}%" elif effect_name == "tremolo_depth": depth = float(value) if depth == 0.0: return "0.0 (off)" return f"{depth:.1f}" elif effect_name == "tremolo_rate": rate = float(value) if rate == 0.0: return "0.0 Hz (off)" return f"{rate:.1f} Hz" return str(value)