"""Audio effects processing for TTS output."""

import time
from typing import Any

import numpy as np

try:
    import librosa
except ImportError:
    # Only AudioEffects.apply_effects calls into librosa. Keep the validation
    # and formatting helpers importable without it; apply_effects raises a
    # clear ImportError at the point where an effect actually has to run.
    librosa = None


class AudioEffects:
    """Apply post-processing effects to TTS audio.

    All members are class-level; the class is used as a namespace and is
    never instantiated by the code in this module.
    """

    # Not referenced inside this class; presumably enforced by callers
    # together with count_active_effects() -- confirm against the call sites.
    MAX_ACTIVE_EFFECTS = 2

    # Effect ranges
    PITCH_MIN = -12
    PITCH_MAX = 12
    PITCH_DEFAULT = 0

    SPEED_MIN = 0.5
    SPEED_MAX = 2.0
    SPEED_DEFAULT = 1.0

    @classmethod
    def apply_effects(
        cls,
        audio: np.ndarray,
        sr: int,
        pitch: int = PITCH_DEFAULT,
        speed: float = SPEED_DEFAULT,
    ) -> tuple[np.ndarray, bool]:
        """
        Apply effects to audio.

        Out-of-range values are clamped to the class limits rather than
        rejected. Pitch is shifted first, then speed is changed. Empty audio
        is returned unchanged because there is nothing to process.

        Args:
            audio: Input audio array
            sr: Sample rate
            pitch: Pitch shift in semitones (-12 to +12, 0 = no shift)
            speed: Speed multiplier (0.5 to 2.0, 1.0 = normal)

        Returns:
            Tuple of (processed_audio, show_processing_message)
            show_processing_message is True if processing took > 1 second

        Raises:
            ImportError: If an effect has to be applied but librosa is not
                installed.
        """
        # Clamp inputs into the supported ranges.
        pitch = max(cls.PITCH_MIN, min(cls.PITCH_MAX, pitch))
        speed = max(cls.SPEED_MIN, min(cls.SPEED_MAX, speed))

        # ":+d" raises ValueError for floats (e.g. a slider handing over 2.0),
        # so only use it for real ints and fall back to a general format.
        pitch_label = f"{pitch:+d}" if isinstance(pitch, int) else f"{pitch:+g}"
        print(f"Applying effects - Pitch: {pitch_label}, Speed: {speed:.1f}x")

        # With zero samples there is nothing to shift or stretch.
        has_samples = np.size(audio) > 0
        shift_pitch = has_samples and pitch != cls.PITCH_DEFAULT
        change_speed = has_samples and speed != cls.SPEED_DEFAULT

        if (shift_pitch or change_speed) and librosa is None:
            raise ImportError("librosa is required to apply pitch/speed effects")

        # perf_counter is monotonic, so the measured duration cannot be
        # skewed by wall-clock adjustments.
        started = time.perf_counter()

        # Apply pitch shift first (if not default)
        if shift_pitch:
            print(f" Applying pitch shift: {pitch_label} semitones...")
            audio = librosa.effects.pitch_shift(
                audio, sr=sr, n_steps=pitch, bins_per_octave=12
            )

        # Apply speed change second (if not default)
        if change_speed:
            print(f" Applying speed change: {speed:.1f}x...")
            # Stretching changes the length of the audio. That is intended:
            # for TTS we want the new speed, so the result is not resampled
            # back to the original duration.
            audio = librosa.effects.time_stretch(audio, rate=speed)

        elapsed = time.perf_counter() - started
        print(f" Effects applied in {elapsed:.2f}s")

        # Show processing message if it took more than 1 second
        return audio, elapsed > 1.0

    @classmethod
    def validate_effect(cls, effect_name: str, value: Any) -> tuple[bool, str]:
        """
        Validate an effect value.

        Pitch is converted with int() and speed with float(), so numeric
        strings are accepted and a float pitch is truncated towards zero
        before the range check.

        Args:
            effect_name: "pitch" or "speed"; anything else is reported as
                unknown.
            value: Candidate value, typically raw user input.

        Returns:
            Tuple of (is_valid, error_message); error_message is "" when the
            value is valid.
        """
        if effect_name == "pitch":
            try:
                pitch = int(value)
            except (ValueError, TypeError, OverflowError):
                # OverflowError: int(float("inf")) -- infinity is not a whole
                # number either, so report it instead of crashing.
                return False, "Pitch must be a whole number"
            if cls.PITCH_MIN <= pitch <= cls.PITCH_MAX:
                return True, ""
            return (
                False,
                f"Pitch must be between {cls.PITCH_MIN} and {cls.PITCH_MAX} semitones",
            )

        if effect_name == "speed":
            out_of_range = (
                False,
                f"Speed must be between {cls.SPEED_MIN} and {cls.SPEED_MAX}",
            )
            try:
                speed = float(value)
            except (ValueError, TypeError):
                return False, "Speed must be a number"
            except OverflowError:
                # An int too large for a float is a number, just far outside
                # the allowed range.
                return out_of_range
            if cls.SPEED_MIN <= speed <= cls.SPEED_MAX:
                return True, ""
            return out_of_range

        return False, f"Unknown effect: {effect_name}"

    @classmethod
    def count_active_effects(cls, pitch: int, speed: float) -> int:
        """Count how many effects are active (non-default)."""
        return int(pitch != cls.PITCH_DEFAULT) + int(speed != cls.SPEED_DEFAULT)

    @classmethod
    def get_effect_description(cls, effect_name: str) -> str:
        """Get a human-readable description of what an effect does.

        Returns "Unknown effect" for names other than "pitch" and "speed".
        """
        descriptions = {
            "pitch": f"Changes voice pitch ({cls.PITCH_MIN} to {cls.PITCH_MAX} semitones). Positive = higher/chipmunk, Negative = lower/deeper.",
            "speed": f"Changes speech speed ({cls.SPEED_MIN} to {cls.SPEED_MAX}x). Higher = faster, Lower = slower.",
        }
        return descriptions.get(effect_name, "Unknown effect")

    @classmethod
    def format_effect_value(cls, effect_name: str, value: Any) -> str:
        """Format an effect value for display.

        Pitch is converted with int() and speed with float(); a value that
        cannot be converted raises the underlying conversion error. Values
        for any other effect name are returned via str().
        """
        if effect_name == "pitch":
            pitch = int(value)
            if pitch == 0:
                return "0 (normal)"
            direction = "higher" if pitch > 0 else "lower"
            return f"{pitch:+d} ({direction})"

        if effect_name == "speed":
            speed = float(value)
            if speed == 1.0:
                return "1.0x (normal)"
            direction = "faster" if speed > 1.0 else "slower"
            return f"{speed:.1f}x ({direction})"

        return str(value)