# Vox/audio_effects.py

"""Audio effects processing for TTS output."""

import time
from typing import Any

import librosa
import numpy as np


class AudioEffects:
    """Apply post-processing effects (pitch shift, speed change) to TTS audio.

    All methods are classmethods; the class only groups the effect limits
    and the helpers that validate, describe, format and apply them.
    """

    # Number of effects this class knows about (pitch and speed).
    MAX_ACTIVE_EFFECTS = 2

    # Pitch shift limits, in semitones.
    PITCH_MIN = -12
    PITCH_MAX = 12
    PITCH_DEFAULT = 0

    # Speed multiplier limits (1.0 = unchanged).
    SPEED_MIN = 0.5
    SPEED_MAX = 2.0
    SPEED_DEFAULT = 1.0

    @classmethod
    def apply_effects(
        cls,
        audio: np.ndarray,
        sr: int,
        pitch: int = PITCH_DEFAULT,
        speed: float = SPEED_DEFAULT,
    ) -> tuple[np.ndarray, bool]:
        """
        Apply effects to audio.

        Pitch is shifted first, then the speed change is applied. Values
        outside the allowed ranges are clamped rather than rejected.

        Args:
            audio: Input audio array
            sr: Sample rate
            pitch: Pitch shift in semitones (-12 to +12, 0 = no shift)
            speed: Speed multiplier (0.5 to 2.0, 1.0 = normal)

        Returns:
            Tuple of (processed_audio, show_processing_message)
            show_processing_message is True if processing took > 1 second
        """
        # Nothing to process: return the empty buffer untouched instead of
        # handing it to librosa.
        if np.size(audio) == 0:
            return audio, False

        # perf_counter is monotonic, so the elapsed time cannot be skewed by
        # system clock adjustments.
        start_time = time.perf_counter()

        # Clamp inputs into the supported ranges
        pitch = max(cls.PITCH_MIN, min(cls.PITCH_MAX, pitch))
        speed = max(cls.SPEED_MIN, min(cls.SPEED_MAX, speed))

        # "+g" renders integers exactly like "+d" but also accepts floats
        # (e.g. 3.0 coming from a slider), which "+d" rejects with ValueError.
        print(f"Applying effects - Pitch: {pitch:+g}, Speed: {speed:.1f}x")

        # Apply pitch shift first (if not default)
        if pitch != cls.PITCH_DEFAULT:
            print(f"  Applying pitch shift: {pitch:+g} semitones...")
            audio = librosa.effects.pitch_shift(
                audio, sr=sr, n_steps=pitch, bins_per_octave=12
            )

        # Apply speed change second (if not default)
        if speed != cls.SPEED_DEFAULT:
            print(f"  Applying speed change: {speed:.1f}x...")
            # The stretched audio is intentionally NOT resampled back to the
            # original duration: the result is shorter or longer depending
            # on the requested speed.
            audio = librosa.effects.time_stretch(audio, rate=speed)

        processing_time = time.perf_counter() - start_time
        print(f"  Effects applied in {processing_time:.2f}s")

        # Show processing message if it took more than 1 second
        show_message = processing_time > 1.0

        return audio, show_message

    @classmethod
    def validate_effect(cls, effect_name: str, value: Any) -> tuple[bool, str]:
        """
        Validate an effect value.

        Args:
            effect_name: "pitch" or "speed"; anything else is reported as
                an unknown effect.
            value: Candidate value; strings are accepted if they parse.

        Returns:
            Tuple of (is_valid, error_message); error_message is "" when
            the value is valid.
        """
        if effect_name == "pitch":
            try:
                pitch = int(value)
            except (ValueError, TypeError, OverflowError):
                # OverflowError covers infinite floats, which int() rejects.
                return False, "Pitch must be a whole number"
            # int() silently truncates floats; a fractional pitch such as 1.5
            # must be rejected, matching how the string "1.5" is rejected.
            if isinstance(value, float) and not value.is_integer():
                return False, "Pitch must be a whole number"
            if cls.PITCH_MIN <= pitch <= cls.PITCH_MAX:
                return True, ""
            return (
                False,
                f"Pitch must be between {cls.PITCH_MIN} and {cls.PITCH_MAX} semitones",
            )

        if effect_name == "speed":
            try:
                speed = float(value)
            except (ValueError, TypeError):
                return False, "Speed must be a number"
            # NaN fails both comparisons and is therefore reported as
            # out of range.
            if cls.SPEED_MIN <= speed <= cls.SPEED_MAX:
                return True, ""
            return (
                False,
                f"Speed must be between {cls.SPEED_MIN} and {cls.SPEED_MAX}",
            )

        return False, f"Unknown effect: {effect_name}"

    @classmethod
    def count_active_effects(cls, pitch: int, speed: float) -> int:
        """Count how many effects are active (non-default)."""
        count = 0
        if pitch != cls.PITCH_DEFAULT:
            count += 1
        if speed != cls.SPEED_DEFAULT:
            count += 1
        return count

    @classmethod
    def get_effect_description(cls, effect_name: str) -> str:
        """Get a human-readable description of what an effect does.

        Returns "Unknown effect" for names other than "pitch" and "speed".
        """
        descriptions = {
            "pitch": f"Changes voice pitch ({cls.PITCH_MIN} to {cls.PITCH_MAX} semitones). Positive = higher/chipmunk, Negative = lower/deeper.",
            "speed": f"Changes speech speed ({cls.SPEED_MIN} to {cls.SPEED_MAX}x). Higher = faster, Lower = slower.",
        }
        return descriptions.get(effect_name, "Unknown effect")

    @classmethod
    def format_effect_value(cls, effect_name: str, value: Any) -> str:
        """Format an effect value for display.

        Pitch is shown as a signed integer and speed with one decimal place,
        each followed by a short hint; values for other effect names are
        returned via str().
        """
        if effect_name == "pitch":
            pitch = int(value)
            if pitch == 0:
                return "0 (normal)"
            direction = "higher" if pitch > 0 else "lower"
            return f"{pitch:+d} ({direction})"
        if effect_name == "speed":
            speed = float(value)
            if speed == 1.0:
                return "1.0x (normal)"
            direction = "faster" if speed > 1.0 else "slower"
            return f"{speed:.1f}x ({direction})"
        return str(value)