Files
Vox/audio_effects.py
Spencer Grimes 9f14e8c745 feat: add audio effects (pitch and speed control)
- Added new audio_effects.py module with pitch shift and speed change
- Pitch range: -12 to +12 semitones (higher = chipmunk, lower = deeper)
- Speed range: 0.5 to 2.0x (higher = faster, lower = slower)
- Maximum 2 active effects per user (performance optimization)
- Added /effects command group:
  - /effects list - Shows current effects with descriptions
  - /effects set pitch|speed <value> - Apply effects
  - /effects reset - Confirmation UI to clear all effects
- Effects persist across restarts in preferences.json
- Updated /voice preview to support optional pitch/speed parameters
- Effects applied in _generate_wav_bytes using librosa
- Added performance warnings when processing takes >1 second
- Updated README with effects documentation
2026-01-31 15:43:29 -06:00

147 lines
4.8 KiB
Python

"""Audio effects processing for TTS output."""
import time
from typing import Any
import librosa
import numpy as np
class AudioEffects:
    """Apply post-processing effects (pitch shift, speed change) to TTS audio.

    Every method is a classmethod; the class only groups the effect limits
    together with helpers that validate, count, describe, format and apply
    the effects.
    """

    # Cap on simultaneously active (non-default) effects. Nothing in this
    # class enforces it; callers are expected to check it themselves.
    MAX_ACTIVE_EFFECTS = 2

    # Effect ranges
    PITCH_MIN = -12
    PITCH_MAX = 12
    PITCH_DEFAULT = 0
    SPEED_MIN = 0.5
    SPEED_MAX = 2.0
    SPEED_DEFAULT = 1.0

    @classmethod
    def apply_effects(
        cls,
        audio: np.ndarray,
        sr: int,
        pitch: int = PITCH_DEFAULT,
        speed: float = SPEED_DEFAULT,
    ) -> tuple[np.ndarray, bool]:
        """
        Apply effects to audio.

        Out-of-range values are clamped to the class limits rather than
        rejected. An effect left at its default value is skipped entirely,
        so with both defaults the input array is returned as-is.

        Args:
            audio: Input audio array
            sr: Sample rate
            pitch: Pitch shift in semitones (-12 to +12, 0 = no shift)
            speed: Speed multiplier (0.5 to 2.0, 1.0 = normal)

        Returns:
            Tuple of (processed_audio, show_processing_message)
            show_processing_message is True if processing took > 1 second
        """
        # Nothing to process: hand empty audio back untouched instead of
        # passing a zero-length signal to librosa.
        if np.size(audio) == 0:
            return audio, False

        # perf_counter is monotonic, so the elapsed time cannot be skewed by
        # system clock adjustments the way time.time() can.
        started = time.perf_counter()

        # Clamp inputs into the supported ranges
        pitch = max(cls.PITCH_MIN, min(cls.PITCH_MAX, pitch))
        speed = max(cls.SPEED_MIN, min(cls.SPEED_MAX, speed))

        # "+g" renders integers exactly like "+d" but also accepts a float
        # pitch, which "+d" would reject with ValueError.
        print(f"Applying effects - Pitch: {pitch:+g}, Speed: {speed:.1f}x")

        # Apply pitch shift first (if not default)
        if pitch != cls.PITCH_DEFAULT:
            print(f" Applying pitch shift: {pitch:+g} semitones...")
            audio = librosa.effects.pitch_shift(
                audio, sr=sr, n_steps=pitch, bins_per_octave=12
            )

        # Apply speed change second (if not default)
        if speed != cls.SPEED_DEFAULT:
            print(f" Applying speed change: {speed:.1f}x...")
            # The stretched audio is deliberately NOT resampled back to the
            # original length: for TTS the changed duration is the effect.
            audio = librosa.effects.time_stretch(audio, rate=speed)

        elapsed = time.perf_counter() - started
        print(f" Effects applied in {elapsed:.2f}s")

        # Show processing message if it took more than 1 second
        return audio, elapsed > 1.0

    @classmethod
    def validate_effect(cls, effect_name: str, value: Any) -> tuple[bool, str]:
        """
        Validate an effect value.

        Pitch must be a whole number within [PITCH_MIN, PITCH_MAX]; integral
        floats such as 2.0 are accepted, fractional or non-finite floats are
        rejected. Speed must be a number within [SPEED_MIN, SPEED_MAX].

        Args:
            effect_name: "pitch" or "speed"
            value: Candidate value (number or numeric string)

        Returns:
            Tuple of (is_valid, error_message)
            error_message is "" when the value is valid
        """
        if effect_name == "pitch":
            # int() would silently truncate 2.7 to 2 and raise OverflowError
            # on infinity, so fractional / non-finite floats are rejected up
            # front (is_integer() is False for nan and +/-inf).
            if isinstance(value, float) and not value.is_integer():
                return False, "Pitch must be a whole number"
            try:
                pitch = int(value)
            except (ValueError, TypeError, OverflowError):
                return False, "Pitch must be a whole number"
            if cls.PITCH_MIN <= pitch <= cls.PITCH_MAX:
                return True, ""
            return (
                False,
                f"Pitch must be between {cls.PITCH_MIN} and {cls.PITCH_MAX} semitones",
            )

        if effect_name == "speed":
            try:
                speed = float(value)
            except (ValueError, TypeError, OverflowError):
                # OverflowError: integers too large to convert to a float.
                return False, "Speed must be a number"
            # nan fails both comparisons and falls through to the range error.
            if cls.SPEED_MIN <= speed <= cls.SPEED_MAX:
                return True, ""
            return (
                False,
                f"Speed must be between {cls.SPEED_MIN} and {cls.SPEED_MAX}",
            )

        return False, f"Unknown effect: {effect_name}"

    @classmethod
    def count_active_effects(cls, pitch: int, speed: float) -> int:
        """Count how many effects are active (non-default)."""
        # Each comparison is a bool, and bools sum as 0/1.
        return (pitch != cls.PITCH_DEFAULT) + (speed != cls.SPEED_DEFAULT)

    @classmethod
    def get_effect_description(cls, effect_name: str) -> str:
        """Get a human-readable description of what an effect does.

        Returns "Unknown effect" for any name other than "pitch" or "speed".
        """
        descriptions = {
            "pitch": (
                f"Changes voice pitch ({cls.PITCH_MIN} to {cls.PITCH_MAX} semitones). "
                "Positive = higher/chipmunk, Negative = lower/deeper."
            ),
            "speed": (
                f"Changes speech speed ({cls.SPEED_MIN} to {cls.SPEED_MAX}x). "
                "Higher = faster, Lower = slower."
            ),
        }
        return descriptions.get(effect_name, "Unknown effect")

    @classmethod
    def format_effect_value(cls, effect_name: str, value: Any) -> str:
        """Format an effect value for display.

        Pitch is shown as a signed integer and speed with one decimal place,
        each followed by a direction hint. Values of unrecognised effects
        are returned via str(). Conversion errors from int()/float() on a
        malformed pitch/speed value propagate to the caller.
        """
        if effect_name == "pitch":
            pitch = int(value)
            if pitch == 0:
                return "0 (normal)"
            direction = "higher" if pitch > 0 else "lower"
            return f"{pitch:+d} ({direction})"

        if effect_name == "speed":
            speed = float(value)
            if speed == 1.0:
                return "1.0x (normal)"
            direction = "faster" if speed > 1.0 else "slower"
            return f"{speed:.1f}x ({direction})"

        return str(value)