- Added new audio_effects.py module with pitch shift and speed change
- Pitch range: -12 to +12 semitones (higher = chipmunk, lower = deeper)
- Speed range: 0.5 to 2.0x (higher = faster, lower = slower)
- Maximum 2 active effects per user (performance optimization)
- Added /effects command group:
  - /effects list - Shows current effects with descriptions
  - /effects set pitch|speed <value> - Apply effects
  - /effects reset - Confirmation UI to clear all effects
- Effects persist across restarts in preferences.json
- Updated /voice preview to support optional pitch/speed parameters
- Effects applied in _generate_wav_bytes using librosa
- Added performance warnings when processing takes >1 second
- Updated README with effects documentation
147 lines
4.8 KiB
Python
147 lines
4.8 KiB
Python
"""Audio effects processing for TTS output."""
|
|
|
|
import time
|
|
from typing import Any
|
|
|
|
import librosa
|
|
import numpy as np
|
|
|
|
|
|
class AudioEffects:
    """Apply post-processing effects (pitch shift, speed change) to TTS audio.

    Every method is a classmethod; the class only groups the effect limits
    and the helpers that validate, apply and describe effects.
    """

    # Declared limit on simultaneously active effects. It is not enforced by
    # any method of this class; callers are expected to check it themselves.
    MAX_ACTIVE_EFFECTS = 2

    # Pitch shift limits and neutral value, in semitones.
    PITCH_MIN = -12
    PITCH_MAX = 12
    PITCH_DEFAULT = 0

    # Speed multiplier limits and neutral value.
    SPEED_MIN = 0.5
    SPEED_MAX = 2.0
    SPEED_DEFAULT = 1.0

    @classmethod
    def apply_effects(
        cls,
        audio: np.ndarray,
        sr: int,
        pitch: int = PITCH_DEFAULT,
        speed: float = SPEED_DEFAULT,
    ) -> tuple[np.ndarray, bool]:
        """Apply the requested pitch and speed effects to ``audio``.

        Out-of-range values are clamped to the class limits rather than
        rejected. The pitch shift runs first, then the speed change; an
        effect left at its default value is skipped entirely.

        Args:
            audio: Input audio array.
            sr: Sample rate of ``audio``.
            pitch: Pitch shift in semitones (-12 to +12, 0 = no shift).
                A float is truncated to a whole number of semitones.
            speed: Speed multiplier (0.5 to 2.0, 1.0 = normal).

        Returns:
            Tuple of ``(processed_audio, show_processing_message)``.
            ``show_processing_message`` is True if processing took more
            than one second.
        """
        # perf_counter is monotonic, so the measured duration cannot be
        # skewed by wall-clock adjustments the way time.time() can.
        started = time.perf_counter()

        # Clamp to the supported ranges. Pitch is coerced to int afterwards
        # because the ":+d" format spec below raises ValueError on a float.
        pitch = int(max(cls.PITCH_MIN, min(cls.PITCH_MAX, pitch)))
        speed = max(cls.SPEED_MIN, min(cls.SPEED_MAX, speed))

        print(f"Applying effects - Pitch: {pitch:+d}, Speed: {speed:.1f}x")

        if pitch != cls.PITCH_DEFAULT:
            print(f" Applying pitch shift: {pitch:+d} semitones...")
            audio = librosa.effects.pitch_shift(
                audio, sr=sr, n_steps=pitch, bins_per_octave=12
            )

        if speed != cls.SPEED_DEFAULT:
            print(f" Applying speed change: {speed:.1f}x...")
            # The stretched audio is deliberately NOT resampled back to its
            # original duration: for TTS the changed length is the effect.
            audio = librosa.effects.time_stretch(audio, rate=speed)

        processing_time = time.perf_counter() - started
        print(f" Effects applied in {processing_time:.2f}s")

        # Tell the caller to show a "processing" notice for slow runs.
        show_message = processing_time > 1.0
        return audio, show_message

    @classmethod
    def validate_effect(cls, effect_name: str, value: Any) -> tuple[bool, str]:
        """Check whether ``value`` is acceptable for the named effect.

        Args:
            effect_name: ``"pitch"`` or ``"speed"``.
            value: Candidate value; anything ``int()`` / ``float()`` can
                convert is accepted (numeric strings included).

        Returns:
            Tuple of ``(is_valid, error_message)``. ``error_message`` is an
            empty string when the value is valid.
        """
        if effect_name == "pitch":
            # int() silently truncates floats (2.7 -> 2), which would accept
            # a value that is not a whole number; inf/nan also fail here.
            if isinstance(value, float) and not value.is_integer():
                return False, "Pitch must be a whole number"
            try:
                pitch = int(value)
            except (ValueError, TypeError, OverflowError):
                return False, "Pitch must be a whole number"
            if cls.PITCH_MIN <= pitch <= cls.PITCH_MAX:
                return True, ""
            return (
                False,
                f"Pitch must be between {cls.PITCH_MIN} and {cls.PITCH_MAX} semitones",
            )

        if effect_name == "speed":
            out_of_range = (
                False,
                f"Speed must be between {cls.SPEED_MIN} and {cls.SPEED_MAX}",
            )
            try:
                speed = float(value)
            except (ValueError, TypeError):
                return False, "Speed must be a number"
            except OverflowError:
                # An int too large for a float is a number, just out of range.
                return out_of_range
            if cls.SPEED_MIN <= speed <= cls.SPEED_MAX:
                return True, ""
            return out_of_range

        return False, f"Unknown effect: {effect_name}"

    @classmethod
    def count_active_effects(cls, pitch: int, speed: float) -> int:
        """Count how many effects are active (set to a non-default value)."""
        count = 0
        if pitch != cls.PITCH_DEFAULT:
            count += 1
        if speed != cls.SPEED_DEFAULT:
            count += 1
        return count

    @classmethod
    def get_effect_description(cls, effect_name: str) -> str:
        """Return a human-readable description of what an effect does.

        Unrecognised effect names yield ``"Unknown effect"``.
        """
        descriptions = {
            "pitch": f"Changes voice pitch ({cls.PITCH_MIN} to {cls.PITCH_MAX} semitones). Positive = higher/chipmunk, Negative = lower/deeper.",
            "speed": f"Changes speech speed ({cls.SPEED_MIN} to {cls.SPEED_MAX}x). Higher = faster, Lower = slower.",
        }
        return descriptions.get(effect_name, "Unknown effect")

    @classmethod
    def format_effect_value(cls, effect_name: str, value: Any) -> str:
        """Format an effect value for display.

        Pitch is rendered as a signed integer and speed as a one-decimal
        multiplier, each followed by a direction hint. Any other effect
        name falls back to ``str(value)``.
        """
        if effect_name == "pitch":
            pitch = int(value)
            if pitch == 0:
                return "0 (normal)"
            direction = "higher" if pitch > 0 else "lower"
            return f"{pitch:+d} ({direction})"

        if effect_name == "speed":
            speed = float(value)
            if speed == 1.0:
                return "1.0x (normal)"
            direction = "faster" if speed > 1.0 else "slower"
            return f"{speed:.1f}x ({direction})"

        return str(value)
|