- Removed MAX_ACTIVE_EFFECTS limit (effects unlimited)
- Added echo effect (0-100%): spatial delay/reverb
- Added robot effect (0-100%): ring modulation voice
- Added chorus effect (0-100%): multiple voices effect
- Added tremolo depth (0.0-1.0) and rate (0.0-10.0 Hz): amplitude modulation
- Effects apply in order: pitch → speed → echo → chorus → tremolo → robot
- Updated /effects command with all 7 effect choices
- Updated /effects list to display all 7 effects with emojis
- Updated warning system: warns when > 2 active effects
- Added validation and formatting for all new effects
- Updated voice_manager.py to handle all 7 effect storage/loading

Note: Cancel button for processing >10s is not yet implemented.
Note: Queue system needs updating to handle all effect parameters.
338 lines
12 KiB
Python
338 lines
12 KiB
Python
"""Audio effects processing for TTS output."""
|
|
|
|
import time
|
|
from typing import Any
|
|
|
|
import librosa
|
|
import numpy as np
|
|
|
|
|
|
class AudioEffects:
    """Apply post-processing effects to TTS audio.

    The class is a namespace of classmethods plus the per-effect range and
    default constants; nothing here requires an instance.
    """

    # No limit on effects, but warnings shown when > 2 active
    MAX_ACTIVE_EFFECTS = None

    # Effect ranges and defaults
    PITCH_MIN = -12
    PITCH_MAX = 12
    PITCH_DEFAULT = 0

    SPEED_MIN = 0.5
    SPEED_MAX = 2.0
    SPEED_DEFAULT = 1.0

    ECHO_MIN = 0
    ECHO_MAX = 100
    ECHO_DEFAULT = 0

    ROBOT_MIN = 0
    ROBOT_MAX = 100
    ROBOT_DEFAULT = 0

    CHORUS_MIN = 0
    CHORUS_MAX = 100
    CHORUS_DEFAULT = 0

    TREMOLO_DEPTH_MIN = 0.0
    TREMOLO_DEPTH_MAX = 1.0
    TREMOLO_DEPTH_DEFAULT = 0.0

    TREMOLO_RATE_MIN = 0.0
    TREMOLO_RATE_MAX = 10.0
    TREMOLO_RATE_DEFAULT = 0.0

    @classmethod
    def apply_effects(
        cls,
        audio: np.ndarray,
        sr: int,
        pitch: int = PITCH_DEFAULT,
        speed: float = SPEED_DEFAULT,
        echo: int = ECHO_DEFAULT,
        robot: int = ROBOT_DEFAULT,
        chorus: int = CHORUS_DEFAULT,
        tremolo_depth: float = TREMOLO_DEPTH_DEFAULT,
        tremolo_rate: float = TREMOLO_RATE_DEFAULT,
    ) -> tuple[np.ndarray, bool]:
        """
        Apply effects to audio in order: pitch → speed → echo → chorus → tremolo → robot

        Every parameter is clamped to its class-level MIN/MAX range before use,
        so out-of-range values never raise here. Progress is reported with
        print().

        Args:
            audio: Input audio array (1D)
            sr: Sample rate
            pitch: Pitch shift in semitones (-12 to +12, 0 = no shift)
            speed: Speed multiplier (0.5 to 2.0, 1.0 = normal)
            echo: Echo intensity (0-100, 0 = no echo)
            robot: Robot voice intensity (0-100, 0 = no robot)
            chorus: Chorus intensity (0-100, 0 = no chorus)
            tremolo_depth: Tremolo depth (0.0-1.0, 0.0 = no tremolo)
            tremolo_rate: Tremolo rate in Hz (0.0-10.0); tremolo is applied
                only when both depth and rate are greater than zero

        Returns:
            Tuple of (processed_audio, show_processing_message)
            show_processing_message is True if processing took > 1 second
        """
        # perf_counter is monotonic, so the elapsed time cannot go negative
        # if the wall clock is adjusted while effects are running.
        start_time = time.perf_counter()

        # Nothing to process: return early so no effect has to cope with an
        # empty array (np.max() on an empty array raises ValueError).
        if len(audio) == 0:
            return audio, False

        # Validate inputs
        pitch = max(cls.PITCH_MIN, min(cls.PITCH_MAX, pitch))
        speed = max(cls.SPEED_MIN, min(cls.SPEED_MAX, speed))
        echo = max(cls.ECHO_MIN, min(cls.ECHO_MAX, echo))
        robot = max(cls.ROBOT_MIN, min(cls.ROBOT_MAX, robot))
        chorus = max(cls.CHORUS_MIN, min(cls.CHORUS_MAX, chorus))
        tremolo_depth = max(cls.TREMOLO_DEPTH_MIN, min(cls.TREMOLO_DEPTH_MAX, tremolo_depth))
        tremolo_rate = max(cls.TREMOLO_RATE_MIN, min(cls.TREMOLO_RATE_MAX, tremolo_rate))

        # Apply pitch shift first
        if pitch != cls.PITCH_DEFAULT:
            # "+g" prints ints exactly like "+d" but does not raise if a
            # float pitch slips through.
            print(f" Applying pitch shift: {pitch:+g} semitones...")
            audio = librosa.effects.pitch_shift(
                audio, sr=sr, n_steps=pitch, bins_per_octave=12
            )

        # Apply speed change second
        if speed != cls.SPEED_DEFAULT:
            print(f" Applying speed change: {speed:.1f}x...")
            audio = librosa.effects.time_stretch(audio, rate=speed)

        # Apply echo third
        if echo > 0:
            print(f" Applying echo: {echo}%...")
            audio = cls._apply_echo(audio, sr, echo)

        # Apply chorus fourth
        if chorus > 0:
            print(f" Applying chorus: {chorus}%...")
            audio = cls._apply_chorus(audio, sr, chorus)

        # Apply tremolo fifth
        if tremolo_depth > 0 and tremolo_rate > 0:
            print(f" Applying tremolo: depth={tremolo_depth:.1f}, rate={tremolo_rate:.1f}Hz...")
            audio = cls._apply_tremolo(audio, sr, tremolo_depth, tremolo_rate)

        # Apply robot voice last
        if robot > 0:
            print(f" Applying robot effect: {robot}%...")
            audio = cls._apply_robot(audio, sr, robot)

        processing_time = time.perf_counter() - start_time
        print(f" Effects applied in {processing_time:.2f}s")

        # Show processing message if it took more than 1 second
        show_message = processing_time > 1.0

        return audio, show_message

    @classmethod
    def _apply_echo(cls, audio: np.ndarray, sr: int, intensity: int) -> np.ndarray:
        """Apply a simple single-tap echo.

        A copy of the signal, delayed by 50-300 ms and scaled by a 0.3-0.7
        decay factor (both grow with ``intensity``), is added to the original.
        The result is rescaled so its peak equals the input's peak.

        Args:
            audio: Input audio array (1D).
            sr: Sample rate in Hz.
            intensity: Echo intensity, 0-100. 0 returns ``audio`` unchanged.

        Returns:
            The processed audio, same length as the input.
        """
        if intensity == 0 or len(audio) == 0:
            return audio

        # Calculate delay in samples (50-300ms based on intensity)
        delay_ms = 50 + (intensity / 100) * 250
        delay_samples = int((delay_ms / 1000) * sr)

        # Create output array
        output = np.copy(audio)

        # Add delayed copy with decay
        decay = 0.3 + (intensity / 100) * 0.4  # 0.3-0.7 decay factor
        # The lower bound matters: with delay_samples == 0, audio[:-0] is an
        # empty slice and the in-place add would fail on mismatched shapes.
        if 0 < delay_samples < len(audio):
            output[delay_samples:] += audio[:-delay_samples] * decay

        # Normalize back to the input's peak level
        max_val = np.max(np.abs(output))
        if max_val > 0:
            output = output / max_val * np.max(np.abs(audio))

        return output

    @classmethod
    def _apply_chorus(cls, audio: np.ndarray, sr: int, intensity: int) -> np.ndarray:
        """Apply chorus effect using multiple delayed voices.

        The original is attenuated to 0.6 and 1-3 extra voices (more with
        higher ``intensity``) are mixed in at 0.2 each. Every voice is a
        time-stretched copy of the input, offset by a 15-30 ms base delay plus
        5 ms per voice. The result is normalized to a 0.95 peak.

        A voice whose time-stretch fails is skipped with a printed warning
        rather than aborting the whole effect.

        Args:
            audio: Input audio array (1D).
            sr: Sample rate in Hz.
            intensity: Chorus intensity, 0-100. 0 returns ``audio`` unchanged.

        Returns:
            The processed audio, same length as the input.
        """
        if intensity == 0 or len(audio) == 0:
            return audio

        # Number of voices based on intensity (1-3)
        num_voices = 1 + int((intensity / 100) * 2)

        # Base delay (15-30ms)
        base_delay_ms = 15 + (intensity / 100) * 15
        base_delay_samples = int((base_delay_ms / 1000) * sr)

        output = np.copy(audio) * 0.6  # Reduce original to make room for voices

        for i in range(num_voices):
            # Per-voice time-stretch rate: 0.97, 1.00, 1.03. time_stretch
            # changes duration, not pitch, so the voices differ in timing.
            stretch_rate = 1.0 + (0.03 * (i - 1))
            try:
                voice = librosa.effects.time_stretch(audio, rate=stretch_rate)
            except Exception as e:  # deliberate best-effort: skip this voice
                print(f" Warning: Chorus voice {i+1} failed: {e}")
                continue

            # Slight delay variation
            delay_samples = base_delay_samples + int((i * 5 / 1000) * sr)

            # Mix voice into output, truncated to what fits after the delay
            voice_len = min(len(voice), len(output) - delay_samples)
            if voice_len > 0:
                output[delay_samples:delay_samples + voice_len] += voice[:voice_len] * 0.2

        # Normalize
        max_val = np.max(np.abs(output))
        if max_val > 0:
            output = output / max_val * 0.95

        return output

    @classmethod
    def _apply_tremolo(cls, audio: np.ndarray, sr: int, depth: float, rate: float) -> np.ndarray:
        """Apply tremolo effect (amplitude modulation).

        The signal is multiplied by a sine-shaped gain that oscillates between
        ``1 - depth`` and ``1`` at ``rate`` Hz.

        Args:
            audio: Input audio array (1D).
            sr: Sample rate in Hz.
            depth: Modulation depth. 0 returns ``audio`` unchanged.
            rate: Modulation rate in Hz. 0 returns ``audio`` unchanged.

        Returns:
            The amplitude-modulated audio, same length as the input.
        """
        if depth == 0 or rate == 0:
            return audio

        # Time of each sample in seconds. arange/sr gives exactly 1/sr
        # spacing; linspace(0, duration, n) would include the endpoint and
        # stretch the spacing to duration/(n-1), detuning the modulation.
        t = np.arange(len(audio)) / sr

        # Sine wave modulation at specified rate
        modulation = 1.0 - depth * 0.5 * (1 - np.sin(2 * np.pi * rate * t))

        return audio * modulation

    @classmethod
    def _apply_robot(cls, audio: np.ndarray, sr: int, intensity: int) -> np.ndarray:
        """Apply robot voice effect using ring modulation.

        The input is multiplied by a sine carrier (80-300 Hz, rising with
        ``intensity``) and blended with the dry signal; at most 70% of the
        output comes from the modulated signal. The result is normalized to a
        0.95 peak.

        Args:
            audio: Input audio array (1D).
            sr: Sample rate in Hz.
            intensity: Robot intensity, 0-100. 0 returns ``audio`` unchanged.

        Returns:
            The processed audio, same length as the input.
        """
        if intensity == 0 or len(audio) == 0:
            return audio

        # Carrier frequency based on intensity (80-300 Hz)
        carrier_freq = 80 + (intensity / 100) * 220

        # Create carrier signal on an exact 1/sr time grid (see _apply_tremolo
        # for why linspace with an included endpoint is wrong here).
        t = np.arange(len(audio)) / sr
        carrier = np.sin(2 * np.pi * carrier_freq * t)

        # Mix original with ring-modulated version based on intensity
        mix = intensity / 100
        robot_signal = audio * carrier
        output = audio * (1 - mix * 0.7) + robot_signal * mix * 0.7

        # Normalize
        max_val = np.max(np.abs(output))
        if max_val > 0:
            output = output / max_val * 0.95

        return output

    @classmethod
    def validate_effect(cls, effect_name: str, value: Any) -> tuple[bool, str]:
        """
        Validate an effect value.

        The value is converted with int() or float() depending on the effect
        and then range-checked. For whole-number effects a non-integral float
        (e.g. 2.7) is rejected rather than silently truncated.

        Args:
            effect_name: One of "pitch", "speed", "echo", "robot", "chorus",
                "tremolo_depth", "tremolo_rate".
            value: Candidate value; strings are accepted if they convert.

        Returns:
            Tuple of (is_valid, error_message); error_message is "" when valid.
        """
        validators = {
            "pitch": (int, cls.PITCH_MIN, cls.PITCH_MAX, "Pitch must be a whole number", "semitones"),
            "speed": (float, cls.SPEED_MIN, cls.SPEED_MAX, "Speed must be a number", "x"),
            "echo": (int, cls.ECHO_MIN, cls.ECHO_MAX, "Echo must be a whole number", "%"),
            "robot": (int, cls.ROBOT_MIN, cls.ROBOT_MAX, "Robot must be a whole number", "%"),
            "chorus": (int, cls.CHORUS_MIN, cls.CHORUS_MAX, "Chorus must be a whole number", "%"),
            "tremolo_depth": (float, cls.TREMOLO_DEPTH_MIN, cls.TREMOLO_DEPTH_MAX, "Tremolo depth must be a number", ""),
            "tremolo_rate": (float, cls.TREMOLO_RATE_MIN, cls.TREMOLO_RATE_MAX, "Tremolo rate must be a number", "Hz"),
        }

        if effect_name not in validators:
            return False, f"Unknown effect: {effect_name}"

        type_func, min_val, max_val, error_msg, unit = validators[effect_name]

        try:
            val = type_func(value)
        except (ValueError, TypeError, OverflowError):
            # OverflowError: int(float("inf")) raises it instead of ValueError.
            return False, error_msg

        # int(2.7) succeeds by truncating; that is not "a whole number".
        if type_func is int and isinstance(value, float) and value != val:
            return False, error_msg

        if min_val <= val <= max_val:
            return True, ""

        unit_str = f" {unit}" if unit else ""
        return False, f"{effect_name.replace('_', ' ').title()} must be between {min_val} and {max_val}{unit_str}"

    @classmethod
    def count_active_effects(cls, **effects) -> int:
        """Count how many effects are active (non-default).

        Accepts the same keyword names as apply_effects(); missing keys are
        treated as their defaults. Tremolo counts as a single effect and only
        when both depth and rate are above zero, matching the condition under
        which apply_effects() actually applies it.
        """
        count = 0
        if effects.get("pitch", cls.PITCH_DEFAULT) != cls.PITCH_DEFAULT:
            count += 1
        if effects.get("speed", cls.SPEED_DEFAULT) != cls.SPEED_DEFAULT:
            count += 1
        if effects.get("echo", cls.ECHO_DEFAULT) > cls.ECHO_DEFAULT:
            count += 1
        if effects.get("robot", cls.ROBOT_DEFAULT) > cls.ROBOT_DEFAULT:
            count += 1
        if effects.get("chorus", cls.CHORUS_DEFAULT) > cls.CHORUS_DEFAULT:
            count += 1

        # Depth without a rate (or vice versa) produces no audible tremolo,
        # so it must not be reported as an active effect.
        tremolo_depth = effects.get("tremolo_depth", cls.TREMOLO_DEPTH_DEFAULT)
        tremolo_rate = effects.get("tremolo_rate", cls.TREMOLO_RATE_DEFAULT)
        if tremolo_depth > 0 and tremolo_rate > 0:
            count += 1

        return count

    @classmethod
    def get_effect_description(cls, effect_name: str) -> str:
        """Get a human-readable description of what an effect does.

        Returns "Unknown effect" for names that are not recognised.
        """
        descriptions = {
            "pitch": f"Changes voice pitch ({cls.PITCH_MIN} to {cls.PITCH_MAX} semitones). Positive = higher/chipmunk, Negative = lower/deeper.",
            "speed": f"Changes speech speed ({cls.SPEED_MIN} to {cls.SPEED_MAX}x). Higher = faster, Lower = slower.",
            "echo": f"Adds echo/reverb ({cls.ECHO_MIN} to {cls.ECHO_MAX}%). Higher = more pronounced echo.",
            "robot": f"Applies robot voice effect ({cls.ROBOT_MIN} to {cls.ROBOT_MAX}%). Higher = more robotic.",
            "chorus": f"Adds chorus effect ({cls.CHORUS_MIN} to {cls.CHORUS_MAX}%). Higher = more voices/depth.",
            "tremolo_depth": f"Tremolo amplitude modulation ({cls.TREMOLO_DEPTH_MIN} to {cls.TREMOLO_DEPTH_MAX}). Higher = more warble.",
            "tremolo_rate": f"Tremolo speed ({cls.TREMOLO_RATE_MIN} to {cls.TREMOLO_RATE_MAX} Hz). Higher = faster warble.",
        }
        return descriptions.get(effect_name, "Unknown effect")

    @classmethod
    def format_effect_value(cls, effect_name: str, value: Any) -> str:
        """Format an effect value for display.

        Known effects are converted with int()/float() and given a unit and,
        where applicable, an "(off)"/"(normal)"/direction hint. Unknown effect
        names fall back to str(value).
        """
        if effect_name == "pitch":
            pitch = int(value)
            if pitch == 0:
                return "0 (normal)"
            direction = "higher" if pitch > 0 else "lower"
            return f"{pitch:+d} ({direction})"

        if effect_name == "speed":
            speed = float(value)
            if speed == 1.0:
                return "1.0x (normal)"
            direction = "faster" if speed > 1.0 else "slower"
            return f"{speed:.1f}x ({direction})"

        # The three percentage-based effects share one display format.
        if effect_name in ("echo", "robot", "chorus"):
            percent = int(value)
            return "0% (off)" if percent == 0 else f"{percent}%"

        if effect_name == "tremolo_depth":
            depth = float(value)
            return "0.0 (off)" if depth == 0.0 else f"{depth:.1f}"

        if effect_name == "tremolo_rate":
            rate = float(value)
            return "0.0 Hz (off)" if rate == 0.0 else f"{rate:.1f} Hz"

        return str(value)
|