fix: squeeze audio to 1D before applying effects

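The TTS model returns a 2D array of shape [samples, 1], but librosa.effects
functions expect 1D arrays: librosa treats the last axis as time, so a
[samples, 1] array looks like many signals of length 1. This was causing the warning: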
'n_fft=2048 is too large for input signal of length=1'

Fix: squeeze the audio to 1D before applying effects, then reshape it back to [samples, 1] afterwards.

Also changed the AudioEffects.apply_effects call to take and return the
1D array, so the shape conversion happens around the effects step.
This commit is contained in:
2026-01-31 16:50:43 -06:00
parent b12639a618
commit 4cb0a78486

9
bot.py
View File

@@ -699,15 +699,20 @@ class TTSBot(commands.Bot):
audio = model.generate_audio(voice_state, text) audio = model.generate_audio(voice_state, text)
audio_np = audio.numpy() audio_np = audio.numpy()
# Ensure audio is 2D [samples, channels] for storage
if audio_np.ndim == 1: if audio_np.ndim == 1:
audio_np = audio_np.reshape(-1, 1) audio_np = audio_np.reshape(-1, 1)
# Apply audio effects if any are active # Apply audio effects if any are active
if pitch != 0 or speed != 1.0: if pitch != 0 or speed != 1.0:
print(f"Applying effects - Pitch: {pitch:+d}, Speed: {speed:.1f}x") print(f"Applying effects - Pitch: {pitch:+d}, Speed: {speed:.1f}x")
audio_np, show_processing = AudioEffects.apply_effects( # Squeeze to 1D for librosa effects, then reshape back
audio_np, model.sample_rate, pitch, speed audio_1d = audio_np.squeeze()
audio_1d, show_processing = AudioEffects.apply_effects(
audio_1d, model.sample_rate, pitch, speed
) )
# Reshape back to 2D
audio_np = audio_1d.reshape(-1, 1)
if show_processing: if show_processing:
print("⚠️ Audio processing took longer than expected due to effects") print("⚠️ Audio processing took longer than expected due to effects")