From 4cb0a784867857d4e331c3fdb5a9e8a35b2fe41d Mon Sep 17 00:00:00 2001 From: Spencer Grimes Date: Sat, 31 Jan 2026 16:50:43 -0600 Subject: [PATCH] fix: squeeze audio to 1D before applying effects The TTS model returns a 2D array [samples, 1], but librosa.effects functions expect 1D arrays (they treat the last axis as time, so a [samples, 1] array is seen as a signal of length 1). This was causing the warning: 'n_fft=2048 is too large for input signal of length=1' Fix: Squeeze to 1D before effects, reshape back after. Also moved the effects application logic to handle the shape conversion properly. --- bot.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/bot.py b/bot.py index a83ca75..c63b02b 100644 --- a/bot.py +++ b/bot.py @@ -699,15 +699,20 @@ class TTSBot(commands.Bot): audio = model.generate_audio(voice_state, text) audio_np = audio.numpy() + # Ensure audio is 2D [samples, channels] for storage if audio_np.ndim == 1: audio_np = audio_np.reshape(-1, 1) # Apply audio effects if any are active if pitch != 0 or speed != 1.0: print(f"Applying effects - Pitch: {pitch:+d}, Speed: {speed:.1f}x") - audio_np, show_processing = AudioEffects.apply_effects( - audio_np, model.sample_rate, pitch, speed + # Squeeze to 1D for librosa effects, then reshape back + audio_1d = audio_np.squeeze() + audio_1d, show_processing = AudioEffects.apply_effects( + audio_1d, model.sample_rate, pitch, speed ) + # Reshape back to 2D + audio_np = audio_1d.reshape(-1, 1) if show_processing: print("⚠️ Audio processing took longer than expected due to effects")