From 4cb0a784867857d4e331c3fdb5a9e8a35b2fe41d Mon Sep 17 00:00:00 2001
From: Spencer Grimes <toamidan@gmail.com>
Date: Sat, 31 Jan 2026 16:50:43 -0600
Subject: [PATCH] fix: squeeze audio to 1D before applying effects

The TTS model returns a 2D array [samples, 1], but librosa.effects
functions treat the last axis as time, so they saw `samples` signals
of length 1 instead of one 1D signal. This was causing the warning:
'n_fft=2048 is too large for input signal of length=1'

Fix: Squeeze to 1D before effects, reshape back after.

The squeeze/reshape wraps the existing AudioEffects.apply_effects
call, so the code after the effects step still receives a 2D
[samples, 1] array.
---
 bot.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/bot.py b/bot.py
index a83ca75..c63b02b 100644
--- a/bot.py
+++ b/bot.py
@@ -699,15 +699,20 @@ class TTSBot(commands.Bot):
         audio = model.generate_audio(voice_state, text)
         audio_np = audio.numpy()
 
+        # Ensure audio is 2D [samples, channels] for storage
         if audio_np.ndim == 1:
             audio_np = audio_np.reshape(-1, 1)
 
         # Apply audio effects if any are active
         if pitch != 0 or speed != 1.0:
             print(f"Applying effects - Pitch: {pitch:+d}, Speed: {speed:.1f}x")
-            audio_np, show_processing = AudioEffects.apply_effects(
-                audio_np, model.sample_rate, pitch, speed
+            # Squeeze to 1D for librosa effects, then reshape back
+            audio_1d = audio_np.squeeze()
+            audio_1d, show_processing = AudioEffects.apply_effects(
+                audio_1d, model.sample_rate, pitch, speed
             )
+            # Reshape back to 2D
+            audio_np = audio_1d.reshape(-1, 1)
             if show_processing:
                 print("⚠️ Audio processing took longer than expected due to effects")