fix: squeeze audio to 1D before applying effects
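The TTS model returns a 2D array of shape [samples, 1], but the librosa.effects functions expect a 1D array. librosa treats the last axis as time, so a [samples, 1] array is processed as many one-sample signals, which caused the warning: 'n_fft=2048 is too large for input signal of length=1'.
Fix: squeeze the audio to 1D before applying effects, then reshape it back to 2D [samples, 1] afterwards. The effects-application logic was also moved so that it handles this shape conversion properly.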
This commit is contained in:
9
bot.py
9
bot.py
@@ -699,15 +699,20 @@ class TTSBot(commands.Bot):
|
||||
audio = model.generate_audio(voice_state, text)
|
||||
audio_np = audio.numpy()
|
||||
|
||||
# Ensure audio is 2D [samples, channels] for storage
|
||||
if audio_np.ndim == 1:
|
||||
audio_np = audio_np.reshape(-1, 1)
|
||||
|
||||
# Apply audio effects if any are active
|
||||
if pitch != 0 or speed != 1.0:
|
||||
print(f"Applying effects - Pitch: {pitch:+d}, Speed: {speed:.1f}x")
|
||||
audio_np, show_processing = AudioEffects.apply_effects(
|
||||
audio_np, model.sample_rate, pitch, speed
|
||||
# Squeeze to 1D for librosa effects, then reshape back
|
||||
audio_1d = audio_np.squeeze()
|
||||
audio_1d, show_processing = AudioEffects.apply_effects(
|
||||
audio_1d, model.sample_rate, pitch, speed
|
||||
)
|
||||
# Reshape back to 2D
|
||||
audio_np = audio_1d.reshape(-1, 1)
|
||||
if show_processing:
|
||||
print("⚠️ Audio processing took longer than expected due to effects")
|
||||
|
||||
|
||||
Reference in New Issue
Block a user