- Replace Unicode emoji (✓, ⚠️) with [OK] and [WARN] in audio_preprocessor.py
to prevent UnicodeEncodeError on Windows console (cp1252 codec)
- Add a dependency auto-update function to bot.py for easier maintenance
- Remove setup_linux.sh (no longer needed)
- Update .gitignore to exclude VS Code launch.json
206 lines
6.7 KiB
Python
206 lines
6.7 KiB
Python
"""Audio preprocessing utilities for improving voice cloning quality."""
|
|
|
|
import os
|
|
import tempfile
|
|
from dataclasses import dataclass
|
|
|
|
import librosa
|
|
import noisereduce as nr
|
|
import numpy as np
|
|
import soundfile as sf
|
|
|
|
|
|
@dataclass
|
|
class PreprocessingConfig:
|
|
"""Configuration for audio preprocessing."""
|
|
|
|
target_sample_rate: int = 22050
|
|
normalize: bool = True
|
|
trim_silence: bool = True
|
|
trim_top_db: int = 20
|
|
reduce_noise: bool = True
|
|
target_length_seconds: float | None = None # None means keep original length
|
|
|
|
|
|
class AudioPreprocessor:
    """Preprocesses audio files for optimal voice cloning.

    Pipeline order: load (mono, resampled to the configured rate), volume
    normalization, silence trimming, noise reduction, length limiting.
    The optional steps are switched on and off by the config object.
    """

    def __init__(self, config: PreprocessingConfig | None = None):
        """Store *config*, falling back to a default ``PreprocessingConfig``."""
        self.config = config or PreprocessingConfig()

    def preprocess_file(self, input_path: str, output_path: str | None = None) -> str:
        """
        Preprocess an audio file for voice cloning.

        Args:
            input_path: Path to the input audio file
            output_path: Optional path for the output file. If None, creates a temp file.

        Returns:
            Path to the preprocessed audio file. When a temp file was created,
            the caller is responsible for deleting it.
        """
        print(f"Preprocessing audio: {input_path}")

        # Load audio with librosa (converts to mono and resamples)
        audio, sr = librosa.load(
            input_path,
            sr=self.config.target_sample_rate,
            mono=True,
        )
        print(f" Loaded audio: {len(audio) / sr:.2f}s at {sr}Hz")

        # Apply preprocessing steps
        audio = self._normalize(audio)
        audio = self._trim_silence(audio, sr)
        audio = self._reduce_noise(audio, sr)
        audio = self._limit_length(audio, sr)

        # Ensure we have valid audio
        if len(audio) < sr * 0.5:  # Less than 0.5 seconds
            print(" Warning: Audio is very short after preprocessing!")

        # Save to output path
        created_temp = output_path is None
        if created_temp:
            fd, output_path = tempfile.mkstemp(suffix=".wav")
            # Only the path is needed; soundfile opens the file itself.
            os.close(fd)

        try:
            sf.write(output_path, audio, sr, subtype="PCM_16")
        except Exception:
            # Do not leave an orphaned temp file behind when the write fails.
            if created_temp:
                try:
                    os.unlink(output_path)
                except OSError:
                    pass
            raise
        print(f" Saved preprocessed audio: {output_path} ({len(audio) / sr:.2f}s)")

        return output_path

    def preprocess_to_array(self, input_path: str) -> tuple[np.ndarray, int]:
        """
        Preprocess an audio file and return as numpy array.

        Args:
            input_path: Path to the input audio file

        Returns:
            Tuple of (audio array, sample rate)
        """
        # Use temporary file approach for consistency
        temp_path = self.preprocess_file(input_path)
        try:
            audio, sr = librosa.load(temp_path, sr=None, mono=True)
        finally:
            # Remove the temp file even if reloading it fails.
            os.unlink(temp_path)
        return audio, sr

    def _normalize(self, audio: np.ndarray) -> np.ndarray:
        """Normalize audio to a consistent volume level.

        Empty or all-zero input is returned unchanged.
        """
        if not self.config.normalize:
            return audio

        # np.max raises on a zero-size array, so there is nothing to scale.
        if audio.size == 0:
            return audio

        max_val = np.max(np.abs(audio))
        if max_val > 0:
            # Normalize to 95% of max to avoid clipping
            audio = audio / max_val * 0.95
            print(" Applied volume normalization")
        return audio

    def _trim_silence(self, audio: np.ndarray, sr: int) -> np.ndarray:
        """Trim silence from the beginning and end of audio."""
        if not self.config.trim_silence:
            return audio

        trimmed, _ = librosa.effects.trim(
            audio,
            top_db=self.config.trim_top_db,
        )
        # Number of samples removed; converted to seconds only for the message.
        trimmed_duration = len(audio) - len(trimmed)
        if trimmed_duration > 0:
            print(f" Trimmed {trimmed_duration / sr:.2f}s of silence")
        return trimmed

    def _reduce_noise(self, audio: np.ndarray, sr: int) -> np.ndarray:
        """Apply noise reduction to the audio.

        Noise reduction is best-effort: on any failure the input audio is
        returned unchanged and a warning is printed.
        """
        if not self.config.reduce_noise:
            return audio

        try:
            reduced = nr.reduce_noise(
                y=audio,
                sr=sr,
                stationary=True,
                prop_decrease=0.75,
            )
            print(" Applied noise reduction")
            return reduced
        except Exception as e:
            # Deliberately broad: a failed denoise must not abort preprocessing.
            print(f" Warning: Noise reduction failed: {e}")
            return audio

    def _limit_length(self, audio: np.ndarray, sr: int) -> np.ndarray:
        """Limit audio to target length if specified."""
        if self.config.target_length_seconds is None:
            return audio

        max_samples = int(self.config.target_length_seconds * sr)
        if len(audio) > max_samples:
            # Keep the first max_samples samples.
            audio = audio[:max_samples]
            print(f" Trimmed to {self.config.target_length_seconds}s")
        return audio
|
|
|
|
|
|
def analyze_audio(file_path: str, target_sample_rate: int = 22050) -> dict:
    """
    Analyze an audio file and return its properties.

    Useful for debugging voice cloning issues.

    Args:
        file_path: Path to the audio file to inspect.
        target_sample_rate: Sample rate (Hz) the file is compared against
            for the ``needs_resampling`` flag.

    Returns:
        Dict with the keys ``path``, ``sample_rate``, ``duration_seconds``,
        ``is_stereo``, ``max_amplitude``, ``rms_level``,
        ``estimated_noise_floor``, ``is_normalized``, ``is_too_short``,
        ``is_too_long`` and ``needs_resampling``. Numeric and boolean values
        are plain Python ``float``/``bool`` objects.

    Raises:
        ValueError: If the file contains no audio samples.
    """
    # sr=None keeps the file's own sample rate; mono=False keeps the channels
    # so the stereo check below sees the original layout.
    audio, sr = librosa.load(file_path, sr=None, mono=False)

    is_stereo = audio.ndim > 1
    audio_mono = librosa.to_mono(audio) if is_stereo else audio

    if audio_mono.size == 0:
        # The reductions below fail on empty input; fail with a clear message.
        raise ValueError(f"Audio file contains no samples: {file_path}")

    duration = len(audio_mono) / sr
    max_amplitude = np.max(np.abs(audio_mono))
    rms = np.sqrt(np.mean(audio_mono**2))

    # Estimate noise level from quietest parts
    frame_length = int(sr * 0.025)  # 25 ms analysis window
    hop_length = int(sr * 0.010)  # 10 ms step between windows
    rms_frames = librosa.feature.rms(
        y=audio_mono,
        frame_length=frame_length,
        hop_length=hop_length,
    )[0]
    # The 10th percentile of frame energy stands in for the noise floor.
    noise_floor = np.percentile(rms_frames, 10)

    return {
        "path": file_path,
        "sample_rate": sr,
        "duration_seconds": float(duration),
        "is_stereo": bool(is_stereo),
        "max_amplitude": float(max_amplitude),
        "rms_level": float(rms),
        "estimated_noise_floor": float(noise_floor),
        # bool() turns the NumPy comparison result into a builtin bool so the
        # dict serializes like its other fields.
        "is_normalized": bool(max_amplitude > 0.8),
        "is_too_short": bool(duration < 3),
        "is_too_long": bool(duration > 30),
        "needs_resampling": bool(sr != target_sample_rate),
    }
|
|
|
|
|
|
def print_audio_analysis(file_path: str) -> None:
    """Print a human-readable report of ``analyze_audio(file_path)``.

    Each checked property is followed by ``[OK]`` or a ``[WARN]`` hint.
    """
    report = analyze_audio(file_path)
    rule = "=" * 50

    # Work out every status marker first, then emit the report in one pass.
    if report["needs_resampling"]:
        rate_flag = "[WARN] (should be 22050)"
    else:
        rate_flag = "[OK]"

    if report["is_too_short"]:
        duration_flag = " [WARN] (too short, aim for 5-15s)"
    elif report["is_too_long"]:
        duration_flag = " [WARN] (quite long, 5-15s is ideal)"
    else:
        duration_flag = " [OK]"

    if report["is_stereo"]:
        channel_name, channel_flag = "Stereo", "[WARN] (will convert to mono)"
    else:
        channel_name, channel_flag = "Mono", "[OK]"

    if report["is_normalized"]:
        level_flag = "[OK]"
    else:
        level_flag = "[WARN] (low volume)"

    print(f"\n{rule}")
    print(f"Audio Analysis: {report['path']}")
    print(f"{rule}")
    print(f" Sample Rate: {report['sample_rate']} Hz {rate_flag}")
    print(f" Duration: {report['duration_seconds']:.2f}s{duration_flag}")
    print(f" Channels: {channel_name} {channel_flag}")
    print(f" Max Amplitude: {report['max_amplitude']:.3f} {level_flag}")
    print(f" RMS Level: {report['rms_level']:.4f}")
    print(f" Noise Floor: {report['estimated_noise_floor']:.4f}")
    print(f"{rule}\n")
|