Initial commit
This commit is contained in:
205
audio_preprocessor.py
Normal file
205
audio_preprocessor.py
Normal file
@@ -0,0 +1,205 @@
|
||||
"""Audio preprocessing utilities for improving voice cloning quality."""
|
||||
|
||||
import os
|
||||
import tempfile
|
||||
from dataclasses import dataclass
|
||||
|
||||
import librosa
|
||||
import noisereduce as nr
|
||||
import numpy as np
|
||||
import soundfile as sf
|
||||
|
||||
|
||||
@dataclass
|
||||
class PreprocessingConfig:
|
||||
"""Configuration for audio preprocessing."""
|
||||
|
||||
target_sample_rate: int = 22050
|
||||
normalize: bool = True
|
||||
trim_silence: bool = True
|
||||
trim_top_db: int = 20
|
||||
reduce_noise: bool = True
|
||||
target_length_seconds: float | None = None # None means keep original length
|
||||
|
||||
|
||||
class AudioPreprocessor:
    """Preprocesses audio files for optimal voice cloning.

    A file is loaded as mono at the configured sample rate and then run
    through peak normalization, silence trimming, noise reduction and an
    optional length cap, in that order. Each step returns its input
    unchanged when it is disabled in the configuration.
    """

    def __init__(self, config: PreprocessingConfig | None = None):
        """
        Create a preprocessor.

        Args:
            config: Settings for the individual steps. A default
                ``PreprocessingConfig()`` is used when omitted.
        """
        self.config = config if config is not None else PreprocessingConfig()

    def preprocess_file(self, input_path: str, output_path: str | None = None) -> str:
        """
        Preprocess an audio file for voice cloning.

        Args:
            input_path: Path to the input audio file
            output_path: Optional path for the output file. If None, creates a temp file.

        Returns:
            Path to the preprocessed audio file
        """
        print(f"Preprocessing audio: {input_path}")

        # Load audio with librosa (automatically converts to mono and resamples)
        audio, sr = librosa.load(
            input_path,
            sr=self.config.target_sample_rate,
            mono=True,
        )
        print(f" Loaded audio: {len(audio) / sr:.2f}s at {sr}Hz")

        # Apply preprocessing steps
        audio = self._normalize(audio)
        audio = self._trim_silence(audio, sr)
        audio = self._reduce_noise(audio, sr)
        audio = self._limit_length(audio, sr)

        # Ensure we have valid audio
        if len(audio) < sr * 0.5:  # Less than 0.5 seconds
            print(" Warning: Audio is very short after preprocessing!")

        # Save to output path
        created_temp = output_path is None
        if created_temp:
            fd, output_path = tempfile.mkstemp(suffix=".wav")
            os.close(fd)

        try:
            sf.write(output_path, audio, sr, subtype="PCM_16")
        except BaseException:
            # Only clean up files this method created itself; a caller-supplied
            # path is the caller's to manage.
            if created_temp:
                self._discard_file(output_path)
            raise

        print(f" Saved preprocessed audio: {output_path} ({len(audio) / sr:.2f}s)")

        return output_path

    def preprocess_to_array(self, input_path: str) -> tuple[np.ndarray, int]:
        """
        Preprocess an audio file and return as numpy array.

        Args:
            input_path: Path to the input audio file

        Returns:
            Tuple of (audio array, sample rate)
        """
        # Use temporary file approach for consistency
        temp_path = self.preprocess_file(input_path)
        try:
            audio, sr = librosa.load(temp_path, sr=None, mono=True)
        finally:
            # Remove the temp file even when the reload fails.
            os.unlink(temp_path)
        return audio, sr

    @staticmethod
    def _discard_file(path: str) -> None:
        """Best-effort removal of a file; a failure to delete is ignored."""
        try:
            os.unlink(path)
        except OSError:
            pass

    def _normalize(self, audio: np.ndarray) -> np.ndarray:
        """Normalize audio to a consistent volume level.

        The signal is scaled so its peak absolute value becomes 0.95. Empty
        or all-zero audio is returned unchanged.
        """
        if not self.config.normalize:
            return audio

        # np.max has no identity for a zero-length array and would raise.
        if audio.size == 0:
            return audio

        peak = np.max(np.abs(audio))
        if peak > 0:
            # Normalize to 95% of max to avoid clipping
            audio = audio / peak * 0.95
            print(" Applied volume normalization")
        return audio

    def _trim_silence(self, audio: np.ndarray, sr: int) -> np.ndarray:
        """Trim silence from the beginning and end of audio.

        Args:
            audio: Signal to trim.
            sr: Sample rate, used only to report the removed duration.
        """
        if not self.config.trim_silence:
            return audio

        trimmed, _ = librosa.effects.trim(
            audio,
            top_db=self.config.trim_top_db,
        )
        removed_samples = len(audio) - len(trimmed)
        if removed_samples > 0:
            print(f" Trimmed {removed_samples / sr:.2f}s of silence")
        return trimmed

    def _reduce_noise(self, audio: np.ndarray, sr: int) -> np.ndarray:
        """Apply noise reduction to the audio.

        This step is best-effort: if the noise reducer raises, a warning is
        printed and the input is returned unchanged.
        """
        if not self.config.reduce_noise:
            return audio

        try:
            reduced = nr.reduce_noise(
                y=audio,
                sr=sr,
                stationary=True,
                prop_decrease=0.75,
            )
        except Exception as e:  # deliberate: never fail the pipeline on this step
            print(f" Warning: Noise reduction failed: {e}")
            return audio

        print(" Applied noise reduction")
        return reduced

    def _limit_length(self, audio: np.ndarray, sr: int) -> np.ndarray:
        """Limit audio to target length if specified.

        Raises:
            ValueError: If the configured target length is negative.
        """
        target_seconds = self.config.target_length_seconds
        if target_seconds is None:
            return audio

        # A negative bound would slice samples off the end of the clip rather
        # than cap its length, so fail loudly instead.
        if target_seconds < 0:
            raise ValueError(
                f"target_length_seconds must be non-negative, got {target_seconds}"
            )

        max_samples = int(target_seconds * sr)
        if len(audio) > max_samples:
            audio = audio[:max_samples]
            print(f" Trimmed to {target_seconds}s")
        return audio
|
||||
|
||||
|
||||
def analyze_audio(file_path: str, target_sample_rate: int = 22050) -> dict:
    """
    Analyze an audio file and return its properties.

    Useful for debugging voice cloning issues.

    Args:
        file_path: Path to the audio file to inspect.
        target_sample_rate: Sample rate the file is compared against for the
            ``needs_resampling`` flag.

    Returns:
        A dict of plain Python values with the keys ``path``, ``sample_rate``,
        ``duration_seconds``, ``is_stereo``, ``max_amplitude``, ``rms_level``,
        ``estimated_noise_floor``, ``is_normalized``, ``is_too_short``,
        ``is_too_long`` and ``needs_resampling``. For a file with no samples
        the three level measurements are reported as 0.0.
    """
    # Load at the native rate and keep the channel layout so both can be reported.
    audio, sr = librosa.load(file_path, sr=None, mono=False)

    is_stereo = audio.ndim > 1
    audio_mono = librosa.to_mono(audio) if is_stereo else audio

    duration = len(audio_mono) / sr

    if audio_mono.size:
        max_amplitude = float(np.max(np.abs(audio_mono)))
        rms = float(np.sqrt(np.mean(audio_mono**2)))

        # Estimate noise level from quietest parts: 10th percentile of the
        # RMS over 25 ms frames taken every 10 ms.
        frame_length = int(sr * 0.025)
        hop_length = int(sr * 0.010)
        rms_frames = librosa.feature.rms(
            y=audio_mono,
            frame_length=frame_length,
            hop_length=hop_length,
        )[0]
        noise_floor = float(np.percentile(rms_frames, 10))
    else:
        # The reductions above raise on zero-length input.
        max_amplitude = rms = noise_floor = 0.0

    return {
        "path": file_path,
        "sample_rate": sr,
        "duration_seconds": float(duration),
        "is_stereo": bool(is_stereo),
        "max_amplitude": max_amplitude,
        "rms_level": rms,
        "estimated_noise_floor": noise_floor,
        # bool() keeps the flags as plain Python bools (not numpy.bool_) so
        # the result stays serializable alongside the float() values.
        "is_normalized": bool(max_amplitude > 0.8),
        "is_too_short": bool(duration < 3),
        "is_too_long": bool(duration > 30),
        "needs_resampling": bool(sr != target_sample_rate),
    }
|
||||
|
||||
|
||||
def print_audio_analysis(file_path: str) -> None:
    """Print a human-readable report for one audio file.

    The figures come from ``analyze_audio``; every line carries either a
    check mark or a warning describing what is off.

    Args:
        file_path: Path to the audio file to report on.
    """
    report = analyze_audio(file_path)
    rule = "=" * 50

    # Work out each status marker first, then print the report in one pass.
    if report['needs_resampling']:
        rate_note = "⚠️ (should be 22050)"
    else:
        rate_note = "✓"

    if report['is_too_short']:
        duration_note = " ⚠️ (too short, aim for 5-15s)"
    elif report['is_too_long']:
        duration_note = " ⚠️ (quite long, 5-15s is ideal)"
    else:
        duration_note = " ✓"

    if report['is_stereo']:
        channel_name, channel_note = "Stereo", "⚠️ (will convert to mono)"
    else:
        channel_name, channel_note = "Mono", "✓"

    if report['is_normalized']:
        level_note = "✓"
    else:
        level_note = "⚠️ (low volume)"

    print(f"\n{rule}")
    print(f"Audio Analysis: {report['path']}")
    print(f"{rule}")
    print(f" Sample Rate: {report['sample_rate']} Hz {rate_note}")
    print(f" Duration: {report['duration_seconds']:.2f}s{duration_note}")
    print(f" Channels: {channel_name} {channel_note}")
    print(f" Max Amplitude: {report['max_amplitude']:.3f} {level_note}")
    print(f" RMS Level: {report['rms_level']:.4f}")
    print(f" Noise Floor: {report['estimated_noise_floor']:.4f}")
    print(f"{rule}\n")
|
||||
Reference in New Issue
Block a user