Initial commit

This commit is contained in:
2026-01-18 17:08:37 -06:00
commit ae1c2a65d3
28 changed files with 719 additions and 0 deletions

205
audio_preprocessor.py Normal file
View File

@@ -0,0 +1,205 @@
"""Audio preprocessing utilities for improving voice cloning quality."""
import os
import tempfile
from dataclasses import dataclass
import librosa
import noisereduce as nr
import numpy as np
import soundfile as sf
@dataclass
class PreprocessingConfig:
"""Configuration for audio preprocessing."""
target_sample_rate: int = 22050
normalize: bool = True
trim_silence: bool = True
trim_top_db: int = 20
reduce_noise: bool = True
target_length_seconds: float | None = None # None means keep original length
class AudioPreprocessor:
    """Preprocesses audio files for optimal voice cloning.

    ``preprocess_file`` runs the steps in a fixed order: load (mono,
    resampled to the configured rate) -> normalize -> trim silence ->
    reduce noise -> limit length -> write a 16-bit PCM WAV file. Every
    step except loading and writing is switched on or off by the config.
    """

    def __init__(self, config: PreprocessingConfig | None = None):
        """Store *config*, falling back to the default configuration."""
        self.config = config or PreprocessingConfig()

    def preprocess_file(self, input_path: str, output_path: str | None = None) -> str:
        """
        Preprocess an audio file for voice cloning.

        Args:
            input_path: Path to the input audio file
            output_path: Optional path for the output file. If None, creates a temp file.

        Returns:
            Path to the preprocessed audio file

        Raises:
            ValueError: If the input file contains no audio samples.
        """
        print(f"Preprocessing audio: {input_path}")

        # Load audio with librosa (automatically converts to mono and resamples)
        audio, sr = librosa.load(
            input_path,
            sr=self.config.target_sample_rate,
            mono=True,
        )
        if audio.size == 0:
            # Fail here with a readable message rather than deep inside a
            # processing step with a zero-size-array reduction error.
            raise ValueError(f"Audio file contains no samples: {input_path}")
        print(f" Loaded audio: {len(audio) / sr:.2f}s at {sr}Hz")

        # Apply preprocessing steps
        audio = self._normalize(audio)
        audio = self._trim_silence(audio, sr)
        audio = self._reduce_noise(audio, sr)
        audio = self._limit_length(audio, sr)

        # Ensure we have valid audio
        if len(audio) < sr * 0.5:  # Less than 0.5 seconds
            print(" Warning: Audio is very short after preprocessing!")

        # Save to output path
        owns_temp_file = output_path is None
        if owns_temp_file:
            fd, output_path = tempfile.mkstemp(suffix=".wav")
            # Only the path is needed; soundfile opens the file itself.
            os.close(fd)
        try:
            sf.write(output_path, audio, sr, subtype="PCM_16")
        except Exception:
            # Do not leave behind a temp file the caller never learns about.
            if owns_temp_file:
                try:
                    os.unlink(output_path)
                except OSError:
                    pass
            raise

        print(f" Saved preprocessed audio: {output_path} ({len(audio) / sr:.2f}s)")
        return output_path

    def preprocess_to_array(self, input_path: str) -> tuple[np.ndarray, int]:
        """
        Preprocess an audio file and return as numpy array.

        The audio is written to a temporary 16-bit PCM file and read back,
        so the returned samples match what ``preprocess_file`` would have
        saved to disk.

        Args:
            input_path: Path to the input audio file

        Returns:
            Tuple of (audio array, sample rate)
        """
        # Use temporary file approach for consistency
        temp_path = self.preprocess_file(input_path)
        try:
            audio, sr = librosa.load(temp_path, sr=None, mono=True)
        finally:
            # Remove the temp file even when reading it back fails.
            os.unlink(temp_path)
        return audio, sr

    def _normalize(self, audio: np.ndarray) -> np.ndarray:
        """Normalize audio to a consistent volume level.

        Returns the input unchanged when normalization is disabled, when
        the array is empty, or when the signal is entirely zero.
        """
        if not self.config.normalize or audio.size == 0:
            return audio

        max_val = np.max(np.abs(audio))
        if max_val > 0:
            # Normalize to 95% of max to avoid clipping
            audio = audio / max_val * 0.95
            print(" Applied volume normalization")
        return audio

    def _trim_silence(self, audio: np.ndarray, sr: int) -> np.ndarray:
        """Trim silence from the beginning and end of audio."""
        if not self.config.trim_silence:
            return audio

        trimmed, _ = librosa.effects.trim(
            audio,
            top_db=self.config.trim_top_db,
        )
        removed_samples = len(audio) - len(trimmed)
        if removed_samples > 0:
            print(f" Trimmed {removed_samples / sr:.2f}s of silence")
        return trimmed

    def _reduce_noise(self, audio: np.ndarray, sr: int) -> np.ndarray:
        """Apply noise reduction to the audio.

        Noise reduction is best-effort: if it fails, a warning is printed
        and the unmodified audio is returned so the pipeline can continue.
        """
        if not self.config.reduce_noise:
            return audio

        try:
            reduced = nr.reduce_noise(
                y=audio,
                sr=sr,
                stationary=True,
                prop_decrease=0.75,
            )
        except Exception as e:  # deliberate: never abort preprocessing here
            print(f" Warning: Noise reduction failed: {e}")
            return audio

        print(" Applied noise reduction")
        return reduced

    def _limit_length(self, audio: np.ndarray, sr: int) -> np.ndarray:
        """Limit audio to target length if specified.

        Raises:
            ValueError: If the configured target length is negative.
        """
        target_seconds = self.config.target_length_seconds
        if target_seconds is None:
            return audio
        if target_seconds < 0:
            # A negative bound would slice from the end of the array and
            # silently drop the tail instead of capping the length.
            raise ValueError(
                f"target_length_seconds must not be negative, got {target_seconds}"
            )

        max_samples = int(target_seconds * sr)
        if len(audio) > max_samples:
            audio = audio[:max_samples]
            print(f" Trimmed to {self.config.target_length_seconds}s")
        return audio
def analyze_audio(file_path: str, target_sample_rate: int = 22050) -> dict:
    """
    Analyze an audio file and return its properties.
    Useful for debugging voice cloning issues.

    Args:
        file_path: Path to the audio file to inspect. It is loaded at its
            native sample rate with all channels preserved.
        target_sample_rate: Sample rate the file is compared against to
            decide whether it ``needs_resampling``.

    Returns:
        A dict of plain Python values (safe to serialize) with the keys
        ``path``, ``sample_rate``, ``duration_seconds``, ``is_stereo``,
        ``max_amplitude``, ``rms_level``, ``estimated_noise_floor``,
        ``is_normalized``, ``is_too_short``, ``is_too_long`` and
        ``needs_resampling``.
    """
    audio, sr = librosa.load(file_path, sr=None, mono=False)

    # With mono=False a multi-channel file comes back with more than one axis.
    is_stereo = audio.ndim > 1
    audio_mono = librosa.to_mono(audio) if is_stereo else audio

    duration = len(audio_mono) / sr

    if audio_mono.size == 0:
        # Reductions over an empty array raise; report a silent, empty clip.
        max_amplitude = 0.0
        rms = 0.0
        noise_floor = 0.0
    else:
        max_amplitude = float(np.max(np.abs(audio_mono)))
        rms = float(np.sqrt(np.mean(audio_mono**2)))

        # Estimate noise level from quietest parts: take the RMS of short
        # frames (25 ms window, 10 ms hop) and use the 10th percentile.
        frame_length = int(sr * 0.025)
        hop_length = int(sr * 0.010)
        rms_frames = librosa.feature.rms(
            y=audio_mono,
            frame_length=frame_length,
            hop_length=hop_length,
        )[0]
        noise_floor = float(np.percentile(rms_frames, 10))

    return {
        "path": file_path,
        "sample_rate": sr,
        "duration_seconds": duration,
        "is_stereo": is_stereo,
        "max_amplitude": max_amplitude,
        "rms_level": rms,
        "estimated_noise_floor": noise_floor,
        # bool(): a NumPy comparison result is numpy.bool_, which the json
        # module cannot serialize.
        "is_normalized": bool(max_amplitude > 0.8),
        "is_too_short": duration < 3,
        "is_too_long": duration > 30,
        "needs_resampling": sr != target_sample_rate,
    }
def print_audio_analysis(file_path: str) -> None:
    """Print a human-readable report of ``analyze_audio(file_path)``.

    Each property is printed on its own line between two separator rules;
    values outside the recommended range are followed by a warning note.
    """
    report = analyze_audio(file_path)
    rule = '=' * 50

    # Work out every warning annotation first, then emit the report.
    if report['needs_resampling']:
        rate_note = '⚠️ (should be 22050)'
    else:
        rate_note = ''

    if report['is_too_short']:
        duration_note = " ⚠️ (too short, aim for 5-15s)"
    elif report['is_too_long']:
        duration_note = " ⚠️ (quite long, 5-15s is ideal)"
    else:
        duration_note = ""

    if report['is_stereo']:
        channel_label = 'Stereo'
        channel_note = '⚠️ (will convert to mono)'
    else:
        channel_label = 'Mono'
        channel_note = ''

    if report['is_normalized']:
        volume_note = ''
    else:
        volume_note = '⚠️ (low volume)'

    print(f"\n{rule}")
    print(f"Audio Analysis: {report['path']}")
    print(f"{rule}")
    print(f" Sample Rate: {report['sample_rate']} Hz {rate_note}")
    print(f" Duration: {report['duration_seconds']:.2f}s" + duration_note)
    print(f" Channels: {channel_label} {channel_note}")
    print(f" Max Amplitude: {report['max_amplitude']:.3f} {volume_note}")
    print(f" RMS Level: {report['rms_level']:.4f}")
    print(f" Noise Floor: {report['estimated_noise_floor']:.4f}")
    print(f"{rule}\n")