Files
Vox/audio_preprocessor.py
Spencer 9917d44f5d docs: add HuggingFace cache troubleshooting to README
- Document HF_HOME environment variable for writable cache
- Add systemd service permission guidance for /tmp paths
- Troubleshooting steps for read-only file system errors
2026-02-26 15:56:09 -06:00

206 lines
6.7 KiB
Python
Executable File

"""Audio preprocessing utilities for improving voice cloning quality."""
import os
import tempfile
from dataclasses import dataclass
import librosa
import noisereduce as nr
import numpy as np
import soundfile as sf
@dataclass
class PreprocessingConfig:
"""Configuration for audio preprocessing."""
target_sample_rate: int = 22050
normalize: bool = True
trim_silence: bool = True
trim_top_db: int = 20
reduce_noise: bool = True
target_length_seconds: float | None = None # None means keep original length
class AudioPreprocessor:
"""Preprocesses audio files for optimal voice cloning."""
def __init__(self, config: PreprocessingConfig | None = None):
self.config = config or PreprocessingConfig()
def preprocess_file(self, input_path: str, output_path: str | None = None) -> str:
"""
Preprocess an audio file for voice cloning.
Args:
input_path: Path to the input audio file
output_path: Optional path for the output file. If None, creates a temp file.
Returns:
Path to the preprocessed audio file
"""
print(f"Preprocessing audio: {input_path}")
# Load audio with librosa (automatically converts to mono and resamples)
audio, sr = librosa.load(
input_path,
sr=self.config.target_sample_rate,
mono=True
)
print(f" Loaded audio: {len(audio) / sr:.2f}s at {sr}Hz")
# Apply preprocessing steps
audio = self._normalize(audio)
audio = self._trim_silence(audio, sr)
audio = self._reduce_noise(audio, sr)
audio = self._limit_length(audio, sr)
# Ensure we have valid audio
if len(audio) < sr * 0.5: # Less than 0.5 seconds
print(" Warning: Audio is very short after preprocessing!")
# Save to output path
if output_path is None:
fd, output_path = tempfile.mkstemp(suffix=".wav")
os.close(fd)
sf.write(output_path, audio, sr, subtype="PCM_16")
print(f" Saved preprocessed audio: {output_path} ({len(audio) / sr:.2f}s)")
return output_path
def preprocess_to_array(self, input_path: str) -> tuple[np.ndarray, int]:
"""
Preprocess an audio file and return as numpy array.
Args:
input_path: Path to the input audio file
Returns:
Tuple of (audio array, sample rate)
"""
# Use temporary file approach for consistency
temp_path = self.preprocess_file(input_path)
audio, sr = librosa.load(temp_path, sr=None, mono=True)
os.unlink(temp_path)
return audio, sr
def _normalize(self, audio: np.ndarray) -> np.ndarray:
"""Normalize audio to a consistent volume level."""
if not self.config.normalize:
return audio
max_val = np.max(np.abs(audio))
if max_val > 0:
# Normalize to 95% of max to avoid clipping
audio = audio / max_val * 0.95
print(" Applied volume normalization")
return audio
def _trim_silence(self, audio: np.ndarray, sr: int) -> np.ndarray:
"""Trim silence from the beginning and end of audio."""
if not self.config.trim_silence:
return audio
trimmed, _ = librosa.effects.trim(
audio,
top_db=self.config.trim_top_db
)
trimmed_duration = len(audio) - len(trimmed)
if trimmed_duration > 0:
print(f" Trimmed {trimmed_duration / sr:.2f}s of silence")
return trimmed
def _reduce_noise(self, audio: np.ndarray, sr: int) -> np.ndarray:
"""Apply noise reduction to the audio."""
if not self.config.reduce_noise:
return audio
try:
reduced = nr.reduce_noise(
y=audio,
sr=sr,
stationary=True,
prop_decrease=0.75
)
print(" Applied noise reduction")
return reduced
except Exception as e:
print(f" Warning: Noise reduction failed: {e}")
return audio
def _limit_length(self, audio: np.ndarray, sr: int) -> np.ndarray:
"""Limit audio to target length if specified."""
if self.config.target_length_seconds is None:
return audio
max_samples = int(self.config.target_length_seconds * sr)
if len(audio) > max_samples:
audio = audio[:max_samples]
print(f" Trimmed to {self.config.target_length_seconds}s")
return audio
def analyze_audio(file_path: str, target_sample_rate: int = 22050) -> dict:
    """
    Analyze an audio file and return its properties.

    Useful for debugging voice cloning issues.

    Args:
        file_path: Path to the audio file to inspect.
        target_sample_rate: Sample rate the file is compared against for the
            ``needs_resampling`` flag.

    Returns:
        A dict of built-in Python values with the keys ``path``,
        ``sample_rate``, ``duration_seconds``, ``is_stereo``,
        ``max_amplitude``, ``rms_level``, ``estimated_noise_floor``,
        ``is_normalized``, ``is_too_short``, ``is_too_long`` and
        ``needs_resampling``.
    """
    # Load at the native sample rate and keep all channels so both can be reported.
    audio, sr = librosa.load(file_path, sr=None, mono=False)

    is_stereo = audio.ndim > 1
    audio_mono = librosa.to_mono(audio) if is_stereo else audio

    duration = len(audio_mono) / sr

    if audio_mono.size == 0:
        # Reductions such as np.max raise on empty input; report silence instead.
        max_amplitude = 0.0
        rms = 0.0
        noise_floor = 0.0
    else:
        # Convert to built-in floats so comparisons below yield plain bools.
        max_amplitude = float(np.max(np.abs(audio_mono)))
        rms = float(np.sqrt(np.mean(audio_mono**2)))

        # Estimate noise level from quietest parts: RMS over 25 ms frames
        # with a 10 ms hop, then take the 10th percentile.
        frame_length = int(sr * 0.025)
        hop_length = int(sr * 0.010)
        rms_frames = librosa.feature.rms(
            y=audio_mono,
            frame_length=frame_length,
            hop_length=hop_length,
        )[0]
        noise_floor = float(np.percentile(rms_frames, 10))

    return {
        "path": file_path,
        "sample_rate": sr,
        "duration_seconds": duration,
        "is_stereo": is_stereo,
        "max_amplitude": max_amplitude,
        "rms_level": rms,
        "estimated_noise_floor": noise_floor,
        "is_normalized": max_amplitude > 0.8,
        "is_too_short": duration < 3,
        "is_too_long": duration > 30,
        "needs_resampling": sr != target_sample_rate,
    }
def print_audio_analysis(file_path: str) -> None:
    """Print a formatted analysis of an audio file.

    Runs ``analyze_audio`` on *file_path* and prints one line per property,
    each tagged ``[OK]`` or ``[WARN]`` according to the flags in the result.

    Args:
        file_path: Path to the audio file to analyze.
    """
    info = analyze_audio(file_path)
    rule = "=" * 50

    print(f"\n{rule}")
    print(f"Audio Analysis: {info['path']}")
    print(rule)

    rate_status = "[WARN] (should be 22050)" if info["needs_resampling"] else "[OK]"
    print(f" Sample Rate: {info['sample_rate']} Hz {rate_status}")

    # "Too short" takes precedence over "too long" when choosing the status.
    if info["is_too_short"]:
        duration_status = " [WARN] (too short, aim for 5-15s)"
    elif info["is_too_long"]:
        duration_status = " [WARN] (quite long, 5-15s is ideal)"
    else:
        duration_status = " [OK]"
    print(f" Duration: {info['duration_seconds']:.2f}s{duration_status}")

    if info["is_stereo"]:
        channels, channel_status = "Stereo", "[WARN] (will convert to mono)"
    else:
        channels, channel_status = "Mono", "[OK]"
    print(f" Channels: {channels} {channel_status}")

    level_status = "[OK]" if info["is_normalized"] else "[WARN] (low volume)"
    print(f" Max Amplitude: {info['max_amplitude']:.3f} {level_status}")
    print(f" RMS Level: {info['rms_level']:.4f}")
    print(f" Noise Floor: {info['estimated_noise_floor']:.4f}")
    print(f"{rule}\n")