"""Audio preprocessing utilities for improving voice cloning quality.""" import os import tempfile from dataclasses import dataclass import librosa import noisereduce as nr import numpy as np import soundfile as sf @dataclass class PreprocessingConfig: """Configuration for audio preprocessing.""" target_sample_rate: int = 22050 normalize: bool = True trim_silence: bool = True trim_top_db: int = 20 reduce_noise: bool = True target_length_seconds: float | None = None # None means keep original length class AudioPreprocessor: """Preprocesses audio files for optimal voice cloning.""" def __init__(self, config: PreprocessingConfig | None = None): self.config = config or PreprocessingConfig() def preprocess_file(self, input_path: str, output_path: str | None = None) -> str: """ Preprocess an audio file for voice cloning. Args: input_path: Path to the input audio file output_path: Optional path for the output file. If None, creates a temp file. Returns: Path to the preprocessed audio file """ print(f"Preprocessing audio: {input_path}") # Load audio with librosa (automatically converts to mono and resamples) audio, sr = librosa.load( input_path, sr=self.config.target_sample_rate, mono=True ) print(f" Loaded audio: {len(audio) / sr:.2f}s at {sr}Hz") # Apply preprocessing steps audio = self._normalize(audio) audio = self._trim_silence(audio, sr) audio = self._reduce_noise(audio, sr) audio = self._limit_length(audio, sr) # Ensure we have valid audio if len(audio) < sr * 0.5: # Less than 0.5 seconds print(" Warning: Audio is very short after preprocessing!") # Save to output path if output_path is None: fd, output_path = tempfile.mkstemp(suffix=".wav") os.close(fd) sf.write(output_path, audio, sr, subtype="PCM_16") print(f" Saved preprocessed audio: {output_path} ({len(audio) / sr:.2f}s)") return output_path def preprocess_to_array(self, input_path: str) -> tuple[np.ndarray, int]: """ Preprocess an audio file and return as numpy array. Args: input_path: Path to the input audio file Returns: Tuple of (audio array, sample rate) """ # Use temporary file approach for consistency temp_path = self.preprocess_file(input_path) audio, sr = librosa.load(temp_path, sr=None, mono=True) os.unlink(temp_path) return audio, sr def _normalize(self, audio: np.ndarray) -> np.ndarray: """Normalize audio to a consistent volume level.""" if not self.config.normalize: return audio max_val = np.max(np.abs(audio)) if max_val > 0: # Normalize to 95% of max to avoid clipping audio = audio / max_val * 0.95 print(" Applied volume normalization") return audio def _trim_silence(self, audio: np.ndarray, sr: int) -> np.ndarray: """Trim silence from the beginning and end of audio.""" if not self.config.trim_silence: return audio trimmed, _ = librosa.effects.trim( audio, top_db=self.config.trim_top_db ) trimmed_duration = len(audio) - len(trimmed) if trimmed_duration > 0: print(f" Trimmed {trimmed_duration / sr:.2f}s of silence") return trimmed def _reduce_noise(self, audio: np.ndarray, sr: int) -> np.ndarray: """Apply noise reduction to the audio.""" if not self.config.reduce_noise: return audio try: reduced = nr.reduce_noise( y=audio, sr=sr, stationary=True, prop_decrease=0.75 ) print(" Applied noise reduction") return reduced except Exception as e: print(f" Warning: Noise reduction failed: {e}") return audio def _limit_length(self, audio: np.ndarray, sr: int) -> np.ndarray: """Limit audio to target length if specified.""" if self.config.target_length_seconds is None: return audio max_samples = int(self.config.target_length_seconds * sr) if len(audio) > max_samples: audio = audio[:max_samples] print(f" Trimmed to {self.config.target_length_seconds}s") return audio def analyze_audio(file_path: str) -> dict: """ Analyze an audio file and return its properties. Useful for debugging voice cloning issues. """ audio, sr = librosa.load(file_path, sr=None, mono=False) is_stereo = audio.ndim > 1 if is_stereo: audio_mono = librosa.to_mono(audio) else: audio_mono = audio duration = len(audio_mono) / sr max_amplitude = np.max(np.abs(audio_mono)) rms = np.sqrt(np.mean(audio_mono**2)) # Estimate noise level from quietest parts frame_length = int(sr * 0.025) hop_length = int(sr * 0.010) rms_frames = librosa.feature.rms( y=audio_mono, frame_length=frame_length, hop_length=hop_length )[0] noise_floor = np.percentile(rms_frames, 10) return { "path": file_path, "sample_rate": sr, "duration_seconds": duration, "is_stereo": is_stereo, "max_amplitude": float(max_amplitude), "rms_level": float(rms), "estimated_noise_floor": float(noise_floor), "is_normalized": max_amplitude > 0.8, "is_too_short": duration < 3, "is_too_long": duration > 30, "needs_resampling": sr != 22050, } def print_audio_analysis(file_path: str) -> None: """Print a formatted analysis of an audio file.""" info = analyze_audio(file_path) print(f"\n{'=' * 50}") print(f"Audio Analysis: {info['path']}") print(f"{'=' * 50}") print(f" Sample Rate: {info['sample_rate']} Hz {'[WARN] (should be 22050)' if info['needs_resampling'] else '[OK]'}") print(f" Duration: {info['duration_seconds']:.2f}s", end="") if info['is_too_short']: print(" [WARN] (too short, aim for 5-15s)") elif info['is_too_long']: print(" [WARN] (quite long, 5-15s is ideal)") else: print(" [OK]") print(f" Channels: {'Stereo' if info['is_stereo'] else 'Mono'} {'[WARN] (will convert to mono)' if info['is_stereo'] else '[OK]'}") print(f" Max Amplitude: {info['max_amplitude']:.3f} {'[OK]' if info['is_normalized'] else '[WARN] (low volume)'}") print(f" RMS Level: {info['rms_level']:.4f}") print(f" Noise Floor: {info['estimated_noise_floor']:.4f}") print(f"{'=' * 50}\n")