Initial commit

2026-01-18 17:08:37 -06:00
commit ae1c2a65d3
28 changed files with 719 additions and 0 deletions
--- a/audio_preprocessor.py
+++ b/audio_preprocessor.py
@@ -0,0 +1,205 @@
+"""Audio preprocessing utilities for improving voice cloning quality."""
+
+import os
+import tempfile
+from dataclasses import dataclass
+
+import librosa
+import noisereduce as nr
+import numpy as np
+import soundfile as sf
+
+
+@dataclass
+class PreprocessingConfig:
+    """Configuration for audio preprocessing.
+
+    Raises:
+        ValueError: If ``target_sample_rate`` is not positive, or if
+            ``target_length_seconds`` is given but is not positive.
+    """
+
+    # Sample rate (Hz) the audio is resampled to when it is loaded.
+    target_sample_rate: int = 22050
+    # Scale the signal so its peak sits at 95% of full scale.
+    normalize: bool = True
+    # Strip leading/trailing silence.
+    trim_silence: bool = True
+    # Passed as ``top_db`` to ``librosa.effects.trim`` when trimming silence.
+    trim_top_db: int = 20
+    # Run noise reduction on the signal.
+    reduce_noise: bool = True
+    # Maximum output length in seconds; longer audio is cut at the end.
+    target_length_seconds: float | None = None  # None means keep original length
+
+    def __post_init__(self) -> None:
+        """Reject values that would otherwise fail or misbehave later on."""
+        if self.target_sample_rate <= 0:
+            raise ValueError(
+                f"target_sample_rate must be positive, got {self.target_sample_rate}"
+            )
+        # A negative length would turn the later ``audio[:max_samples]`` slice
+        # into "drop samples from the end", which is never what was intended.
+        if self.target_length_seconds is not None and self.target_length_seconds <= 0:
+            raise ValueError(
+                "target_length_seconds must be positive or None, "
+                f"got {self.target_length_seconds}"
+            )
+
+
+class AudioPreprocessor:
+    """Preprocesses audio files for optimal voice cloning.
+
+    The file is loaded as mono at ``config.target_sample_rate`` and then,
+    depending on the config flags, peak-normalized, trimmed of leading and
+    trailing silence, noise-reduced and truncated, in that order.
+    Progress messages are printed to stdout.
+    """
+
+    def __init__(self, config: PreprocessingConfig | None = None):
+        """Store *config*, or a default ``PreprocessingConfig`` when None."""
+        self.config = PreprocessingConfig() if config is None else config
+
+    def preprocess_file(self, input_path: str, output_path: str | None = None) -> str:
+        """
+        Preprocess an audio file for voice cloning.
+
+        Args:
+            input_path: Path to the input audio file
+            output_path: Optional path for the output file. If None, creates a
+                temp ``.wav`` file that the caller is responsible for deleting.
+
+        Returns:
+            Path to the preprocessed audio file (written with subtype PCM_16)
+        """
+        print(f"Preprocessing audio: {input_path}")
+
+        # Load audio with librosa (automatically converts to mono and resamples)
+        audio, sr = librosa.load(
+            input_path,
+            sr=self.config.target_sample_rate,
+            mono=True
+        )
+        print(f"  Loaded audio: {len(audio) / sr:.2f}s at {sr}Hz")
+
+        # Apply preprocessing steps
+        audio = self._normalize(audio)
+        audio = self._trim_silence(audio, sr)
+        audio = self._reduce_noise(audio, sr)
+        audio = self._limit_length(audio, sr)
+
+        # Ensure we have valid audio
+        if len(audio) < sr * 0.5:  # Less than 0.5 seconds
+            print("  Warning: Audio is very short after preprocessing!")
+
+        # Save to output path. The temp file is only created once processing
+        # has succeeded, so earlier failures leave nothing behind.
+        made_temp_file = output_path is None
+        if made_temp_file:
+            fd, output_path = tempfile.mkstemp(suffix=".wav")
+            os.close(fd)
+
+        try:
+            sf.write(output_path, audio, sr, subtype="PCM_16")
+        except Exception:
+            # Do not leak the temp file we created ourselves if writing fails.
+            if made_temp_file:
+                try:
+                    os.unlink(output_path)
+                except OSError:
+                    pass
+            raise
+        print(f"  Saved preprocessed audio: {output_path} ({len(audio) / sr:.2f}s)")
+
+        return output_path
+
+    def preprocess_to_array(self, input_path: str) -> tuple[np.ndarray, int]:
+        """
+        Preprocess an audio file and return as numpy array.
+
+        Args:
+            input_path: Path to the input audio file
+
+        Returns:
+            Tuple of (audio array, sample rate)
+        """
+        # Use temporary file approach for consistency
+        temp_path = self.preprocess_file(input_path)
+        try:
+            audio, sr = librosa.load(temp_path, sr=None, mono=True)
+        finally:
+            # Always remove the temp file, even when reloading it fails.
+            os.unlink(temp_path)
+        return audio, sr
+
+    def _normalize(self, audio: np.ndarray) -> np.ndarray:
+        """Normalize audio to a consistent volume level.
+
+        Returns the input unchanged when normalization is disabled, or when
+        the signal is empty or entirely zero.
+        """
+        # np.max raises on a zero-size array, so empty audio is passed through.
+        if not self.config.normalize or audio.size == 0:
+            return audio
+
+        max_val = np.max(np.abs(audio))
+        if max_val > 0:
+            # Normalize to 95% of max to avoid clipping
+            audio = audio / max_val * 0.95
+            print("  Applied volume normalization")
+        return audio
+
+    def _trim_silence(self, audio: np.ndarray, sr: int) -> np.ndarray:
+        """Trim silence from the beginning and end of audio.
+
+        Uses ``librosa.effects.trim`` with ``top_db=config.trim_top_db``.
+        """
+        # Nothing to trim from an empty signal.
+        if not self.config.trim_silence or audio.size == 0:
+            return audio
+
+        trimmed, _ = librosa.effects.trim(
+            audio,
+            top_db=self.config.trim_top_db
+        )
+        removed_samples = len(audio) - len(trimmed)
+        if removed_samples > 0:
+            print(f"  Trimmed {removed_samples / sr:.2f}s of silence")
+        return trimmed
+
+    def _reduce_noise(self, audio: np.ndarray, sr: int) -> np.ndarray:
+        """Apply noise reduction to the audio.
+
+        Best effort: if ``noisereduce`` raises, a warning is printed and the
+        input is returned unchanged so the rest of the pipeline still runs.
+        """
+        if not self.config.reduce_noise:
+            return audio
+
+        try:
+            reduced = nr.reduce_noise(
+                y=audio,
+                sr=sr,
+                stationary=True,
+                prop_decrease=0.75
+            )
+            print("  Applied noise reduction")
+            return reduced
+        except Exception as e:
+            print(f"  Warning: Noise reduction failed: {e}")
+            return audio
+
+    def _limit_length(self, audio: np.ndarray, sr: int) -> np.ndarray:
+        """Limit audio to target length if specified.
+
+        Raises:
+            ValueError: If ``config.target_length_seconds`` is set but not
+                positive.
+        """
+        target_seconds = self.config.target_length_seconds
+        if target_seconds is None:
+            return audio
+        # A negative value would make the slice below drop samples from the
+        # end instead of capping the length.
+        if target_seconds <= 0:
+            raise ValueError(
+                f"target_length_seconds must be positive or None, got {target_seconds}"
+            )
+
+        max_samples = int(target_seconds * sr)
+        if len(audio) > max_samples:
+            audio = audio[:max_samples]
+            print(f"  Trimmed to {target_seconds}s")
+        return audio
+
+
+def analyze_audio(file_path: str, target_sample_rate: int = 22050) -> dict:
+    """
+    Analyze an audio file and return its properties.
+    Useful for debugging voice cloning issues.
+
+    Args:
+        file_path: Path to the audio file to inspect. It is loaded at its
+            native sample rate with its channels preserved.
+        target_sample_rate: Sample rate the file is compared against for the
+            "needs_resampling" flag.
+
+    Returns:
+        Dict with the keys "path", "sample_rate", "duration_seconds",
+        "is_stereo", "max_amplitude", "rms_level", "estimated_noise_floor",
+        "is_normalized", "is_too_short", "is_too_long" and
+        "needs_resampling". Levels are plain floats and flags plain bools.
+    """
+    audio, sr = librosa.load(file_path, sr=None, mono=False)
+
+    # With mono=False a multi-channel file comes back as a 2-D array.
+    is_stereo = audio.ndim > 1
+    audio_mono = librosa.to_mono(audio) if is_stereo else audio
+
+    duration = len(audio_mono) / sr
+
+    if audio_mono.size == 0:
+        # np.max / np.percentile raise on empty input; report silence instead.
+        max_amplitude = 0.0
+        rms = 0.0
+        noise_floor = 0.0
+    else:
+        max_amplitude = np.max(np.abs(audio_mono))
+        rms = np.sqrt(np.mean(audio_mono**2))
+
+        # Estimate noise level from quietest parts: 25 ms frames with a 10 ms
+        # hop, taking the 10th percentile of the per-frame RMS.
+        frame_length = int(sr * 0.025)
+        hop_length = int(sr * 0.010)
+        rms_frames = librosa.feature.rms(
+            y=audio_mono,
+            frame_length=frame_length,
+            hop_length=hop_length
+        )[0]
+        noise_floor = np.percentile(rms_frames, 10)
+
+    return {
+        "path": file_path,
+        "sample_rate": sr,
+        "duration_seconds": float(duration),
+        "is_stereo": is_stereo,
+        "max_amplitude": float(max_amplitude),
+        "rms_level": float(rms),
+        "estimated_noise_floor": float(noise_floor),
+        # bool(...) turns the numpy comparison result into a plain Python bool.
+        "is_normalized": bool(max_amplitude > 0.8),
+        "is_too_short": bool(duration < 3),
+        "is_too_long": bool(duration > 30),
+        "needs_resampling": bool(sr != target_sample_rate),
+    }
+
+
+def print_audio_analysis(file_path: str) -> None:
+    """Run ``analyze_audio`` on *file_path* and print a formatted report.
+
+    Each property is followed by a check mark when it looks fine, or by a
+    warning hint when it does not.
+    """
+    report = analyze_audio(file_path)
+    rule = "=" * 50
+
+    # Work out every annotation first so the printing below stays linear.
+    if report["needs_resampling"]:
+        rate_note = "⚠️  (should be 22050)"
+    else:
+        rate_note = "✓"
+
+    if report["is_too_short"]:
+        duration_note = " ⚠️  (too short, aim for 5-15s)"
+    elif report["is_too_long"]:
+        duration_note = " ⚠️  (quite long, 5-15s is ideal)"
+    else:
+        duration_note = " ✓"
+
+    stereo = report["is_stereo"]
+    channel_name = "Stereo" if stereo else "Mono"
+    channel_note = "⚠️  (will convert to mono)" if stereo else "✓"
+
+    volume_note = "✓" if report["is_normalized"] else "⚠️  (low volume)"
+
+    print(f"\n{rule}")
+    print(f"Audio Analysis: {report['path']}")
+    print(rule)
+    print(f"  Sample Rate:    {report['sample_rate']} Hz {rate_note}")
+    print(f"  Duration:       {report['duration_seconds']:.2f}s{duration_note}")
+    print(f"  Channels:       {channel_name} {channel_note}")
+    print(f"  Max Amplitude:  {report['max_amplitude']:.3f} {volume_note}")
+    print(f"  RMS Level:      {report['rms_level']:.4f}")
+    print(f"  Noise Floor:    {report['estimated_noise_floor']:.4f}")
+    print(f"{rule}\n")