# Vox/audio_preprocessor.py

"""Audio preprocessing utilities for improving voice cloning quality."""

import os
import tempfile
from dataclasses import dataclass

import librosa
import noisereduce as nr
import numpy as np
import soundfile as sf


@dataclass
class PreprocessingConfig:
    """Configuration for audio preprocessing.

    The fields are read by ``AudioPreprocessor``; the boolean flags switch
    individual pipeline steps on or off.
    """

    # Sample rate (Hz) passed to ``librosa.load`` when the input is read.
    target_sample_rate: int = 22050
    # When True, scale the signal so its peak absolute value is 0.95.
    normalize: bool = True
    # When True, trim leading/trailing silence with ``librosa.effects.trim``.
    trim_silence: bool = True
    # ``top_db`` threshold handed to ``librosa.effects.trim``.
    trim_top_db: int = 20
    # When True, run ``noisereduce.reduce_noise`` on the signal.
    reduce_noise: bool = True
    # Maximum output length in seconds; longer audio is cut from the end.
    target_length_seconds: float | None = None  # None means keep original length


class AudioPreprocessor:
    """Preprocesses audio files for optimal voice cloning.

    Pipeline order: load (mono, at the configured sample rate) -> normalize
    -> trim silence -> noise reduction -> length limit -> write 16-bit PCM
    WAV. Each optional step is controlled by the ``PreprocessingConfig``
    stored on ``self.config``. Progress is reported with ``print``.
    """

    def __init__(self, config: PreprocessingConfig | None = None):
        """
        Create a preprocessor.

        Args:
            config: Settings to use. If None, a default ``PreprocessingConfig``
                is created.
        """
        self.config = config if config is not None else PreprocessingConfig()

    def preprocess_file(self, input_path: str, output_path: str | None = None) -> str:
        """
        Preprocess an audio file for voice cloning.

        Args:
            input_path: Path to the input audio file
            output_path: Optional path for the output file. If None, creates a
                temp ``.wav`` file that the caller is responsible for deleting.

        Returns:
            Path to the preprocessed audio file

        Raises:
            Whatever ``librosa.load`` or ``soundfile.write`` raise. If writing
            fails and the output was a temp file created here, that temp file
            is removed before the exception propagates.
        """
        print(f"Preprocessing audio: {input_path}")

        # Load audio with librosa (automatically converts to mono and resamples)
        audio, sr = librosa.load(
            input_path,
            sr=self.config.target_sample_rate,
            mono=True,
        )
        print(f"  Loaded audio: {len(audio) / sr:.2f}s at {sr}Hz")

        # Apply preprocessing steps; each one is a no-op when disabled in config
        audio = self._normalize(audio)
        audio = self._trim_silence(audio, sr)
        audio = self._reduce_noise(audio, sr)
        audio = self._limit_length(audio, sr)

        # Ensure we have valid audio
        if len(audio) < sr * 0.5:  # Less than 0.5 seconds
            print("  Warning: Audio is very short after preprocessing!")

        # Save to output path
        made_temp_file = output_path is None
        if made_temp_file:
            fd, output_path = tempfile.mkstemp(suffix=".wav")
            # Only the path is needed; soundfile opens the file itself.
            os.close(fd)

        try:
            sf.write(output_path, audio, sr, subtype="PCM_16")
        except Exception:
            # Do not leave an orphaned temp file behind when the write fails.
            if made_temp_file:
                try:
                    os.unlink(output_path)
                except OSError:
                    pass
            raise
        print(f"  Saved preprocessed audio: {output_path} ({len(audio) / sr:.2f}s)")

        return output_path

    def preprocess_to_array(self, input_path: str) -> tuple[np.ndarray, int]:
        """
        Preprocess an audio file and return as numpy array.

        The audio goes through ``preprocess_file`` and is read back from the
        temp file it wrote, so the returned samples match what would be saved
        to disk. The temp file is always removed, even if reading it fails.

        Args:
            input_path: Path to the input audio file

        Returns:
            Tuple of (audio array, sample rate)
        """
        # Use temporary file approach for consistency
        temp_path = self.preprocess_file(input_path)
        try:
            audio, sr = librosa.load(temp_path, sr=None, mono=True)
        finally:
            os.unlink(temp_path)
        return audio, sr

    def _normalize(self, audio: np.ndarray) -> np.ndarray:
        """
        Normalize audio to a consistent volume level.

        Scales the signal so its peak absolute value becomes 0.95. Empty or
        all-zero input is returned unchanged.
        """
        # np.max raises on a zero-size array, so bail out before reducing.
        if not self.config.normalize or np.size(audio) == 0:
            return audio

        peak = np.max(np.abs(audio))
        if peak > 0:
            # Normalize to 95% of max to avoid clipping
            audio = audio / peak * 0.95
            print("  Applied volume normalization")
        return audio

    def _trim_silence(self, audio: np.ndarray, sr: int) -> np.ndarray:
        """
        Trim silence from the beginning and end of audio.

        Uses ``librosa.effects.trim`` with ``top_db`` taken from the config.
        Empty input is returned unchanged without calling librosa.
        """
        if not self.config.trim_silence or len(audio) == 0:
            return audio

        trimmed, _ = librosa.effects.trim(
            audio,
            top_db=self.config.trim_top_db,
        )
        # Difference is in samples; divide by sr to report seconds.
        removed_samples = len(audio) - len(trimmed)
        if removed_samples > 0:
            print(f"  Trimmed {removed_samples / sr:.2f}s of silence")
        return trimmed

    def _reduce_noise(self, audio: np.ndarray, sr: int) -> np.ndarray:
        """
        Apply noise reduction to the audio.

        Calls ``noisereduce.reduce_noise`` with ``stationary=True`` and
        ``prop_decrease=0.75``. This step is best-effort: if it raises, a
        warning is printed and the input audio is returned unchanged.
        """
        if not self.config.reduce_noise:
            return audio

        try:
            reduced = nr.reduce_noise(
                y=audio,
                sr=sr,
                stationary=True,
                prop_decrease=0.75,
            )
        except Exception as e:
            # Deliberately broad: a failed denoise must not abort the pipeline.
            print(f"  Warning: Noise reduction failed: {e}")
            return audio

        print("  Applied noise reduction")
        return reduced

    def _limit_length(self, audio: np.ndarray, sr: int) -> np.ndarray:
        """
        Limit audio to target length if specified.

        Keeps at most ``target_length_seconds * sr`` samples from the start.
        A negative target is treated as zero; without that clamp the negative
        slice bound would drop samples from the end instead of limiting length.
        """
        if self.config.target_length_seconds is None:
            return audio

        max_samples = max(0, int(self.config.target_length_seconds * sr))
        if len(audio) > max_samples:
            audio = audio[:max_samples]
            print(f"  Trimmed to {self.config.target_length_seconds}s")
        return audio


def analyze_audio(file_path: str) -> dict:
    """
    Analyze an audio file and return its properties.
    Useful for debugging voice cloning issues.

    The file is loaded at its native sample rate without downmixing, so the
    channel layout can be reported; level statistics are computed on a mono
    mixdown. A file with no samples yields zero levels instead of raising.

    Args:
        file_path: Path to the audio file to inspect.

    Returns:
        Dict with the keys ``path``, ``sample_rate``, ``duration_seconds``,
        ``is_stereo``, ``max_amplitude``, ``rms_level``,
        ``estimated_noise_floor``, ``is_normalized``, ``is_too_short``,
        ``is_too_long`` and ``needs_resampling``. Level values are plain
        ``float`` and flags are plain ``bool``.
    """
    audio, sr = librosa.load(file_path, sr=None, mono=False)

    # With mono=False a multi-channel file comes back with more than one axis.
    is_stereo = bool(audio.ndim > 1)
    audio_mono = librosa.to_mono(audio) if is_stereo else audio

    n_samples = len(audio_mono)
    duration = float(n_samples / sr)

    if n_samples > 0:
        max_amplitude = float(np.max(np.abs(audio_mono)))
        rms = float(np.sqrt(np.mean(audio_mono**2)))

        # Estimate noise level from quietest parts: 25 ms frames, 10 ms hop.
        # Clamp to 1 so a very low sample rate cannot yield a zero-length frame.
        frame_length = max(1, int(sr * 0.025))
        hop_length = max(1, int(sr * 0.010))
        rms_frames = librosa.feature.rms(
            y=audio_mono,
            frame_length=frame_length,
            hop_length=hop_length,
        )[0]
        noise_floor = float(np.percentile(rms_frames, 10))
    else:
        # np.max / np.percentile raise on empty input; report silence instead.
        max_amplitude = 0.0
        rms = 0.0
        noise_floor = 0.0

    return {
        "path": file_path,
        "sample_rate": sr,
        "duration_seconds": duration,
        "is_stereo": is_stereo,
        "max_amplitude": max_amplitude,
        "rms_level": rms,
        "estimated_noise_floor": noise_floor,
        # max_amplitude is a Python float here, so these are plain bools.
        "is_normalized": max_amplitude > 0.8,
        "is_too_short": duration < 3,
        "is_too_long": duration > 30,
        "needs_resampling": bool(sr != 22050),
    }


def print_audio_analysis(file_path: str) -> None:
    """Run ``analyze_audio`` on *file_path* and print the result as a report.

    Each line shows one property followed by an ``[OK]`` or ``[WARN]`` marker
    derived from the flags in the analysis dict.
    """
    report = analyze_audio(file_path)
    divider = "=" * 50

    print(f"\n{divider}")
    print(f"Audio Analysis: {report['path']}")
    print(f"{divider}")

    if report['needs_resampling']:
        rate_note = '[WARN] (should be 22050)'
    else:
        rate_note = '[OK]'
    print(f"  Sample Rate:    {report['sample_rate']} Hz {rate_note}")

    # The duration value and its verdict are written by two separate prints.
    print(f"  Duration:       {report['duration_seconds']:.2f}s", end="")
    if report['is_too_short']:
        duration_note = " [WARN] (too short, aim for 5-15s)"
    elif report['is_too_long']:
        duration_note = " [WARN] (quite long, 5-15s is ideal)"
    else:
        duration_note = " [OK]"
    print(duration_note)

    if report['is_stereo']:
        layout, layout_note = 'Stereo', '[WARN] (will convert to mono)'
    else:
        layout, layout_note = 'Mono', '[OK]'
    print(f"  Channels:       {layout} {layout_note}")

    if report['is_normalized']:
        level_note = '[OK]'
    else:
        level_note = '[WARN] (low volume)'
    print(f"  Max Amplitude:  {report['max_amplitude']:.3f} {level_note}")

    print(f"  RMS Level:      {report['rms_level']:.4f}")
    print(f"  Noise Floor:    {report['estimated_noise_floor']:.4f}")
    print(f"{divider}\n")