voice-cloning-backend / backend /app /voice_cloning.py
AJ50's picture
Fix noisy audio: Use WaveRNN vocoder instead of Griffin-Lim + add normalization
c222fbc
raw
history blame
4.41 kB
"""Core voice cloning logic shared by the API routes."""
from __future__ import annotations
import shutil
import gc
import torch
from pathlib import Path
from typing import Dict, Tuple
import numpy as np
import soundfile as sf
from huggingface_hub import hf_hub_download
from encoder import inference as encoder_infer
from synthesizer import inference as synthesizer_infer
from synthesizer.hparams import hparams as syn_hp
from app.vocoder import inference as vocoder_infer
# Maps each local weight filename to the (Hugging Face repo id, filename inside
# that repo) it is fetched from. Every component follows the same naming
# scheme, so the table is derived from the component names; insertion order is
# encoder, synthesizer, vocoder.
MODEL_SPECS: Dict[str, Tuple[str, str]] = {
    f"{component}.pt": (f"AJ50/voice-clone-{component}", f"{component}.pt")
    for component in ("encoder", "synthesizer", "vocoder")
}
def ensure_default_models(models_dir: Path) -> None:
    """Download the required pretrained weights if they are missing.

    Each entry of ``MODEL_SPECS`` is fetched with ``hf_hub_download`` and
    copied to ``models_dir / "default" / <filename>``. Files that already
    exist at the destination are left untouched.

    Args:
        models_dir: Root directory for model weights; the ``default``
            sub-directory is created if necessary.
    """
    target_dir = models_dir / "default"
    target_dir.mkdir(parents=True, exist_ok=True)

    for filename, (repo_id, repo_filename) in MODEL_SPECS.items():
        destination = target_dir / filename
        if destination.exists():
            continue

        print(f"[Models] Downloading {filename} from {repo_id}...")
        downloaded_path = Path(
            hf_hub_download(repo_id=repo_id, filename=repo_filename)
        )

        # Copy to a sibling temp file and rename into place. The existence
        # check above is the only validation, so a copy interrupted halfway
        # must never leave a truncated file under the final name.
        partial = destination.with_name(destination.name + ".part")
        try:
            shutil.copy2(downloaded_path, partial)
            partial.replace(destination)
        finally:
            # Only still present when the copy or the rename failed.
            if partial.exists():
                partial.unlink()

        print(f"[Models] Saved to {destination}")
def _normalize_peak(waveform: np.ndarray, target_level: float = 0.707) -> np.ndarray:
    """Scale *waveform* so its peak magnitude equals *target_level*, then clip to [-1, 1]."""
    peak = np.max(np.abs(waveform))
    if peak > 0:
        # 0.707 of full scale is roughly -3 dB, leaving headroom below clipping.
        waveform = waveform * (target_level / peak)
    # Guard against out-of-range samples when no scaling was applied.
    return np.clip(waveform, -1.0, 1.0)


def _release_models() -> None:
    """Drop module-level cached models and reclaim memory; never raises."""
    print("[VoiceCloning] Cleaning up models to free memory...")
    try:
        # Clear model caches on whichever inference modules expose one.
        for module in (encoder_infer, synthesizer_infer, vocoder_infer):
            if hasattr(module, "_model"):
                module._model = None

        # Force garbage collection
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    except Exception as e:
        # Cleanup is best effort; a failure here must not mask the real result.
        print(f"[VoiceCloning] Warning during cleanup: {e}")


def _run_pipeline(voice_path: Path, text: str, models_dir: Path, out_path: Path) -> Path:
    """Load the models, clone the voice for *text*, and write the audio to *out_path*."""
    ensure_default_models(models_dir)

    enc_path = models_dir / "default" / "encoder.pt"
    syn_path = models_dir / "default" / "synthesizer.pt"
    voc_path = models_dir / "default" / "vocoder.pt"

    for model_path in (enc_path, syn_path, voc_path):
        if not model_path.exists():
            raise RuntimeError(f"Model file missing: {model_path}")

    print("[VoiceCloning] Loading encoder...")
    encoder_infer.load_model(enc_path)
    print("[VoiceCloning] Loading synthesizer...")
    synthesizer = synthesizer_infer.Synthesizer(syn_path)
    print("[VoiceCloning] Loading vocoder...")
    vocoder_infer.load_model(voc_path)

    print("[VoiceCloning] Preprocessing reference audio...")
    wav = encoder_infer.preprocess_wav(voice_path)
    embed = encoder_infer.embed_utterance(wav)

    print("[VoiceCloning] Generating mel-spectrogram...")
    mels = synthesizer.synthesize_spectrograms([text], [embed])
    mel = mels[0]

    print("[VoiceCloning] Vocoding waveform with WaveRNN...")
    try:
        # Use the high-quality WaveRNN vocoder (much better than Griffin-Lim)
        waveform = vocoder_infer.infer_waveform(
            mel, normalize=True, batched=False, target=8000, overlap=800
        ).astype(np.float32)
    except Exception as e:
        # Deliberate best effort: lower-quality audio beats no audio at all.
        print(f"[VoiceCloning] Vocoder failed: {e}, falling back to Griffin-Lim...")
        waveform = synthesizer.griffin_lim(mel).astype(np.float32)

    # Normalize waveform to prevent clipping and ensure good volume
    waveform = _normalize_peak(waveform)
    print(f"[VoiceCloning] Waveform normalized - Max: {np.max(np.abs(waveform)):.4f}")

    out_path.parent.mkdir(parents=True, exist_ok=True)
    sf.write(out_path.as_posix(), waveform, syn_hp.sample_rate)
    print(f"[VoiceCloning] Audio saved to {out_path}")
    return out_path


def synthesize(voice_path: Path, text: str, models_dir: Path, out_path: Path) -> Path:
    """Run end-to-end voice cloning and return the generated audio path.

    Args:
        voice_path: Reference recording of the voice to clone.
        text: Text to speak in the cloned voice.
        models_dir: Root directory holding (or receiving) the model weights
            under its ``default`` sub-directory.
        out_path: Destination audio file; parent directories are created.

    Returns:
        ``out_path``, once the audio has been written.

    Raises:
        RuntimeError: If the reference recording or a model file is missing.
    """
    # Fail fast: reject a bad reference path before paying for model
    # downloads and loading.
    if not voice_path.exists():
        raise RuntimeError(f"Reference voice file not found: {voice_path}")

    try:
        # The pipeline runs in its own frame so that, on success, its locals
        # (synthesizer, mel, waveform) are already unreferenced when the
        # cleanup below triggers garbage collection.
        return _run_pipeline(voice_path, text, models_dir, out_path)
    finally:
        # Memory optimization for Render free tier: release the models even
        # when synthesis fails, not only on the success path.
        _release_models()