"""Core voice cloning logic shared by the API routes.""" from __future__ import annotations import shutil import gc import torch from pathlib import Path from typing import Dict, Tuple import numpy as np import soundfile as sf from huggingface_hub import hf_hub_download from encoder import inference as encoder_infer from synthesizer import inference as synthesizer_infer from synthesizer.hparams import hparams as syn_hp from app.vocoder import inference as vocoder_infer MODEL_SPECS: Dict[str, Tuple[str, str]] = { "encoder.pt": ("AJ50/voice-clone-encoder", "encoder.pt"), "synthesizer.pt": ("AJ50/voice-clone-synthesizer", "synthesizer.pt"), "vocoder.pt": ("AJ50/voice-clone-vocoder", "vocoder.pt"), } def ensure_default_models(models_dir: Path) -> None: """Download the required pretrained weights if they are missing.""" target_dir = models_dir / "default" target_dir.mkdir(parents=True, exist_ok=True) for filename, (repo_id, repo_filename) in MODEL_SPECS.items(): destination = target_dir / filename if destination.exists(): continue print(f"[Models] Downloading {filename} from {repo_id}...") downloaded_path = Path( hf_hub_download(repo_id=repo_id, filename=repo_filename) ) shutil.copy2(downloaded_path, destination) print(f"[Models] Saved to {destination}") def synthesize(voice_path: Path, text: str, models_dir: Path, out_path: Path) -> Path: """Run end-to-end voice cloning and return the generated audio path.""" ensure_default_models(models_dir) enc_path = models_dir / "default" / "encoder.pt" syn_path = models_dir / "default" / "synthesizer.pt" voc_path = models_dir / "default" / "vocoder.pt" for model_path in (enc_path, syn_path, voc_path): if not model_path.exists(): raise RuntimeError(f"Model file missing: {model_path}") print("[VoiceCloning] Loading encoder...") encoder_infer.load_model(enc_path) print("[VoiceCloning] Loading synthesizer...") synthesizer = synthesizer_infer.Synthesizer(syn_path) print("[VoiceCloning] Loading vocoder...") vocoder_infer.load_model(voc_path) if not voice_path.exists(): raise RuntimeError(f"Reference voice file not found: {voice_path}") print("[VoiceCloning] Preprocessing reference audio...") wav = encoder_infer.preprocess_wav(voice_path) embed = encoder_infer.embed_utterance(wav) print("[VoiceCloning] Generating mel-spectrogram...") mels = synthesizer.synthesize_spectrograms([text], [embed]) mel = mels[0] print("[VoiceCloning] Vocoding waveform with WaveRNN...") try: # Use the high-quality WaveRNN vocoder (much better than Griffin-Lim) waveform = vocoder_infer.infer_waveform( mel, normalize=True, batched=False, target=8000, overlap=800 ).astype(np.float32) except Exception as e: print(f"[VoiceCloning] Vocoder failed: {e}, falling back to Griffin-Lim...") waveform = synthesizer.griffin_lim(mel).astype(np.float32) # Normalize waveform to prevent clipping and ensure good volume max_abs_value = np.max(np.abs(waveform)) if max_abs_value > 0: # Target peak level at -3dB (0.707 * max_int16) target_level = 0.707 waveform = waveform * (target_level / max_abs_value) # Ensure waveform is in valid range for 16-bit audio waveform = np.clip(waveform, -1.0, 1.0) print(f"[VoiceCloning] Waveform normalized - Max: {np.max(np.abs(waveform)):.4f}") out_path.parent.mkdir(parents=True, exist_ok=True) sf.write(out_path.as_posix(), waveform, syn_hp.sample_rate) print(f"[VoiceCloning] Audio saved to {out_path}") # Memory optimization for Render free tier print("[VoiceCloning] Cleaning up models to free memory...") try: # Clear model caches if hasattr(encoder_infer, '_model'): encoder_infer._model = None if hasattr(synthesizer_infer, '_model'): synthesizer_infer._model = None if hasattr(vocoder_infer, '_model'): vocoder_infer._model = None # Force garbage collection gc.collect() if torch.cuda.is_available(): torch.cuda.empty_cache() except Exception as e: print(f"[VoiceCloning] Warning during cleanup: {e}") return out_path