Spaces:

AJ50
/

voice-cloning-backend

Sleeping

App Files Files Community

voice-cloning-backend / backend /app /voice_cloning.py

AJ50

Fix noisy audio: Use WaveRNN vocoder instead of Griffin-Lim + add normalization

c222fbc 16 days ago

raw

history blame

4.41 kB

	"""Core voice cloning logic shared by the API routes."""

	from __future__ import annotations

	import shutil
	import gc
	import torch
	from pathlib import Path
	from typing import Dict, Tuple

	import numpy as np
	import soundfile as sf
	from huggingface_hub import hf_hub_download

	from encoder import inference as encoder_infer
	from synthesizer import inference as synthesizer_infer
	from synthesizer.hparams import hparams as syn_hp
	from app.vocoder import inference as vocoder_infer


	# Pretrained weights needed by the pipeline.
	# Key: file name used under the local "default" models directory.
	# Value: (Hugging Face repo id, file name inside that repo).
	MODEL_SPECS: Dict[str, Tuple[str, str]] = {
	    "encoder.pt": (
	        "AJ50/voice-clone-encoder",
	        "encoder.pt",
	    ),
	    "synthesizer.pt": (
	        "AJ50/voice-clone-synthesizer",
	        "synthesizer.pt",
	    ),
	    "vocoder.pt": (
	        "AJ50/voice-clone-vocoder",
	        "vocoder.pt",
	    ),
	}


	def ensure_default_models(models_dir: Path) -> None:
	    """Download the required pretrained weights if they are missing.

	    For every entry in ``MODEL_SPECS`` the file is fetched with
	    ``hf_hub_download`` and copied to ``<models_dir>/default/<filename>``.
	    Files that already exist at that location are left untouched.

	    Args:
	        models_dir: Root directory for model weights. The ``default``
	            sub-directory is created when it does not exist yet.
	    """
	    target_dir = models_dir / "default"
	    target_dir.mkdir(parents=True, exist_ok=True)

	    for filename, (repo_id, repo_filename) in MODEL_SPECS.items():
	        destination = target_dir / filename
	        if destination.exists():
	            continue

	        print(f"[Models] Downloading {filename} from {repo_id}...")
	        downloaded_path = Path(
	            hf_hub_download(repo_id=repo_id, filename=repo_filename)
	        )

	        # Copy under a temporary name and rename afterwards: the existence
	        # check above is the only validation, so an interrupted copy must
	        # never leave a truncated file under the final name. A stale
	        # ".part" file is simply overwritten by the next attempt.
	        partial = destination.with_name(destination.name + ".part")
	        shutil.copy2(downloaded_path, partial)
	        partial.replace(destination)
	        print(f"[Models] Saved to {destination}")


	def _normalize_peak(waveform: np.ndarray, target_level: float = 0.707) -> np.ndarray:
	    """Scale *waveform* so its absolute peak equals *target_level*, then clip to [-1, 1].

	    A waveform whose peak is zero (pure silence) is not scaled, which avoids
	    a division by zero.
	    """
	    max_abs_value = np.max(np.abs(waveform))
	    if max_abs_value > 0:
	        # 0.707 of full scale is roughly -3 dB relative to full scale.
	        waveform = waveform * (target_level / max_abs_value)

	    # Defensive clamp so the samples stay inside the [-1, 1] range.
	    return np.clip(waveform, -1.0, 1.0)


	def _release_models() -> None:
	    """Best-effort reset of the module-level ``_model`` caches and memory reclaim."""
	    print("[VoiceCloning] Cleaning up models to free memory...")
	    try:
	        # Clear model caches on whichever inference modules expose one.
	        for module in (encoder_infer, synthesizer_infer, vocoder_infer):
	            if hasattr(module, "_model"):
	                module._model = None

	        # Force garbage collection
	        gc.collect()
	        if torch.cuda.is_available():
	            torch.cuda.empty_cache()
	    except Exception as e:
	        # Cleanup must never turn a finished synthesis into a failure.
	        print(f"[VoiceCloning] Warning during cleanup: {e}")


	def synthesize(voice_path: Path, text: str, models_dir: Path, out_path: Path) -> Path:
	    """Run end-to-end voice cloning and return the generated audio path.

	    Steps: make sure the default model weights exist, load encoder,
	    synthesizer and vocoder, embed the reference audio, generate a
	    mel-spectrogram for *text*, vocode it (falling back to Griffin-Lim when
	    the vocoder raises), peak-normalize the waveform and write it to
	    *out_path* at ``syn_hp.sample_rate``.

	    Args:
	        voice_path: Reference audio file of the voice to clone.
	        text: Text to be spoken.
	        models_dir: Root directory holding the ``default`` model weights.
	        out_path: Destination audio file; parent directories are created.

	    Returns:
	        *out_path*, after the audio has been written.

	    Raises:
	        RuntimeError: If the reference audio or a model file is missing, or
	            if vocoding yields an empty waveform.
	    """
	    # Check the cheap precondition first so a bad request does not trigger
	    # model downloads and loading.
	    if not voice_path.exists():
	        raise RuntimeError(f"Reference voice file not found: {voice_path}")

	    ensure_default_models(models_dir)

	    enc_path = models_dir / "default" / "encoder.pt"
	    syn_path = models_dir / "default" / "synthesizer.pt"
	    voc_path = models_dir / "default" / "vocoder.pt"

	    for model_path in (enc_path, syn_path, voc_path):
	        if not model_path.exists():
	            raise RuntimeError(f"Model file missing: {model_path}")

	    synthesizer = None
	    try:
	        print("[VoiceCloning] Loading encoder...")
	        encoder_infer.load_model(enc_path)
	        print("[VoiceCloning] Loading synthesizer...")
	        synthesizer = synthesizer_infer.Synthesizer(syn_path)
	        print("[VoiceCloning] Loading vocoder...")
	        vocoder_infer.load_model(voc_path)

	        print("[VoiceCloning] Preprocessing reference audio...")
	        wav = encoder_infer.preprocess_wav(voice_path)
	        embed = encoder_infer.embed_utterance(wav)

	        print("[VoiceCloning] Generating mel-spectrogram...")
	        mels = synthesizer.synthesize_spectrograms([text], [embed])
	        mel = mels[0]

	        print("[VoiceCloning] Vocoding waveform with WaveRNN...")
	        try:
	            # Preferred path: the WaveRNN vocoder.
	            waveform = vocoder_infer.infer_waveform(
	                mel, normalize=True, batched=False, target=8000, overlap=800
	            ).astype(np.float32)
	        except Exception as e:
	            # Deliberate best-effort: any vocoder failure degrades to
	            # Griffin-Lim instead of failing the whole request.
	            print(f"[VoiceCloning] Vocoder failed: {e}, falling back to Griffin-Lim...")
	            waveform = synthesizer.griffin_lim(mel).astype(np.float32)

	        # np.max raises an opaque ValueError on a zero-size array; report
	        # the real problem instead.
	        if waveform.size == 0:
	            raise RuntimeError("Vocoder produced an empty waveform")

	        # Normalize waveform to prevent clipping and ensure good volume
	        waveform = _normalize_peak(waveform)
	        print(f"[VoiceCloning] Waveform normalized - Max: {np.max(np.abs(waveform)):.4f}")

	        out_path.parent.mkdir(parents=True, exist_ok=True)
	        sf.write(out_path.as_posix(), waveform, syn_hp.sample_rate)
	        print(f"[VoiceCloning] Audio saved to {out_path}")

	        return out_path
	    finally:
	        # Memory optimization for Render free tier: runs on failure too.
	        # Drop the local reference first, otherwise gc.collect() cannot
	        # reclaim the synthesizer object.
	        synthesizer = None
	        _release_models()