# voice-cloning-backend/backend/app/multilingual_song_processor.py
"""Multilingual song processing - English and Hindi support."""
import gc
import torch
import numpy as np
from pathlib import Path
from typing import Optional
import sys
from app.song_conversion.vocal_separator import VocalSeparator
from app.song_conversion.audio_mixer import AudioMixer
from app.multilingual_tts import MultilingualTTSService, Language


class MultilingualSongProcessor:
    """
    Orchestrates song voice conversion for multiple languages.

    - English songs: uses WaveRNN voice cloning
    - Hindi songs: uses the XTTS Hindi model
    """

    def __init__(self, models_dir: Path, hindi_model_dir: Optional[Path] = None):
        """
        Initialize the multilingual song processor.

        Args:
            models_dir: Directory with English models
            hindi_model_dir: Directory with the Hindi XTTS model
        """
        self.models_dir = Path(models_dir)
        self.hindi_model_dir = Path(hindi_model_dir) if hindi_model_dir else None
        self.separator = None
        self.tts_service = None
        self.sr = 16000  # working sample rate for separation and mixing

    def _ensure_separator(self) -> VocalSeparator:
        """Lazy load vocal separator."""
        if self.separator is None:
            print("[MultilingualSongProcessor] Initializing vocal separator...")
            self.separator = VocalSeparator(model_name="htdemucs")
        return self.separator
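
    # Note: "htdemucs" above refers to the Hybrid Transformer Demucs (Demucs v4)
    # checkpoint. Assuming VocalSeparator wraps the demucs package, a fine-tuned
    # variant such as "htdemucs_ft" could be substituted at the cost of slower
    # separation.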

    def _ensure_tts_service(self) -> MultilingualTTSService:
        """Lazy load TTS service."""
        if self.tts_service is None:
            print("[MultilingualSongProcessor] Initializing multilingual TTS service...")
            self.tts_service = MultilingualTTSService(
                models_dir=self.models_dir,
                hindi_model_dir=self.hindi_model_dir
            )
        return self.tts_service

    def _extract_lyrics_from_audio(self, audio_path: Path) -> str:
        """
        Extract lyrics from audio (placeholder).

        In production, this would use Whisper with language detection.

        Args:
            audio_path: Path to audio file

        Returns:
            Extracted or placeholder lyrics
        """
        print("[MultilingualSongProcessor] Extracting lyrics from audio...")
        # Placeholder: return generic, phonetically rich text.
        # In production, use: whisper_model.transcribe(str(audio_path), language='en'/'hi')
        lyrics = "The music is playing so well with this song today"
        print(f"[MultilingualSongProcessor] Using default lyrics: {lyrics}")
        return lyrics
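
    # A minimal sketch of the Whisper-backed replacement hinted at above. This
    # is an assumption, not code from this repo: it presumes the openai-whisper
    # package is installed, and the model size ("base") is illustrative.
    #
    #     import whisper
    #     model = whisper.load_model("base")
    #     # Whisper auto-detects the language when none is passed explicitly
    #     result = model.transcribe(str(audio_path))
    #     lyrics = result["text"].strip()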

    def convert_song(self, song_path: Path, voice_path: Path, output_path: Path,
                     language: str = 'english', add_effects: bool = True) -> Path:
        """
        Convert a song to the user's voice (multilingual support).

        Pipeline:
        1. Separate vocals from instrumental (Demucs)
        2. Extract lyrics (placeholder or Whisper)
        3. Synthesize with the user's voice (language-aware)
        4. Mix synthesized vocals with the instrumental
        5. Add audio effects

        Args:
            song_path: Path to input song
            voice_path: Path to reference voice sample
            output_path: Path for output song
            language: 'english' or 'hindi'
            add_effects: Whether to add reverb/compression

        Returns:
            Path to output song
        """
        song_path = Path(song_path)
        voice_path = Path(voice_path)
        output_path = Path(output_path)
        language = language.lower()

        try:
            print("\n[MultilingualSongProcessor] ========== SONG CONVERSION START ==========")
            print(f"[MultilingualSongProcessor] Language: {language.upper()}")
            print(f"[MultilingualSongProcessor] Song: {song_path}")
            print(f"[MultilingualSongProcessor] Voice: {voice_path}")
            print(f"[MultilingualSongProcessor] Output: {output_path}")

            # Step 1: Separate vocals (the original vocals are discarded;
            # only the instrumental track is reused downstream)
            print("\n[MultilingualSongProcessor] STEP 1: Separating vocals...")
            separator = self._ensure_separator()
            vocals, instrumental = separator.separate(song_path, sr=self.sr)

            # Step 2: Extract/prepare lyrics
            print("\n[MultilingualSongProcessor] STEP 2: Preparing lyrics...")
            lyrics = self._extract_lyrics_from_audio(song_path)

            # Step 3: Synthesize vocals with the language-appropriate model
            print(f"\n[MultilingualSongProcessor] STEP 3: Synthesizing vocals with {language.upper()} model...")
            tts_service = self._ensure_tts_service()
            try:
                synthesized_vocal = tts_service.synthesize(lyrics, voice_path, language)
            except Exception as e:
                print(f"[MultilingualSongProcessor] Synthesis error: {e}")
                raise

            # Downmix to mono if needed, then resample: XTTS outputs 24 kHz,
            # while mixing expects 16 kHz
            if synthesized_vocal.ndim > 1:
                synthesized_vocal = np.mean(synthesized_vocal, axis=1)
            if language == Language.HINDI.value:
                from scipy import signal
                num_samples = int(len(synthesized_vocal) * (self.sr / 24000))
                synthesized_vocal = signal.resample(synthesized_vocal, num_samples)
            synthesized_vocal = synthesized_vocal.astype(np.float32)
            print(f"[MultilingualSongProcessor] Synthesized vocal shape: {synthesized_vocal.shape}")

            # Steps 4-5: Mix with the instrumental and optionally apply effects
            print("\n[MultilingualSongProcessor] STEP 4-5: Mixing vocals with instrumental...")
            final_audio = AudioMixer.mix_and_save(
                synthesized_vocal, instrumental,
                output_path, sr=self.sr,
                add_effects=add_effects
            )

            # Cleanup
            print("\n[MultilingualSongProcessor] Cleaning up models...")
            try:
                gc.collect()
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
            except Exception as e:
                print(f"[MultilingualSongProcessor] Warning during cleanup: {e}")

            print("\n[MultilingualSongProcessor] ========== SONG CONVERSION COMPLETE ==========")
            print(f"[MultilingualSongProcessor] Output saved to: {final_audio}")
            return final_audio
        except Exception as e:
            print(f"\n[MultilingualSongProcessor] ✗ ERROR: {e}")
            import traceback
            traceback.print_exc()
            sys.stdout.flush()
            raise
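

# Example usage (a sketch; the paths below are illustrative assumptions, not
# files shipped with this repo):
#
#     from pathlib import Path
#     from app.multilingual_song_processor import MultilingualSongProcessor
#
#     processor = MultilingualSongProcessor(
#         models_dir=Path("models/english"),
#         hindi_model_dir=Path("models/xtts_hindi"),
#     )
#     processor.convert_song(
#         song_path=Path("uploads/song.mp3"),
#         voice_path=Path("uploads/voice_sample.wav"),
#         output_path=Path("outputs/converted_song.wav"),
#         language="hindi",
#     )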