# voice-cloning-backend/backend/app/multilingual_song_processor.py
"""Multilingual song processing - English and Hindi support."""
import gc
import torch
import numpy as np
from pathlib import Path
from typing import Optional
import sys
from app.song_conversion.vocal_separator import VocalSeparator
from app.song_conversion.audio_mixer import AudioMixer
from app.multilingual_tts import MultilingualTTSService, Language


class MultilingualSongProcessor:
    """
    Orchestrates song voice conversion for multiple languages.

    - English songs: uses WaveRNN voice cloning
    - Hindi songs: uses the XTTS Hindi model
    """

    def __init__(self, models_dir: Path, hindi_model_dir: Optional[Path] = None):
        """
        Initialize the multilingual song processor.

        Args:
            models_dir: Directory with English models
            hindi_model_dir: Directory with the Hindi XTTS model
        """
        self.models_dir = Path(models_dir)
        self.hindi_model_dir = Path(hindi_model_dir) if hindi_model_dir else None
        self.separator = None
        self.tts_service = None
        self.sr = 16000  # working sample rate for separation and mixing

    def _ensure_separator(self) -> VocalSeparator:
        """Lazy load vocal separator."""
        if self.separator is None:
            print("[MultilingualSongProcessor] Initializing vocal separator...")
            self.separator = VocalSeparator(model_name="htdemucs")
        return self.separator
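
    # Note: "htdemucs" above refers to the Hybrid Transformer Demucs (Demucs v4)
    # checkpoint. Assuming VocalSeparator wraps the demucs package, a fine-tuned
    # variant such as "htdemucs_ft" could be substituted at the cost of slower
    # separation.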

    def _ensure_tts_service(self) -> MultilingualTTSService:
        """Lazy load TTS service."""
        if self.tts_service is None:
            print("[MultilingualSongProcessor] Initializing multilingual TTS service...")
            self.tts_service = MultilingualTTSService(
                models_dir=self.models_dir,
                hindi_model_dir=self.hindi_model_dir
            )
        return self.tts_service

    def _extract_lyrics_from_audio(self, audio_path: Path) -> str:
        """
        Extract lyrics from audio (placeholder).

        In production, this would use Whisper with language detection.

        Args:
            audio_path: Path to audio file

        Returns:
            Extracted or placeholder lyrics
        """
        print("[MultilingualSongProcessor] Extracting lyrics from audio...")
        # Placeholder: return generic, phonetically rich text.
        # In production, use: whisper_model.transcribe(str(audio_path), language='en'/'hi')
        lyrics = "The music is playing so well with this song today"
        print(f"[MultilingualSongProcessor] Using default lyrics: {lyrics}")
        return lyrics
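
    # A minimal sketch of the Whisper-backed replacement hinted at above. This
    # is an assumption, not code from this repo: it presumes the openai-whisper
    # package is installed, and the model size ("base") is illustrative.
    #
    #     import whisper
    #     model = whisper.load_model("base")
    #     # Whisper auto-detects the language when none is passed explicitly
    #     result = model.transcribe(str(audio_path))
    #     lyrics = result["text"].strip()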

    def convert_song(self, song_path: Path, voice_path: Path, output_path: Path,
                     language: str = 'english', add_effects: bool = True) -> Path:
        """
        Convert a song to the user's voice (multilingual support).

        Pipeline:
        1. Separate vocals from instrumental (Demucs)
        2. Extract lyrics (placeholder or Whisper)
        3. Synthesize with the user's voice (language-aware)
        4. Mix synthesized vocals with the instrumental
        5. Add audio effects

        Args:
            song_path: Path to input song
            voice_path: Path to reference voice sample
            output_path: Path for output song
            language: 'english' or 'hindi'
            add_effects: Whether to add reverb/compression

        Returns:
            Path to output song
        """
        song_path = Path(song_path)
        voice_path = Path(voice_path)
        output_path = Path(output_path)
        language = language.lower()

        try:
            print("\n[MultilingualSongProcessor] ========== SONG CONVERSION START ==========")
            print(f"[MultilingualSongProcessor] Language: {language.upper()}")
            print(f"[MultilingualSongProcessor] Song: {song_path}")
            print(f"[MultilingualSongProcessor] Voice: {voice_path}")
            print(f"[MultilingualSongProcessor] Output: {output_path}")

            # Step 1: Separate vocals (the original vocals are discarded;
            # only the instrumental track is reused downstream)
            print("\n[MultilingualSongProcessor] STEP 1: Separating vocals...")
            separator = self._ensure_separator()
            vocals, instrumental = separator.separate(song_path, sr=self.sr)

            # Step 2: Extract/prepare lyrics
            print("\n[MultilingualSongProcessor] STEP 2: Preparing lyrics...")
            lyrics = self._extract_lyrics_from_audio(song_path)

            # Step 3: Synthesize vocals with the language-appropriate model
            print(f"\n[MultilingualSongProcessor] STEP 3: Synthesizing vocals with {language.upper()} model...")
            tts_service = self._ensure_tts_service()
            try:
                synthesized_vocal = tts_service.synthesize(lyrics, voice_path, language)
            except Exception as e:
                print(f"[MultilingualSongProcessor] Synthesis error: {e}")
                raise

            # Downmix to mono if needed, then resample: XTTS outputs 24 kHz,
            # while mixing expects 16 kHz
            if synthesized_vocal.ndim > 1:
                synthesized_vocal = np.mean(synthesized_vocal, axis=1)
            if language == Language.HINDI.value:
                from scipy import signal
                num_samples = int(len(synthesized_vocal) * (self.sr / 24000))
                synthesized_vocal = signal.resample(synthesized_vocal, num_samples)
            synthesized_vocal = synthesized_vocal.astype(np.float32)
            print(f"[MultilingualSongProcessor] Synthesized vocal shape: {synthesized_vocal.shape}")

            # Steps 4-5: Mix with the instrumental and optionally apply effects
            print("\n[MultilingualSongProcessor] STEP 4-5: Mixing vocals with instrumental...")
            final_audio = AudioMixer.mix_and_save(
                synthesized_vocal, instrumental,
                output_path, sr=self.sr,
                add_effects=add_effects
            )

            # Cleanup
            print("\n[MultilingualSongProcessor] Cleaning up models...")
            try:
                gc.collect()
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
            except Exception as e:
                print(f"[MultilingualSongProcessor] Warning during cleanup: {e}")

            print("\n[MultilingualSongProcessor] ========== SONG CONVERSION COMPLETE ==========")
            print(f"[MultilingualSongProcessor] Output saved to: {final_audio}")
            return final_audio
        except Exception as e:
            print(f"\n[MultilingualSongProcessor] ✗ ERROR: {e}")
            import traceback
            traceback.print_exc()
            sys.stdout.flush()
            raise
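

# Example usage (a sketch; the paths below are illustrative assumptions, not
# files shipped with this repo):
#
#     from pathlib import Path
#     from app.multilingual_song_processor import MultilingualSongProcessor
#
#     processor = MultilingualSongProcessor(
#         models_dir=Path("models/english"),
#         hindi_model_dir=Path("models/xtts_hindi"),
#     )
#     processor.convert_song(
#         song_path=Path("uploads/song.mp3"),
#         voice_path=Path("uploads/voice_sample.wav"),
#         output_path=Path("outputs/converted_song.wav"),
#         language="hindi",
#     )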