| """Multilingual song processing - English and Hindi support.""" | |
| import gc | |
| import torch | |
| import numpy as np | |
| from pathlib import Path | |
| from typing import Optional | |
| import sys | |
| from app.song_conversion.vocal_separator import VocalSeparator | |
| from app.song_conversion.audio_mixer import AudioMixer | |
| from app.multilingual_tts import MultilingualTTSService, Language | |
| class MultilingualSongProcessor: | |
| """ | |
| Orchestrates song voice conversion for multiple languages. | |
| - English songs: Uses WaveRNN voice cloning | |
| - Hindi songs: Uses XTTS Hindi model | |
| """ | |
| def __init__(self, models_dir: Path, hindi_model_dir: Optional[Path] = None): | |
| """ | |
| Initialize multilingual song processor. | |
| Args: | |
| models_dir: Directory with English models | |
| hindi_model_dir: Directory with Hindi XTTS model | |
| """ | |
| self.models_dir = Path(models_dir) | |
| self.hindi_model_dir = Path(hindi_model_dir) if hindi_model_dir else None | |
| self.separator = None | |
| self.tts_service = None | |
| self.sr = 16000 | |
| def _ensure_separator(self) -> VocalSeparator: | |
| """Lazy load vocal separator.""" | |
| if self.separator is None: | |
| print("[MultilingualSongProcessor] Initializing vocal separator...") | |
| self.separator = VocalSeparator(model_name="htdemucs") | |
| return self.separator | |
| def _ensure_tts_service(self) -> MultilingualTTSService: | |
| """Lazy load TTS service.""" | |
| if self.tts_service is None: | |
| print("[MultilingualSongProcessor] Initializing multilingual TTS service...") | |
| self.tts_service = MultilingualTTSService( | |
| models_dir=self.models_dir, | |
| hindi_model_dir=self.hindi_model_dir | |
| ) | |
| return self.tts_service | |
| def _extract_lyrics_from_audio(self, audio_path: Path) -> str: | |
| """ | |
| Extract lyrics from audio (placeholder). | |
| In production, would use Whisper with language detection. | |
| Args: | |
| audio_path: Path to audio file | |
| Returns: | |
| Extracted or placeholder lyrics | |
| """ | |
| print("[MultilingualSongProcessor] Extracting lyrics from audio...") | |
| # Placeholder: return generic phonetically rich text | |
| # In production, use: whisper_model.transcribe(str(audio_path), language='en'/'hi') | |
| lyrics = "The music is playing so well with this song today" | |
| print(f"[MultilingualSongProcessor] Using default lyrics: {lyrics}") | |
| return lyrics | |
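
    # A minimal sketch of what the production path could look like, assuming
    # the openai-whisper package (not a dependency of this module); the "base"
    # model size and the language-code mapping are illustrative assumptions:
    #
    #     import whisper
    #     model = whisper.load_model("base")
    #     result = model.transcribe(
    #         str(audio_path),
    #         language="en" if language == "english" else "hi",
    #     )
    #     lyrics = result["text"].strip()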
    def convert_song(self, song_path: Path, voice_path: Path, output_path: Path,
                     language: str = 'english', add_effects: bool = True) -> Path:
        """
        Convert a song to the user's voice (multilingual support).

        Pipeline:
            1. Separate vocals from the instrumental (Demucs)
            2. Extract lyrics (placeholder or Whisper)
            3. Synthesize with the user's voice (language-aware)
            4. Mix synthesized vocals with the instrumental
            5. Add audio effects

        Args:
            song_path: Path to the input song
            voice_path: Path to the reference voice sample
            output_path: Path for the output song
            language: 'english' or 'hindi'
            add_effects: Whether to add reverb/compression

        Returns:
            Path to the output song
        """
        song_path = Path(song_path)
        voice_path = Path(voice_path)
        output_path = Path(output_path)
        language = language.lower()

        try:
            print("\n[MultilingualSongProcessor] ========== SONG CONVERSION START ==========")
            print(f"[MultilingualSongProcessor] Language: {language.upper()}")
            print(f"[MultilingualSongProcessor] Song: {song_path}")
            print(f"[MultilingualSongProcessor] Voice: {voice_path}")
            print(f"[MultilingualSongProcessor] Output: {output_path}")

            # Step 1: Separate vocals
            print("\n[MultilingualSongProcessor] STEP 1: Separating vocals...")
            separator = self._ensure_separator()
            vocals, instrumental = separator.separate(song_path, sr=self.sr)

            # Step 2: Extract/prepare lyrics
            print("\n[MultilingualSongProcessor] STEP 2: Preparing lyrics...")
            lyrics = self._extract_lyrics_from_audio(song_path)

            # Step 3: Synthesize vocals with the language-appropriate model
            print(f"\n[MultilingualSongProcessor] STEP 3: Synthesizing vocals with {language.upper()} model...")
            tts_service = self._ensure_tts_service()
            try:
                synthesized_vocal = tts_service.synthesize(lyrics, voice_path, language)
            except Exception as e:
                print(f"[MultilingualSongProcessor] Synthesis error: {e}")
                raise

            # Downmix to mono and resample if needed (XTTS outputs 24 kHz; mixing expects 16 kHz)
            if synthesized_vocal.ndim > 1:
                synthesized_vocal = np.mean(synthesized_vocal, axis=1)
            if language == Language.HINDI.value:
                # XTTS outputs 24 kHz; resample to 16 kHz for consistency
                from scipy import signal
                num_samples = int(len(synthesized_vocal) * (self.sr / 24000))
                synthesized_vocal = signal.resample(synthesized_vocal, num_samples)
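                # (Note/assumption: scipy.signal.resample_poly(synthesized_vocal,
                # self.sr, 24000) is a polyphase alternative that tends to behave
                # better than FFT resampling on long, non-periodic audio; left as
                # a comment here to keep behavior unchanged.)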
            synthesized_vocal = synthesized_vocal.astype(np.float32)
            print(f"[MultilingualSongProcessor] Synthesized vocal shape: {synthesized_vocal.shape}")

            # Steps 4-5: Mix with the instrumental and apply effects
            print("\n[MultilingualSongProcessor] STEPS 4-5: Mixing vocals with instrumental...")
            final_audio = AudioMixer.mix_and_save(
                synthesized_vocal, instrumental,
                output_path, sr=self.sr,
                add_effects=add_effects,
            )

            # Cleanup: release cached GPU memory
            print("\n[MultilingualSongProcessor] Cleaning up...")
            try:
                gc.collect()
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
            except Exception as e:
                print(f"[MultilingualSongProcessor] Warning during cleanup: {e}")

            print("\n[MultilingualSongProcessor] ========== SONG CONVERSION COMPLETE ==========")
            print(f"[MultilingualSongProcessor] Output saved to: {final_audio}")
            return final_audio

        except Exception as e:
            print(f"\n[MultilingualSongProcessor] ✗ ERROR: {e}")
            import traceback
            traceback.print_exc()
            sys.stdout.flush()
            raise
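

# Hedged usage sketch: the directory and file paths below are placeholders for
# illustration, not paths shipped with this repo; it assumes the English models
# and the Hindi XTTS checkpoint have already been downloaded.
if __name__ == "__main__":
    processor = MultilingualSongProcessor(
        models_dir=Path("models/english"),          # English WaveRNN models (placeholder path)
        hindi_model_dir=Path("models/xtts_hindi"),  # Hindi XTTS checkpoint (placeholder path)
    )
    processor.convert_song(
        song_path=Path("uploads/song.wav"),
        voice_path=Path("uploads/reference_voice.wav"),
        output_path=Path("outputs/converted_song.wav"),
        language="hindi",
        add_effects=True,
    )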