Commit 03fe1d8 (parent: abd73a3), committed by AJ50

Add multilingual support: English (WaveRNN) + Hindi (XTTS) [sync with pragyan]

Backend Changes:
- multilingual_tts.py: Unified TTS service supporting English and Hindi
- multilingual_song_processor.py: Orchestrator for multilingual song conversion
- routes.py: Updated /api/synthesize and /api/convert_song to support language parameter
- requirements.txt: Added TTS>=0.21.0 for XTTS Hindi model support

Frontend Changes:
- SpeechSynthesis.tsx: Added language selector buttons (English/Hindi)
- SongGeneration.tsx: Already has language toggle (reuse existing)
- api.ts: Updated synthesize() to accept language parameter
- Index.tsx: Pass language state to SpeechSynthesis component

Synced with the pragyan branch's multilingual integration.
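
For reference, a minimal client-side sketch of the updated /api/synthesize contract (the host, port, and voice id below are placeholders, not part of this commit; the JSON fields match the route changes further down):

    # Sketch only: assumes the backend is reachable at localhost:5000.
    import requests

    resp = requests.post(
        "http://localhost:5000/api/synthesize",
        json={
            "text": "नमस्ते, आप कैसे हैं?",
            "voice_id": "voice_xxx",   # an enrolled voice id
            "language": "hindi",       # optional; defaults to "english"
        },
    )
    # On success the route returns:
    # {"success": true, "message": "...", "audio_url": "/api/audio/<file>.wav", "language": "hindi"}
    print(resp.json().get("audio_url"))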

backend/app/multilingual_song_processor.py ADDED
@@ -0,0 +1,167 @@
+"""Multilingual song processing - English and Hindi support."""
+
+import gc
+import torch
+import numpy as np
+from pathlib import Path
+from typing import Optional
+import sys
+
+from app.song_conversion.vocal_separator import VocalSeparator
+from app.song_conversion.audio_mixer import AudioMixer
+from app.multilingual_tts import MultilingualTTSService, Language
+
+
+class MultilingualSongProcessor:
+    """
+    Orchestrates song voice conversion for multiple languages.
+
+    - English songs: Uses WaveRNN voice cloning
+    - Hindi songs: Uses XTTS Hindi model
+    """
+
+    def __init__(self, models_dir: Path, hindi_model_dir: Optional[Path] = None):
+        """
+        Initialize multilingual song processor.
+
+        Args:
+            models_dir: Directory with English models
+            hindi_model_dir: Directory with Hindi XTTS model
+        """
+        self.models_dir = Path(models_dir)
+        self.hindi_model_dir = Path(hindi_model_dir) if hindi_model_dir else None
+        self.separator = None
+        self.tts_service = None
+        self.sr = 16000
+
+    def _ensure_separator(self) -> VocalSeparator:
+        """Lazy load vocal separator."""
+        if self.separator is None:
+            print("[MultilingualSongProcessor] Initializing vocal separator...")
+            self.separator = VocalSeparator(model_name="htdemucs")
+        return self.separator
+
+    def _ensure_tts_service(self) -> MultilingualTTSService:
+        """Lazy load TTS service."""
+        if self.tts_service is None:
+            print("[MultilingualSongProcessor] Initializing multilingual TTS service...")
+            self.tts_service = MultilingualTTSService(
+                models_dir=self.models_dir,
+                hindi_model_dir=self.hindi_model_dir
+            )
+        return self.tts_service
+
+    def _extract_lyrics_from_audio(self, audio_path: Path) -> str:
+        """
+        Extract lyrics from audio (placeholder).
+        In production, would use Whisper with language detection.
+
+        Args:
+            audio_path: Path to audio file
+
+        Returns:
+            Extracted or placeholder lyrics
+        """
+        print("[MultilingualSongProcessor] Extracting lyrics from audio...")
+
+        # Placeholder: return generic phonetically rich text
+        # In production, use: whisper_model.transcribe(str(audio_path), language='en'/'hi')
+        lyrics = "The music is playing so well with this song today"
+
+        print(f"[MultilingualSongProcessor] Using default lyrics: {lyrics}")
+        return lyrics
+
+    def convert_song(self, song_path: Path, voice_path: Path, output_path: Path,
+                     language: str = 'english', add_effects: bool = True) -> Path:
+        """
+        Convert song to user's voice (multilingual support).
+
+        Pipeline:
+        1. Separate vocals from instrumental (Demucs)
+        2. Extract lyrics (placeholder or Whisper)
+        3. Synthesize with user's voice (language-aware)
+        4. Mix synthesized vocals with instrumental
+        5. Add audio effects
+
+        Args:
+            song_path: Path to input song
+            voice_path: Path to reference voice sample
+            output_path: Path for output song
+            language: 'english' or 'hindi'
+            add_effects: Whether to add reverb/compression
+
+        Returns:
+            Path to output song
+        """
+        song_path = Path(song_path)
+        voice_path = Path(voice_path)
+        output_path = Path(output_path)
+        language = language.lower()
+
+        try:
+            print(f"\n[MultilingualSongProcessor] ========== SONG CONVERSION START ==========")
+            print(f"[MultilingualSongProcessor] Language: {language.upper()}")
+            print(f"[MultilingualSongProcessor] Song: {song_path}")
+            print(f"[MultilingualSongProcessor] Voice: {voice_path}")
+            print(f"[MultilingualSongProcessor] Output: {output_path}")
+
+            # Step 1: Separate vocals
+            print(f"\n[MultilingualSongProcessor] STEP 1: Separating vocals...")
+            separator = self._ensure_separator()
+            vocals, instrumental = separator.separate(song_path, sr=self.sr)
+
+            # Step 2: Extract/prepare lyrics
+            print(f"\n[MultilingualSongProcessor] STEP 2: Preparing lyrics...")
+            lyrics = self._extract_lyrics_from_audio(song_path)
+
+            # Step 3-4: Synthesize and mix using multilingual TTS
+            print(f"\n[MultilingualSongProcessor] STEP 3-4: Synthesizing vocals with {language.upper()} model...")
+            tts_service = self._ensure_tts_service()
+
+            try:
+                synthesized_vocal = tts_service.synthesize(lyrics, voice_path, language)
+            except Exception as e:
+                print(f"[MultilingualSongProcessor] Synthesis error: {e}")
+                raise
+
+            # Resample if needed (XTTS uses 24kHz, we need 16kHz for mixing)
+            if len(synthesized_vocal.shape) > 1:
+                synthesized_vocal = np.mean(synthesized_vocal, axis=1)
+
+            if language == Language.HINDI.value:
+                # XTTS uses 24kHz, resample to 16kHz for consistency
+                from scipy import signal
+                num_samples = int(len(synthesized_vocal) * (self.sr / 24000))
+                synthesized_vocal = signal.resample(synthesized_vocal, num_samples)
+
+            synthesized_vocal = synthesized_vocal.astype(np.float32)
+            print(f"[MultilingualSongProcessor] Synthesized vocal shape: {synthesized_vocal.shape}")
+
+            # Step 5: Mix with instrumental
+            print(f"\n[MultilingualSongProcessor] STEP 5: Mixing vocals with instrumental...")
+            final_audio = AudioMixer.mix_and_save(
+                synthesized_vocal, instrumental,
+                output_path, sr=self.sr,
+                add_effects=add_effects
+            )
+
+            # Cleanup
+            print(f"\n[MultilingualSongProcessor] Cleaning up models...")
+            try:
+                gc.collect()
+                if torch.cuda.is_available():
+                    torch.cuda.empty_cache()
+            except Exception as e:
+                print(f"[MultilingualSongProcessor] Warning during cleanup: {e}")
+
+            print(f"\n[MultilingualSongProcessor] ========== SONG CONVERSION COMPLETE ==========")
+            print(f"[MultilingualSongProcessor] Output saved to: {final_audio}")
+
+            return final_audio
+
+        except Exception as e:
+            print(f"\n[MultilingualSongProcessor] ✗ ERROR: {e}")
+            import traceback
+            traceback.print_exc()
+            sys.stdout.flush()
+            raise
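
A minimal usage sketch of the new processor (the paths below are hypothetical; the constructor and convert_song signature are as defined in the diff above):

    from pathlib import Path
    from app.multilingual_song_processor import MultilingualSongProcessor

    # Sketch only: the directory layout is an assumption.
    processor = MultilingualSongProcessor(
        models_dir=Path("models"),                  # English encoder/synthesizer/vocoder
        hindi_model_dir=Path("models/xtts_hindi"),  # XTTS directory containing config.json
    )
    result = processor.convert_song(
        song_path=Path("uploads/song.mp3"),
        voice_path=Path("uploads/voice_sample.wav"),
        output_path=Path("outputs/converted.wav"),
        language="hindi",   # Hindi vocals are resampled from 24 kHz to 16 kHz before mixing
        add_effects=True,
    )
    print(f"Converted song: {result}")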
backend/app/multilingual_tts.py ADDED
@@ -0,0 +1,234 @@
+"""Multilingual TTS Service - Supports English (WaveRNN) and Hindi (XTTS)."""
+
+import gc
+import torch
+import numpy as np
+from pathlib import Path
+from typing import Optional, Union
+from enum import Enum
+import sys
+
+
+class Language(str, Enum):
+    """Supported languages."""
+    ENGLISH = "english"
+    HINDI = "hindi"
+
+
+class MultilingualTTSService:
+    """
+    Unified TTS service supporting multiple languages.
+
+    - English: Uses existing WaveRNN vocoder + Tacotron2 synthesizer + encoder
+    - Hindi: Uses XTTS (Coqui TTS) model
+    """
+
+    def __init__(self, models_dir: Path, hindi_model_dir: Optional[Path] = None):
+        """
+        Initialize multilingual TTS service.
+
+        Args:
+            models_dir: Directory with English models (encoder.pt, synthesizer.pt, vocoder.pt)
+            hindi_model_dir: Directory with XTTS Hindi model. If None, Hindi support disabled.
+        """
+        self.models_dir = Path(models_dir)
+        self.hindi_model_dir = Path(hindi_model_dir) if hindi_model_dir else None
+
+        # Track loaded models
+        self._encoder_model = None
+        self._synthesizer_model = None
+        self._vocoder_model = None
+        self._xtts_model = None
+
+        self.sr = 16000
+
+        print("[MultilingualTTSService] Initialized")
+        print(f"[MultilingualTTSService] English models dir: {self.models_dir}")
+        if self.hindi_model_dir:
+            print(f"[MultilingualTTSService] Hindi XTTS dir: {self.hindi_model_dir}")
+        else:
+            print("[MultilingualTTSService] Hindi support: DISABLED (no model path)")
+
+    def _load_english_models(self):
+        """Load English voice cloning models (lazy load)."""
+        if self._encoder_model is None:
+            print("[MultilingualTTSService] Loading English encoder...")
+            from encoder import inference as encoder_infer
+            enc_path = self.models_dir / "default" / "encoder.pt"
+            if not enc_path.exists():
+                raise RuntimeError(f"English encoder model missing: {enc_path}")
+            encoder_infer.load_model(enc_path)
+            self._encoder_model = True
+            print("[MultilingualTTSService] ✓ English encoder loaded")
+
+        if self._synthesizer_model is None:
+            print("[MultilingualTTSService] Loading English synthesizer...")
+            from synthesizer import inference as synthesizer_infer
+            syn_path = self.models_dir / "default" / "synthesizer.pt"
+            if not syn_path.exists():
+                raise RuntimeError(f"English synthesizer model missing: {syn_path}")
+            self._synthesizer_model = synthesizer_infer.Synthesizer(syn_path)
+            print("[MultilingualTTSService] ✓ English synthesizer loaded")
+
+        if self._vocoder_model is None:
+            print("[MultilingualTTSService] Loading English vocoder...")
+            from app.vocoder import inference as vocoder_infer
+            voc_path = self.models_dir / "default" / "vocoder.pt"
+            if not voc_path.exists():
+                raise RuntimeError(f"English vocoder model missing: {voc_path}")
+            vocoder_infer.load_model(voc_path)
+            self._vocoder_model = True
+            print("[MultilingualTTSService] ✓ English vocoder loaded")
+
+    def _load_hindi_models(self):
+        """Load Hindi XTTS model (lazy load)."""
+        if not self.hindi_model_dir:
+            raise RuntimeError("Hindi model not configured. Set hindi_model_dir path.")
+
+        if self._xtts_model is None:
+            print("[MultilingualTTSService] Loading Hindi XTTS model...")
+            try:
+                from TTS.api import TTS
+            except ImportError:
+                raise ImportError(
+                    "TTS library required for Hindi support. "
+                    "Install with: pip install TTS>=0.21.0"
+                )
+
+            config_path = self.hindi_model_dir / "config.json"
+            if not config_path.exists():
+                raise RuntimeError(f"Hindi model config missing: {config_path}")
+
+            # Load XTTS model
+            self._xtts_model = TTS(
+                model_path=str(self.hindi_model_dir.resolve().as_posix()),
+                config_path=str(config_path),
+                gpu=False  # Set to True if CUDA available and needed
+            )
+            print("[MultilingualTTSService] ✓ Hindi XTTS loaded")
+
+    def synthesize(self, text: str, voice_sample_path: Union[str, Path],
+                   language: str = "english") -> np.ndarray:
+        """
+        Synthesize speech in specified language.
+
+        Args:
+            text: Text to synthesize
+            voice_sample_path: Path to reference voice sample
+            language: "english" or "hindi"
+
+        Returns:
+            Audio waveform as numpy array
+        """
+        language = language.lower()
+
+        if language == Language.ENGLISH:
+            return self._synthesize_english(text, voice_sample_path)
+        elif language == Language.HINDI:
+            return self._synthesize_hindi(text, voice_sample_path)
+        else:
+            raise ValueError(f"Unsupported language: {language}")
+
+    def _synthesize_english(self, text: str, voice_sample_path: Union[str, Path]) -> np.ndarray:
+        """Synthesize English speech using WaveRNN + Tacotron2."""
+        from encoder import inference as encoder_infer
+        from app.vocoder import inference as vocoder_infer
+
+        self._load_english_models()
+
+        print(f"[MultilingualTTSService] Synthesizing English: {text[:50]}...")
+
+        # Embed voice
+        wav = encoder_infer.preprocess_wav(voice_sample_path)
+        embed = encoder_infer.embed_utterance(wav)
+
+        # Generate mel
+        mels = self._synthesizer_model.synthesize_spectrograms([text], [embed])
+        mel = mels[0]
+
+        # Vocalize
+        try:
+            synthesized = vocoder_infer.infer_waveform(
+                mel, normalize=True, batched=False, target=8000, overlap=800
+            ).astype(np.float32)
+        except Exception as e:
+            print(f"[MultilingualTTSService] Vocoder failed: {e}, using Griffin-Lim fallback")
+            synthesized = self._synthesizer_model.griffin_lim(mel).astype(np.float32)
+
+        # Normalize
+        max_val = np.max(np.abs(synthesized))
+        if max_val > 0:
+            target_level = 0.707
+            synthesized = synthesized * (target_level / max_val)
+
+        return np.clip(synthesized, -1.0, 1.0)
+
+    def _synthesize_hindi(self, text: str, voice_sample_path: Union[str, Path]) -> np.ndarray:
+        """Synthesize Hindi speech using XTTS model."""
+        self._load_hindi_models()
+
+        print(f"[MultilingualTTSService] Synthesizing Hindi: {text[:50]}...")
+
+        # XTTS synthesize
+        audio = self._xtts_model.tts(
+            text=text,
+            speaker_wav=str(voice_sample_path),
+            language="hi"
+        )
+
+        # Convert to float32 if needed
+        audio = np.asarray(audio, dtype=np.float32)
+
+        # Normalize
+        max_val = np.max(np.abs(audio))
+        if max_val > 0:
+            target_level = 0.707
+            audio = audio * (target_level / max_val)
+
+        return np.clip(audio, -1.0, 1.0)
+
+    def synthesize_and_save(self, text: str, voice_sample_path: Union[str, Path],
+                            output_path: Union[str, Path], language: str = "english") -> Path:
+        """
+        Synthesize and save to file.
+
+        Args:
+            text: Text to synthesize
+            voice_sample_path: Path to reference voice
+            output_path: Where to save audio
+            language: "english" or "hindi"
+
+        Returns:
+            Path to output file
+        """
+        import soundfile as sf
+
+        output_path = Path(output_path)
+
+        try:
+            audio = self.synthesize(text, voice_sample_path, language)
+
+            # Determine sample rate based on language
+            sr = 24000 if language.lower() == Language.HINDI else 16000
+
+            sf.write(output_path, audio, sr)
+            print(f"[MultilingualTTSService] Audio saved: {output_path}")
+            return output_path

+        except Exception as e:
+            print(f"[MultilingualTTSService] Error during synthesis: {e}")
+            raise
+
+    def cleanup(self):
+        """Release model memory."""
+        print("[MultilingualTTSService] Cleaning up models...")
+        try:
+            self._encoder_model = None
+            self._synthesizer_model = None
+            self._vocoder_model = None
+            self._xtts_model = None
+            gc.collect()
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+        except Exception as e:
+            print(f"[MultilingualTTSService] Cleanup warning: {e}")
backend/app/routes.py CHANGED
@@ -1,6 +1,7 @@
 """
-Flask API Backend for Voice Cloning
+"""Flask API Backend for Voice Cloning
 Integrates the Python voice cloning backend with the React frontend
+Supports multilingual synthesis: English (WaveRNN) and Hindi (XTTS)
 """
 
 from flask import Blueprint, request, jsonify, send_file
@@ -9,6 +10,7 @@ import uuid
 import json
 from datetime import datetime
 import sys
+import os
 
 from .voice_cloning import synthesize
 
@@ -22,6 +24,24 @@ OUTPUT_FOLDER = BASE_DIR / 'outputs'
 MODELS_DIR = BASE_DIR / 'models'
 VOICES_DB = UPLOAD_FOLDER / 'voices.json'
 
+# Hindi model directory (check multiple possible locations)
+HINDI_MODEL_DIR = None
+possible_hindi_dirs = [
+    Path(os.getenv('HINDI_MODEL_PATH', '')) if os.getenv('HINDI_MODEL_PATH') else None,
+    BASE_DIR.parent / 'Apoorv_hindi_model' / 'models' / 'xtts_hindi',  # Local development
+    BASE_DIR / 'models' / 'xtts_hindi',  # Alternative location
+]
+for path in possible_hindi_dirs:
+    if path and path.exists():
+        HINDI_MODEL_DIR = path
+        print(f"✓ Hindi model found at: {HINDI_MODEL_DIR}")
+        break
+
+if not HINDI_MODEL_DIR:
+    print("⚠ Hindi model not found. Hindi synthesis will be unavailable.")
+    print("  To enable Hindi support, set HINDI_MODEL_PATH environment variable")
+    print("  or place model at: Apoorv_hindi_model/models/xtts_hindi")
+
 # Create directories with parents
 try:
     UPLOAD_FOLDER.mkdir(parents=True, exist_ok=True)
@@ -152,8 +172,18 @@ def get_voices():
 @bp.route('/synthesize', methods=['POST'])
 def synthesize_speech():
     """
-    Synthesize speech from text using enrolled voice
-    Frontend sends: { "text": "...", "voiceId": "voice_xxx" }
+    Synthesize speech from text using enrolled voice (multilingual support).
+
+    Frontend sends JSON:
+    {
+        "text": "Your text here",
+        "voice_id": "voice_xxx",
+        "language": "english" or "hindi" (optional, defaults to english)
+    }
+
+    Supports:
+    - English: Uses WaveRNN vocoder (existing model)
+    - Hindi: Uses XTTS model (requires hindi_model_dir)
     """
     try:
         data = request.get_json()
@@ -162,7 +192,8 @@ def synthesize_speech():
            return jsonify({'error': 'No data provided'}), 400
 
        text = data.get('text', '').strip()
-        voice_id = data.get('voice_id', '')  # Changed from 'voiceId' to 'voice_id'
+        voice_id = data.get('voice_id', '')
+        language = data.get('language', 'english').lower()
 
        if not text:
            return jsonify({'error': 'No text provided'}), 400
@@ -170,6 +201,16 @@ def synthesize_speech():
        if not voice_id:
            return jsonify({'error': 'No voice selected'}), 400
 
+        if language not in ['english', 'hindi']:
+            return jsonify({'error': f'Unsupported language: {language}. Supported: english, hindi'}), 400
+
+        # Check if Hindi model is available for Hindi synthesis
+        if language == 'hindi' and not HINDI_MODEL_DIR:
+            return jsonify({
+                'error': 'Hindi synthesis unavailable. Hindi model not configured.',
+                'available_languages': ['english']
+            }), 503
+
        # Find the voice in database
        voices = load_voices_db()
        voice = next((v for v in voices if v['id'] == voice_id), None)
@@ -177,7 +218,7 @@ def synthesize_speech():
        if not voice:
            return jsonify({'error': 'Voice not found'}), 404
 
-        # Reconstruct path from UPLOAD_FOLDER (server-agnostic)
+        # Reconstruct path from UPLOAD_FOLDER
        voice_filepath = UPLOAD_FOLDER / voice['filename']
 
        if not voice_filepath.exists():
@@ -187,28 +228,43 @@ def synthesize_speech():
        output_filename = f"synthesis_{uuid.uuid4().hex[:8]}.wav"
        output_path = OUTPUT_FOLDER / output_filename
 
-        # Call the voice cloning synthesis function
-        print(f"Synthesizing: '{text}' with voice '{voice['name']}'")
-        print(f"Voice file: {voice_filepath}")
-        print(f"Output path: {output_path}")
-        print(f"Models dir: {MODELS_DIR}")
-        print("Starting synthesis... This may take 30-60 seconds...")
+        print(f"\n[API /synthesize]")
+        print(f" Language: {language.upper()}")
+        print(f" Text: '{text[:50]}...'")
+        print(f" Voice: '{voice['name']}'")
+        print(f" Voice file: {voice_filepath}")
+        print(f" Output: {output_path}")
+        sys.stdout.flush()
 
        try:
-            # Flush output to see logs immediately
-            sys.stdout.flush()
-
-            synthesize(
-                voice_path=voice_filepath,
-                text=text,
-                models_dir=MODELS_DIR,
-                out_path=output_path
-            )
+            if language == 'english':
+                # Use original English synthesis (WaveRNN)
+                synthesize(
+                    voice_path=voice_filepath,
+                    text=text,
+                    models_dir=MODELS_DIR,
+                    out_path=output_path
+                )
+            else:
+                # Use multilingual TTS for Hindi
+                from app.multilingual_tts import MultilingualTTSService
+                tts_service = MultilingualTTSService(
+                    models_dir=MODELS_DIR,
+                    hindi_model_dir=HINDI_MODEL_DIR
+                )
+                tts_service.synthesize_and_save(
+                    text=text,
+                    voice_sample_path=voice_filepath,
+                    output_path=output_path,
+                    language=language
+                )
+                tts_service.cleanup()
 
-            print(f"Synthesis completed! Output saved to: {output_path}")
+            print(f"[API /synthesize] ✓ Synthesis completed!")
            sys.stdout.flush()
+
        except Exception as synth_error:
-            print(f"Synthesis error: {synth_error}")
+            print(f"[API /synthesize] ✗ Synthesis error: {synth_error}")
            import traceback
            traceback.print_exc()
            sys.stdout.flush()
@@ -221,12 +277,13 @@ def synthesize_speech():
        # Return the audio file URL
        return jsonify({
            'success': True,
-            'message': 'Speech synthesized successfully',
-            'audio_url': f'/api/audio/{output_filename}'
+            'message': f'{language.capitalize()} speech synthesized successfully',
+            'audio_url': f'/api/audio/{output_filename}',
+            'language': language
        }), 200
 
    except Exception as e:
-        print(f"Error synthesizing speech: {e}")
+        print(f"[API /synthesize] Unexpected error: {e}")
        import traceback
        traceback.print_exc()
        return jsonify({'error': f'Failed to synthesize speech: {str(e)}'}), 500
@@ -469,17 +526,19 @@ def convert_song():
        print(f"[API] Language: {language}")
        print(f"[API] Add effects: {add_effects}")
 
-        # Import song processor
-        from app.song_conversion.song_processor import SongProcessor
+        # Import multilingual song processor
+        from app.multilingual_song_processor import MultilingualSongProcessor
 
-        processor = SongProcessor(MODELS_DIR)
+        processor = MultilingualSongProcessor(
+            models_dir=MODELS_DIR,
+            hindi_model_dir=HINDI_MODEL_DIR if language == 'hindi' else None
+        )
        result_path = processor.convert_song(
            song_path=song_path,
            voice_path=voice_filepath,
            output_path=output_path,
            language=language,
-            add_effects=add_effects,
-            models_dir=MODELS_DIR
+            add_effects=add_effects
        )
 
        print(f"[API] Song conversion complete: {result_path}")
@@ -489,7 +548,8 @@
            'success': True,
            'message': 'Song converted successfully',
            'audio_url': f'/api/audio/{output_filename}',
-            'filename': output_filename
+            'filename': output_filename,
+            'language': language
        }), 200
 
    except Exception as e:
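
Because HINDI_MODEL_DIR is resolved once at import time, the environment variable has to be set before routes.py is imported; a sketch (the path is hypothetical):

    # Sketch only: must run before importing the routes module / starting the app.
    import os
    os.environ["HINDI_MODEL_PATH"] = "/opt/models/xtts_hindi"  # directory containing config.json
    # Without a discovered model, POST /api/synthesize with {"language": "hindi"}
    # returns 503 with available_languages: ["english"].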
backend/requirements.txt CHANGED
@@ -14,3 +14,4 @@ unidecode>=1.2.0
 inflect>=6.0.0
 demucs>=4.0.0
 pydub>=0.25.1
+TTS>=0.21.0
frontend/.env.production CHANGED
@@ -1,2 +1,2 @@
 # Production deployment
-VITE_API_URL=https://voice-cloning-personalized-speech.onrender.com
+VITE_API_URL=https://aj50-voice-cloning-backend.hf.space
frontend/src/components/forms/SpeechSynthesis.tsx CHANGED
@@ -23,6 +23,8 @@ interface Voice {
 
 interface SpeechSynthesisProps {
   voices?: Voice[];
+  language?: 'english' | 'hindi';
+  onLanguageChange?: (language: 'english' | 'hindi') => void;
   onSynthesisComplete?: (audioUrl: string) => void;
   className?: string;
 }
@@ -36,6 +38,8 @@ const sampleTexts = {
 
 export default function SpeechSynthesis({
   voices: propVoices,
+  language = 'english',
+  onLanguageChange,
   onSynthesisComplete,
   className = ""
 }: SpeechSynthesisProps) {
@@ -113,8 +117,8 @@ export default function SpeechSynthesis({
     setSynthesizerStartTime(Date.now()); // Record synthesis start time
 
     try {
-      // Call backend API for synthesis
-      const result = await api.synthesize(selectedVoice, inputText);
+      // Call backend API for synthesis with language support
+      const result = await api.synthesize(selectedVoice, inputText, language);
 
       // Get the audio file URL from backend with cache busting
       const audioUrl = api.getAudioUrl(result.audio_url) + `?t=${Date.now()}`;
@@ -228,6 +232,30 @@
         </Button>
       </CardHeader>
       <CardContent className="space-y-6">
+        {/* Language Selector */}
+        <div className="flex gap-2">
+          <button
+            onClick={() => onLanguageChange?.('english')}
+            className={`flex-1 py-2 px-4 rounded-lg font-medium transition-all ${
+              language === 'english'
+                ? 'bg-blue-600 text-white shadow-lg'
+                : 'bg-gray-200 text-gray-700 hover:bg-gray-300'
+            }`}
+          >
+            🇬🇧 English
+          </button>
+          <button
+            onClick={() => onLanguageChange?.('hindi')}
+            className={`flex-1 py-2 px-4 rounded-lg font-medium transition-all ${
+              language === 'hindi'
+                ? 'bg-orange-600 text-white shadow-lg'
+                : 'bg-gray-200 text-gray-700 hover:bg-gray-300'
+            }`}
+          >
+            🇮🇳 हिन्दी
+          </button>
+        </div>
+
         {/* Voice Selection */}
         <div className="space-y-2">
           <Label htmlFor="voice-select">Select Voice</Label>
frontend/src/pages/Index.tsx CHANGED
@@ -251,6 +251,8 @@ const Index = () => {
           <TabsContent value="synthesize" className="space-y-6">
             <SpeechSynthesis
               voices={enrolledVoices.length ? enrolledVoices : undefined}
+              language={language}
+              onLanguageChange={setLanguage}
               onSynthesisComplete={handleSynthesisComplete}
             />
 
frontend/src/services/api.ts CHANGED
@@ -41,9 +41,9 @@ export const api = {
   },
 
   /**
-   * Synthesize speech from text
+   * Synthesize speech from text (supports multilingual: english, hindi)
    */
-  synthesize: async (voiceId: string, text: string) => {
+  synthesize: async (voiceId: string, text: string, language: string = 'english') => {
     const response = await fetch(api.getUrl('/synthesize'), {
       method: 'POST',
       headers: {
@@ -52,6 +52,7 @@ export const api = {
       body: JSON.stringify({
         voice_id: voiceId,
         text: text,
+        language: language,
       }),
     });
     if (!response.ok) {