Add multilingual support: English (WaveRNN) + Hindi (XTTS) [sync with pragyan]
Backend Changes:
- multilingual_tts.py: Unified TTS service supporting English and Hindi
- multilingual_song_processor.py: Orchestrator for multilingual song conversion
- routes.py: Updated /api/synthesize and /api/convert_song to support language parameter
- requirements.txt: Added TTS>=0.21.0 for XTTS Hindi model support
Frontend Changes:
- SpeechSynthesis.tsx: Added language selector buttons (English/Hindi)
- SongGeneration.tsx: Already has a language toggle (existing toggle reused)
- api.ts: Updated synthesize() to accept language parameter
- Index.tsx: Pass language state to SpeechSynthesis component
Synced with pragyan branch multilingual integration
- backend/app/multilingual_song_processor.py +167 -0
- backend/app/multilingual_tts.py +234 -0
- backend/app/routes.py +91 -31
- backend/requirements.txt +1 -0
- frontend/.env.production +1 -1
- frontend/src/components/forms/SpeechSynthesis.tsx +30 -2
- frontend/src/pages/Index.tsx +2 -0
- frontend/src/services/api.ts +3 -2
backend/app/multilingual_song_processor.py
ADDED
@@ -0,0 +1,167 @@
"""Multilingual song processing - English and Hindi support."""

import gc
import torch
import numpy as np
from pathlib import Path
from typing import Optional
import sys

from app.song_conversion.vocal_separator import VocalSeparator
from app.song_conversion.audio_mixer import AudioMixer
from app.multilingual_tts import MultilingualTTSService, Language


class MultilingualSongProcessor:
    """
    Orchestrates song voice conversion for multiple languages.

    - English songs: Uses WaveRNN voice cloning
    - Hindi songs: Uses XTTS Hindi model
    """

    def __init__(self, models_dir: Path, hindi_model_dir: Optional[Path] = None):
        """
        Initialize multilingual song processor.

        Args:
            models_dir: Directory with English models
            hindi_model_dir: Directory with Hindi XTTS model
        """
        self.models_dir = Path(models_dir)
        self.hindi_model_dir = Path(hindi_model_dir) if hindi_model_dir else None
        self.separator = None
        self.tts_service = None
        self.sr = 16000

    def _ensure_separator(self) -> VocalSeparator:
        """Lazy load vocal separator."""
        if self.separator is None:
            print("[MultilingualSongProcessor] Initializing vocal separator...")
            self.separator = VocalSeparator(model_name="htdemucs")
        return self.separator

    def _ensure_tts_service(self) -> MultilingualTTSService:
        """Lazy load TTS service."""
        if self.tts_service is None:
            print("[MultilingualSongProcessor] Initializing multilingual TTS service...")
            self.tts_service = MultilingualTTSService(
                models_dir=self.models_dir,
                hindi_model_dir=self.hindi_model_dir
            )
        return self.tts_service

    def _extract_lyrics_from_audio(self, audio_path: Path) -> str:
        """
        Extract lyrics from audio (placeholder).
        In production, would use Whisper with language detection.

        Args:
            audio_path: Path to audio file

        Returns:
            Extracted or placeholder lyrics
        """
        print("[MultilingualSongProcessor] Extracting lyrics from audio...")

        # Placeholder: return generic phonetically rich text
        # In production, use: whisper_model.transcribe(str(audio_path), language='en'/'hi')
        lyrics = "The music is playing so well with this song today"

        print(f"[MultilingualSongProcessor] Using default lyrics: {lyrics}")
        return lyrics

    def convert_song(self, song_path: Path, voice_path: Path, output_path: Path,
                     language: str = 'english', add_effects: bool = True) -> Path:
        """
        Convert song to user's voice (multilingual support).

        Pipeline:
        1. Separate vocals from instrumental (Demucs)
        2. Extract lyrics (placeholder or Whisper)
        3. Synthesize with user's voice (language-aware)
        4. Mix synthesized vocals with instrumental
        5. Add audio effects

        Args:
            song_path: Path to input song
            voice_path: Path to reference voice sample
            output_path: Path for output song
            language: 'english' or 'hindi'
            add_effects: Whether to add reverb/compression

        Returns:
            Path to output song
        """
        song_path = Path(song_path)
        voice_path = Path(voice_path)
        output_path = Path(output_path)
        language = language.lower()

        try:
            print(f"\n[MultilingualSongProcessor] ========== SONG CONVERSION START ==========")
            print(f"[MultilingualSongProcessor] Language: {language.upper()}")
            print(f"[MultilingualSongProcessor] Song: {song_path}")
            print(f"[MultilingualSongProcessor] Voice: {voice_path}")
            print(f"[MultilingualSongProcessor] Output: {output_path}")

            # Step 1: Separate vocals
            print(f"\n[MultilingualSongProcessor] STEP 1: Separating vocals...")
            separator = self._ensure_separator()
            vocals, instrumental = separator.separate(song_path, sr=self.sr)

            # Step 2: Extract/prepare lyrics
            print(f"\n[MultilingualSongProcessor] STEP 2: Preparing lyrics...")
            lyrics = self._extract_lyrics_from_audio(song_path)

            # Step 3-4: Synthesize and mix using multilingual TTS
            print(f"\n[MultilingualSongProcessor] STEP 3-4: Synthesizing vocals with {language.upper()} model...")
            tts_service = self._ensure_tts_service()

            try:
                synthesized_vocal = tts_service.synthesize(lyrics, voice_path, language)
            except Exception as e:
                print(f"[MultilingualSongProcessor] Synthesis error: {e}")
                raise

            # Resample if needed (XTTS uses 24kHz, we need 16kHz for mixing)
            if len(synthesized_vocal.shape) > 1:
                synthesized_vocal = np.mean(synthesized_vocal, axis=1)

            if language == Language.HINDI.value:
                # XTTS uses 24kHz, resample to 16kHz for consistency
                from scipy import signal
                num_samples = int(len(synthesized_vocal) * (self.sr / 24000))
                synthesized_vocal = signal.resample(synthesized_vocal, num_samples)

            synthesized_vocal = synthesized_vocal.astype(np.float32)
            print(f"[MultilingualSongProcessor] Synthesized vocal shape: {synthesized_vocal.shape}")

            # Step 5: Mix with instrumental
            print(f"\n[MultilingualSongProcessor] STEP 5: Mixing vocals with instrumental...")
            final_audio = AudioMixer.mix_and_save(
                synthesized_vocal, instrumental,
                output_path, sr=self.sr,
                add_effects=add_effects
            )

            # Cleanup
            print(f"\n[MultilingualSongProcessor] Cleaning up models...")
            try:
                gc.collect()
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
            except Exception as e:
                print(f"[MultilingualSongProcessor] Warning during cleanup: {e}")

            print(f"\n[MultilingualSongProcessor] ========== SONG CONVERSION COMPLETE ==========")
            print(f"[MultilingualSongProcessor] Output saved to: {final_audio}")

            return final_audio

        except Exception as e:
            print(f"\n[MultilingualSongProcessor] ✗ ERROR: {e}")
            import traceback
            traceback.print_exc()
            sys.stdout.flush()
            raise
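Usage note: a minimal sketch of driving the new processor directly, assuming hypothetical local paths (the real route wires in MODELS_DIR and HINDI_MODEL_DIR from routes.py, see below):

from pathlib import Path
from app.multilingual_song_processor import MultilingualSongProcessor

# Paths below are hypothetical placeholders for illustration only.
processor = MultilingualSongProcessor(
    models_dir=Path("models"),
    hindi_model_dir=Path("models/xtts_hindi"),
)
result = processor.convert_song(
    song_path=Path("uploads/song.mp3"),
    voice_path=Path("uploads/voice_sample.wav"),
    output_path=Path("outputs/converted.wav"),
    language="hindi",   # the underlying TTS service raises ValueError for anything but english/hindi
    add_effects=True,
)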
backend/app/multilingual_tts.py
ADDED
@@ -0,0 +1,234 @@
"""Multilingual TTS Service - Supports English (WaveRNN) and Hindi (XTTS)."""

import gc
import torch
import numpy as np
from pathlib import Path
from typing import Optional, Union
from enum import Enum
import sys


class Language(str, Enum):
    """Supported languages."""
    ENGLISH = "english"
    HINDI = "hindi"


class MultilingualTTSService:
    """
    Unified TTS service supporting multiple languages.

    - English: Uses existing WaveRNN vocoder + Tacotron2 synthesizer + encoder
    - Hindi: Uses XTTS (Coqui TTS) model
    """

    def __init__(self, models_dir: Path, hindi_model_dir: Optional[Path] = None):
        """
        Initialize multilingual TTS service.

        Args:
            models_dir: Directory with English models (encoder.pt, synthesizer.pt, vocoder.pt)
            hindi_model_dir: Directory with XTTS Hindi model. If None, Hindi support disabled.
        """
        self.models_dir = Path(models_dir)
        self.hindi_model_dir = Path(hindi_model_dir) if hindi_model_dir else None

        # Track loaded models
        self._encoder_model = None
        self._synthesizer_model = None
        self._vocoder_model = None
        self._xtts_model = None

        self.sr = 16000

        print("[MultilingualTTSService] Initialized")
        print(f"[MultilingualTTSService] English models dir: {self.models_dir}")
        if self.hindi_model_dir:
            print(f"[MultilingualTTSService] Hindi XTTS dir: {self.hindi_model_dir}")
        else:
            print("[MultilingualTTSService] Hindi support: DISABLED (no model path)")

    def _load_english_models(self):
        """Load English voice cloning models (lazy load)."""
        if self._encoder_model is None:
            print("[MultilingualTTSService] Loading English encoder...")
            from encoder import inference as encoder_infer
            enc_path = self.models_dir / "default" / "encoder.pt"
            if not enc_path.exists():
                raise RuntimeError(f"English encoder model missing: {enc_path}")
            encoder_infer.load_model(enc_path)
            self._encoder_model = True
            print("[MultilingualTTSService] ✓ English encoder loaded")

        if self._synthesizer_model is None:
            print("[MultilingualTTSService] Loading English synthesizer...")
            from synthesizer import inference as synthesizer_infer
            syn_path = self.models_dir / "default" / "synthesizer.pt"
            if not syn_path.exists():
                raise RuntimeError(f"English synthesizer model missing: {syn_path}")
            self._synthesizer_model = synthesizer_infer.Synthesizer(syn_path)
            print("[MultilingualTTSService] ✓ English synthesizer loaded")

        if self._vocoder_model is None:
            print("[MultilingualTTSService] Loading English vocoder...")
            from app.vocoder import inference as vocoder_infer
            voc_path = self.models_dir / "default" / "vocoder.pt"
            if not voc_path.exists():
                raise RuntimeError(f"English vocoder model missing: {voc_path}")
            vocoder_infer.load_model(voc_path)
            self._vocoder_model = True
            print("[MultilingualTTSService] ✓ English vocoder loaded")

    def _load_hindi_models(self):
        """Load Hindi XTTS model (lazy load)."""
        if not self.hindi_model_dir:
            raise RuntimeError("Hindi model not configured. Set hindi_model_dir path.")

        if self._xtts_model is None:
            print("[MultilingualTTSService] Loading Hindi XTTS model...")
            try:
                from TTS.api import TTS
            except ImportError:
                raise ImportError(
                    "TTS library required for Hindi support. "
                    "Install with: pip install TTS>=0.21.0"
                )

            config_path = self.hindi_model_dir / "config.json"
            if not config_path.exists():
                raise RuntimeError(f"Hindi model config missing: {config_path}")

            # Load XTTS model
            self._xtts_model = TTS(
                model_path=str(self.hindi_model_dir.resolve().as_posix()),
                config_path=str(config_path),
                gpu=False  # Set to True if CUDA available and needed
            )
            print("[MultilingualTTSService] ✓ Hindi XTTS loaded")

    def synthesize(self, text: str, voice_sample_path: Union[str, Path],
                   language: str = "english") -> np.ndarray:
        """
        Synthesize speech in specified language.

        Args:
            text: Text to synthesize
            voice_sample_path: Path to reference voice sample
            language: "english" or "hindi"

        Returns:
            Audio waveform as numpy array
        """
        language = language.lower()

        if language == Language.ENGLISH:
            return self._synthesize_english(text, voice_sample_path)
        elif language == Language.HINDI:
            return self._synthesize_hindi(text, voice_sample_path)
        else:
            raise ValueError(f"Unsupported language: {language}")

    def _synthesize_english(self, text: str, voice_sample_path: Union[str, Path]) -> np.ndarray:
        """Synthesize English speech using WaveRNN + Tacotron2."""
        from encoder import inference as encoder_infer
        from app.vocoder import inference as vocoder_infer

        self._load_english_models()

        print(f"[MultilingualTTSService] Synthesizing English: {text[:50]}...")

        # Embed voice
        wav = encoder_infer.preprocess_wav(voice_sample_path)
        embed = encoder_infer.embed_utterance(wav)

        # Generate mel
        mels = self._synthesizer_model.synthesize_spectrograms([text], [embed])
        mel = mels[0]

        # Vocalize
        try:
            synthesized = vocoder_infer.infer_waveform(
                mel, normalize=True, batched=False, target=8000, overlap=800
            ).astype(np.float32)
        except Exception as e:
            print(f"[MultilingualTTSService] Vocoder failed: {e}, using Griffin-Lim fallback")
            synthesized = self._synthesizer_model.griffin_lim(mel).astype(np.float32)

        # Normalize
        max_val = np.max(np.abs(synthesized))
        if max_val > 0:
            target_level = 0.707
            synthesized = synthesized * (target_level / max_val)

        return np.clip(synthesized, -1.0, 1.0)

    def _synthesize_hindi(self, text: str, voice_sample_path: Union[str, Path]) -> np.ndarray:
        """Synthesize Hindi speech using XTTS model."""
        self._load_hindi_models()

        print(f"[MultilingualTTSService] Synthesizing Hindi: {text[:50]}...")

        # XTTS synthesize
        audio = self._xtts_model.tts(
            text=text,
            speaker_wav=str(voice_sample_path),
            language="hi"
        )

        # Convert to float32 if needed
        audio = np.asarray(audio, dtype=np.float32)

        # Normalize
        max_val = np.max(np.abs(audio))
        if max_val > 0:
            target_level = 0.707
            audio = audio * (target_level / max_val)

        return np.clip(audio, -1.0, 1.0)

    def synthesize_and_save(self, text: str, voice_sample_path: Union[str, Path],
                            output_path: Union[str, Path], language: str = "english") -> Path:
        """
        Synthesize and save to file.

        Args:
            text: Text to synthesize
            voice_sample_path: Path to reference voice
            output_path: Where to save audio
            language: "english" or "hindi"

        Returns:
            Path to output file
        """
        import soundfile as sf

        output_path = Path(output_path)

        try:
            audio = self.synthesize(text, voice_sample_path, language)

            # Determine sample rate based on language
            sr = 24000 if language.lower() == Language.HINDI else 16000

            sf.write(output_path, audio, sr)
            print(f"[MultilingualTTSService] Audio saved: {output_path}")
            return output_path

        except Exception as e:
            print(f"[MultilingualTTSService] Error during synthesis: {e}")
            raise

    def cleanup(self):
        """Release model memory."""
        print("[MultilingualTTSService] Cleaning up models...")
        try:
            self._encoder_model = None
            self._synthesizer_model = None
            self._vocoder_model = None
            self._xtts_model = None
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
        except Exception as e:
            print(f"[MultilingualTTSService] Cleanup warning: {e}")
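A similar hedged sketch for using the service on its own, with hypothetical model locations; note that synthesize() returns 16 kHz audio for English and 24 kHz for Hindi (XTTS), which is why synthesize_and_save() picks the sample rate by language:

from pathlib import Path
from app.multilingual_tts import MultilingualTTSService

# Hypothetical model locations, for illustration only.
service = MultilingualTTSService(
    models_dir=Path("models"),
    hindi_model_dir=Path("models/xtts_hindi"),
)
out = service.synthesize_and_save(
    text="नमस्ते, यह एक परीक्षण है",  # "Hello, this is a test"
    voice_sample_path=Path("uploads/voice_sample.wav"),
    output_path=Path("outputs/hindi_test.wav"),
    language="hindi",  # saved at 24 kHz, the XTTS output rate
)
service.cleanup()  # drop encoder/synthesizer/vocoder/XTTS references and empty the CUDA cache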
backend/app/routes.py
CHANGED
@@ -1,6 +1,7 @@
 """
-Flask API Backend for Voice Cloning
+Flask API Backend for Voice Cloning
 Integrates the Python voice cloning backend with the React frontend
+Supports multilingual synthesis: English (WaveRNN) and Hindi (XTTS)
 """
 
 from flask import Blueprint, request, jsonify, send_file
@@ -9,6 +10,7 @@ import uuid
 import json
 from datetime import datetime
 import sys
+import os
 
 from .voice_cloning import synthesize
 
@@ -22,6 +24,24 @@ OUTPUT_FOLDER = BASE_DIR / 'outputs'
 MODELS_DIR = BASE_DIR / 'models'
 VOICES_DB = UPLOAD_FOLDER / 'voices.json'
 
+# Hindi model directory (check multiple possible locations)
+HINDI_MODEL_DIR = None
+possible_hindi_dirs = [
+    Path(os.getenv('HINDI_MODEL_PATH', '')) if os.getenv('HINDI_MODEL_PATH') else None,
+    BASE_DIR.parent / 'Apoorv_hindi_model' / 'models' / 'xtts_hindi',  # Local development
+    BASE_DIR / 'models' / 'xtts_hindi',  # Alternative location
+]
+for path in possible_hindi_dirs:
+    if path and path.exists():
+        HINDI_MODEL_DIR = path
+        print(f"✓ Hindi model found at: {HINDI_MODEL_DIR}")
+        break
+
+if not HINDI_MODEL_DIR:
+    print("⚠ Hindi model not found. Hindi synthesis will be unavailable.")
+    print("  To enable Hindi support, set HINDI_MODEL_PATH environment variable")
+    print("  or place model at: Apoorv_hindi_model/models/xtts_hindi")
+
 # Create directories with parents
 try:
     UPLOAD_FOLDER.mkdir(parents=True, exist_ok=True)
@@ -152,8 +172,18 @@ def get_voices():
 @bp.route('/synthesize', methods=['POST'])
 def synthesize_speech():
     """
-    Synthesize speech from text using enrolled voice
-
+    Synthesize speech from text using enrolled voice (multilingual support).
+
+    Frontend sends JSON:
+    {
+        "text": "Your text here",
+        "voice_id": "voice_xxx",
+        "language": "english" or "hindi" (optional, defaults to english)
+    }
+
+    Supports:
+    - English: Uses WaveRNN vocoder (existing model)
+    - Hindi: Uses XTTS model (requires hindi_model_dir)
     """
     try:
         data = request.get_json()
@@ -162,7 +192,8 @@
             return jsonify({'error': 'No data provided'}), 400
 
         text = data.get('text', '').strip()
-        voice_id = data.get('voice_id', '')
+        voice_id = data.get('voice_id', '')
+        language = data.get('language', 'english').lower()
 
         if not text:
             return jsonify({'error': 'No text provided'}), 400
@@ -170,6 +201,16 @@
         if not voice_id:
             return jsonify({'error': 'No voice selected'}), 400
 
+        if language not in ['english', 'hindi']:
+            return jsonify({'error': f'Unsupported language: {language}. Supported: english, hindi'}), 400
+
+        # Check if Hindi model is available for Hindi synthesis
+        if language == 'hindi' and not HINDI_MODEL_DIR:
+            return jsonify({
+                'error': 'Hindi synthesis unavailable. Hindi model not configured.',
+                'available_languages': ['english']
+            }), 503
+
         # Find the voice in database
         voices = load_voices_db()
         voice = next((v for v in voices if v['id'] == voice_id), None)
@@ -177,7 +218,7 @@
         if not voice:
             return jsonify({'error': 'Voice not found'}), 404
 
-        # Reconstruct path from UPLOAD_FOLDER
+        # Reconstruct path from UPLOAD_FOLDER
         voice_filepath = UPLOAD_FOLDER / voice['filename']
 
         if not voice_filepath.exists():
@@ -187,28 +228,43 @@
         output_filename = f"synthesis_{uuid.uuid4().hex[:8]}.wav"
         output_path = OUTPUT_FOLDER / output_filename
 
-
-        print(f"
-        print(f"
-        print(f"
-        print(f"
-        print("
+        print(f"\n[API /synthesize]")
+        print(f"  Language: {language.upper()}")
+        print(f"  Text: '{text[:50]}...'")
+        print(f"  Voice: '{voice['name']}'")
+        print(f"  Voice file: {voice_filepath}")
+        print(f"  Output: {output_path}")
+        sys.stdout.flush()
 
         try:
-
-
-
-
-
-
-
-
-
+            if language == 'english':
+                # Use original English synthesis (WaveRNN)
+                synthesize(
+                    voice_path=voice_filepath,
+                    text=text,
+                    models_dir=MODELS_DIR,
+                    out_path=output_path
+                )
+            else:
+                # Use multilingual TTS for Hindi
+                from app.multilingual_tts import MultilingualTTSService
+                tts_service = MultilingualTTSService(
+                    models_dir=MODELS_DIR,
+                    hindi_model_dir=HINDI_MODEL_DIR
+                )
+                tts_service.synthesize_and_save(
+                    text=text,
+                    voice_sample_path=voice_filepath,
+                    output_path=output_path,
+                    language=language
+                )
+                tts_service.cleanup()
 
-            print(f"Synthesis completed!
+            print(f"[API /synthesize] ✓ Synthesis completed!")
             sys.stdout.flush()
+
         except Exception as synth_error:
-            print(f"Synthesis error: {synth_error}")
+            print(f"[API /synthesize] ✗ Synthesis error: {synth_error}")
             import traceback
             traceback.print_exc()
             sys.stdout.flush()
@@ -221,12 +277,13 @@
         # Return the audio file URL
         return jsonify({
             'success': True,
-            'message': '
-            'audio_url': f'/api/audio/{output_filename}'
+            'message': f'{language.capitalize()} speech synthesized successfully',
+            'audio_url': f'/api/audio/{output_filename}',
+            'language': language
        }), 200
 
     except Exception as e:
-        print(f"
+        print(f"[API /synthesize] Unexpected error: {e}")
         import traceback
         traceback.print_exc()
         return jsonify({'error': f'Failed to synthesize speech: {str(e)}'}), 500
@@ -469,17 +526,19 @@ def convert_song():
         print(f"[API] Language: {language}")
         print(f"[API] Add effects: {add_effects}")
 
-        # Import song processor
-        from app.
+        # Import multilingual song processor
+        from app.multilingual_song_processor import MultilingualSongProcessor
 
-        processor =
+        processor = MultilingualSongProcessor(
+            models_dir=MODELS_DIR,
+            hindi_model_dir=HINDI_MODEL_DIR if language == 'hindi' else None
+        )
         result_path = processor.convert_song(
            song_path=song_path,
            voice_path=voice_filepath,
            output_path=output_path,
            language=language,
-           add_effects=add_effects
-           models_dir=MODELS_DIR
+           add_effects=add_effects
        )
 
        print(f"[API] Song conversion complete: {result_path}")
@@ -489,7 +548,8 @@
           'success': True,
           'message': 'Song converted successfully',
           'audio_url': f'/api/audio/{output_filename}',
-          'filename': output_filename
+          'filename': output_filename,
+          'language': language
       }), 200
 
    except Exception as e:
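The reworked endpoint can be smoke-tested straight from Python; a sketch against the JSON contract documented in the docstring above (the base URL is a placeholder for your deployment):

import requests

BASE_URL = "http://localhost:5000"  # placeholder; substitute your deployment

resp = requests.post(f"{BASE_URL}/api/synthesize", json={
    "text": "Hello from the multilingual backend",
    "voice_id": "voice_xxx",  # an enrolled voice id from the voices database
    "language": "english",    # or "hindi"; any other value returns 400
})
# Expect 200 with audio_url/language on success, 503 if the Hindi model is absent.
print(resp.status_code, resp.json())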
backend/requirements.txt
CHANGED
@@ -14,3 +14,4 @@ unidecode>=1.2.0
 inflect>=6.0.0
 demucs>=4.0.0
 pydub>=0.25.1
+TTS>=0.21.0
frontend/.env.production
CHANGED
@@ -1,2 +1,2 @@
 # Production deployment
-VITE_API_URL=https://voice-cloning-
+VITE_API_URL=https://aj50-voice-cloning-backend.hf.space
frontend/src/components/forms/SpeechSynthesis.tsx
CHANGED
@@ -23,6 +23,8 @@ interface Voice {
 
 interface SpeechSynthesisProps {
   voices?: Voice[];
+  language?: 'english' | 'hindi';
+  onLanguageChange?: (language: 'english' | 'hindi') => void;
   onSynthesisComplete?: (audioUrl: string) => void;
   className?: string;
 }
@@ -36,6 +38,8 @@ const sampleTexts = {
 
 export default function SpeechSynthesis({
   voices: propVoices,
+  language = 'english',
+  onLanguageChange,
   onSynthesisComplete,
   className = ""
 }: SpeechSynthesisProps) {
@@ -113,8 +117,8 @@
     setSynthesizerStartTime(Date.now()); // Record synthesis start time
 
     try {
-      // Call backend API for synthesis
-      const result = await api.synthesize(selectedVoice, inputText);
+      // Call backend API for synthesis with language support
+      const result = await api.synthesize(selectedVoice, inputText, language);
 
       // Get the audio file URL from backend with cache busting
       const audioUrl = api.getAudioUrl(result.audio_url) + `?t=${Date.now()}`;
@@ -228,6 +232,30 @@
         </Button>
       </CardHeader>
       <CardContent className="space-y-6">
+        {/* Language Selector */}
+        <div className="flex gap-2">
+          <button
+            onClick={() => onLanguageChange?.('english')}
+            className={`flex-1 py-2 px-4 rounded-lg font-medium transition-all ${
+              language === 'english'
+                ? 'bg-blue-600 text-white shadow-lg'
+                : 'bg-gray-200 text-gray-700 hover:bg-gray-300'
+            }`}
+          >
+            🇬🇧 English
+          </button>
+          <button
+            onClick={() => onLanguageChange?.('hindi')}
+            className={`flex-1 py-2 px-4 rounded-lg font-medium transition-all ${
+              language === 'hindi'
+                ? 'bg-orange-600 text-white shadow-lg'
+                : 'bg-gray-200 text-gray-700 hover:bg-gray-300'
+            }`}
+          >
+            🇮🇳 हिन्दी
+          </button>
+        </div>
+
         {/* Voice Selection */}
         <div className="space-y-2">
           <Label htmlFor="voice-select">Select Voice</Label>
frontend/src/pages/Index.tsx
CHANGED
@@ -251,6 +251,8 @@ const Index = () => {
           <TabsContent value="synthesize" className="space-y-6">
             <SpeechSynthesis
               voices={enrolledVoices.length ? enrolledVoices : undefined}
+              language={language}
+              onLanguageChange={setLanguage}
               onSynthesisComplete={handleSynthesisComplete}
             />
 
frontend/src/services/api.ts
CHANGED
@@ -41,9 +41,9 @@ export const api = {
  },
 
  /**
-  * Synthesize speech from text
+  * Synthesize speech from text (supports multilingual: english, hindi)
  */
- synthesize: async (voiceId: string, text: string) => {
+ synthesize: async (voiceId: string, text: string, language: string = 'english') => {
    const response = await fetch(api.getUrl('/synthesize'), {
      method: 'POST',
      headers: {
@@ -52,6 +52,7 @@
      body: JSON.stringify({
        voice_id: voiceId,
        text: text,
+       language: language,
      }),
    });
    if (!response.ok) {