Commit 03fe1d8 (parent: abd73a3), committed by AJ50

Add multilingual support: English (WaveRNN) + Hindi (XTTS) [sync with pragyan]

Backend Changes:
- multilingual_tts.py: Unified TTS service supporting English and Hindi
- multilingual_song_processor.py: Orchestrator for multilingual song conversion
- routes.py: Updated /api/synthesize and /api/convert_song to support language parameter
- requirements.txt: Added TTS>=0.21.0 for XTTS Hindi model support

Frontend Changes:
- SpeechSynthesis.tsx: Added language selector buttons (English/Hindi)
- SongGeneration.tsx: Already has language toggle (reuse existing)
- api.ts: Updated synthesize() to accept language parameter
- Index.tsx: Pass language state to SpeechSynthesis component

Synced with the pragyan branch's multilingual integration.
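
For reference, a minimal client-side sketch of the updated /api/synthesize contract (the host, port, and voice id below are placeholders, not part of this commit; the JSON fields match the route changes further down):

    # Sketch only: assumes the backend is reachable at localhost:5000.
    import requests

    resp = requests.post(
        "http://localhost:5000/api/synthesize",
        json={
            "text": "नमस्ते, आप कैसे हैं?",
            "voice_id": "voice_xxx",   # an enrolled voice id
            "language": "hindi",       # optional; defaults to "english"
        },
    )
    # On success the route returns:
    # {"success": true, "message": "...", "audio_url": "/api/audio/<file>.wav", "language": "hindi"}
    print(resp.json().get("audio_url"))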

backend/app/multilingual_song_processor.py ADDED
@@ -0,0 +1,167 @@
+"""Multilingual song processing - English and Hindi support."""
+
+import gc
+import torch
+import numpy as np
+from pathlib import Path
+from typing import Optional
+import sys
+
+from app.song_conversion.vocal_separator import VocalSeparator
+from app.song_conversion.audio_mixer import AudioMixer
+from app.multilingual_tts import MultilingualTTSService, Language
+
+
+class MultilingualSongProcessor:
+    """
+    Orchestrates song voice conversion for multiple languages.
+
+    - English songs: Uses WaveRNN voice cloning
+    - Hindi songs: Uses XTTS Hindi model
+    """
+
+    def __init__(self, models_dir: Path, hindi_model_dir: Optional[Path] = None):
+        """
+        Initialize multilingual song processor.
+
+        Args:
+            models_dir: Directory with English models
+            hindi_model_dir: Directory with Hindi XTTS model
+        """
+        self.models_dir = Path(models_dir)
+        self.hindi_model_dir = Path(hindi_model_dir) if hindi_model_dir else None
+        self.separator = None
+        self.tts_service = None
+        self.sr = 16000
+
+    def _ensure_separator(self) -> VocalSeparator:
+        """Lazy load vocal separator."""
+        if self.separator is None:
+            print("[MultilingualSongProcessor] Initializing vocal separator...")
+            self.separator = VocalSeparator(model_name="htdemucs")
+        return self.separator
+
+    def _ensure_tts_service(self) -> MultilingualTTSService:
+        """Lazy load TTS service."""
+        if self.tts_service is None:
+            print("[MultilingualSongProcessor] Initializing multilingual TTS service...")
+            self.tts_service = MultilingualTTSService(
+                models_dir=self.models_dir,
+                hindi_model_dir=self.hindi_model_dir
+            )
+        return self.tts_service
+
+    def _extract_lyrics_from_audio(self, audio_path: Path) -> str:
+        """
+        Extract lyrics from audio (placeholder).
+        In production, would use Whisper with language detection.
+
+        Args:
+            audio_path: Path to audio file
+
+        Returns:
+            Extracted or placeholder lyrics
+        """
+        print("[MultilingualSongProcessor] Extracting lyrics from audio...")
+
+        # Placeholder: return generic phonetically rich text
+        # In production, use: whisper_model.transcribe(str(audio_path), language='en'/'hi')
+        lyrics = "The music is playing so well with this song today"
+
+        print(f"[MultilingualSongProcessor] Using default lyrics: {lyrics}")
+        return lyrics
+
+    def convert_song(self, song_path: Path, voice_path: Path, output_path: Path,
+                     language: str = 'english', add_effects: bool = True) -> Path:
+        """
+        Convert song to user's voice (multilingual support).
+
+        Pipeline:
+        1. Separate vocals from instrumental (Demucs)
+        2. Extract lyrics (placeholder or Whisper)
+        3. Synthesize with user's voice (language-aware)
+        4. Mix synthesized vocals with instrumental
+        5. Add audio effects
+
+        Args:
+            song_path: Path to input song
+            voice_path: Path to reference voice sample
+            output_path: Path for output song
+            language: 'english' or 'hindi'
+            add_effects: Whether to add reverb/compression
+
+        Returns:
+            Path to output song
+        """
+        song_path = Path(song_path)
+        voice_path = Path(voice_path)
+        output_path = Path(output_path)
+        language = language.lower()
+
+        try:
+            print(f"\n[MultilingualSongProcessor] ========== SONG CONVERSION START ==========")
+            print(f"[MultilingualSongProcessor] Language: {language.upper()}")
+            print(f"[MultilingualSongProcessor] Song: {song_path}")
+            print(f"[MultilingualSongProcessor] Voice: {voice_path}")
+            print(f"[MultilingualSongProcessor] Output: {output_path}")
+
+            # Step 1: Separate vocals
+            print(f"\n[MultilingualSongProcessor] STEP 1: Separating vocals...")
+            separator = self._ensure_separator()
+            vocals, instrumental = separator.separate(song_path, sr=self.sr)
+
+            # Step 2: Extract/prepare lyrics
+            print(f"\n[MultilingualSongProcessor] STEP 2: Preparing lyrics...")
+            lyrics = self._extract_lyrics_from_audio(song_path)
+
+            # Step 3-4: Synthesize and mix using multilingual TTS
+            print(f"\n[MultilingualSongProcessor] STEP 3-4: Synthesizing vocals with {language.upper()} model...")
+            tts_service = self._ensure_tts_service()
+
+            try:
+                synthesized_vocal = tts_service.synthesize(lyrics, voice_path, language)
+            except Exception as e:
+                print(f"[MultilingualSongProcessor] Synthesis error: {e}")
+                raise
+
+            # Resample if needed (XTTS uses 24kHz, we need 16kHz for mixing)
+            if len(synthesized_vocal.shape) > 1:
+                synthesized_vocal = np.mean(synthesized_vocal, axis=1)
+
+            if language == Language.HINDI.value:
+                # XTTS uses 24kHz, resample to 16kHz for consistency
+                from scipy import signal
+                num_samples = int(len(synthesized_vocal) * (self.sr / 24000))
+                synthesized_vocal = signal.resample(synthesized_vocal, num_samples)
+
+            synthesized_vocal = synthesized_vocal.astype(np.float32)
+            print(f"[MultilingualSongProcessor] Synthesized vocal shape: {synthesized_vocal.shape}")
+
+            # Step 5: Mix with instrumental
+            print(f"\n[MultilingualSongProcessor] STEP 5: Mixing vocals with instrumental...")
+            final_audio = AudioMixer.mix_and_save(
+                synthesized_vocal, instrumental,
+                output_path, sr=self.sr,
+                add_effects=add_effects
+            )
+
+            # Cleanup
+            print(f"\n[MultilingualSongProcessor] Cleaning up models...")
+            try:
+                gc.collect()
+                if torch.cuda.is_available():
+                    torch.cuda.empty_cache()
+            except Exception as e:
+                print(f"[MultilingualSongProcessor] Warning during cleanup: {e}")
+
+            print(f"\n[MultilingualSongProcessor] ========== SONG CONVERSION COMPLETE ==========")
+            print(f"[MultilingualSongProcessor] Output saved to: {final_audio}")
+
+            return final_audio
+
+        except Exception as e:
+            print(f"\n[MultilingualSongProcessor] ✗ ERROR: {e}")
+            import traceback
+            traceback.print_exc()
+            sys.stdout.flush()
+            raise
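
A minimal usage sketch of the new processor (the paths below are hypothetical; the constructor and convert_song signature are as defined in the diff above):

    from pathlib import Path
    from app.multilingual_song_processor import MultilingualSongProcessor

    # Sketch only: the directory layout is an assumption.
    processor = MultilingualSongProcessor(
        models_dir=Path("models"),                  # English encoder/synthesizer/vocoder
        hindi_model_dir=Path("models/xtts_hindi"),  # XTTS directory containing config.json
    )
    result = processor.convert_song(
        song_path=Path("uploads/song.mp3"),
        voice_path=Path("uploads/voice_sample.wav"),
        output_path=Path("outputs/converted.wav"),
        language="hindi",   # Hindi vocals are resampled from 24 kHz to 16 kHz before mixing
        add_effects=True,
    )
    print(f"Converted song: {result}")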
backend/app/multilingual_tts.py ADDED
@@ -0,0 +1,234 @@
+"""Multilingual TTS Service - Supports English (WaveRNN) and Hindi (XTTS)."""
+
+import gc
+import torch
+import numpy as np
+from pathlib import Path
+from typing import Optional, Union
+from enum import Enum
+import sys
+
+
+class Language(str, Enum):
+    """Supported languages."""
+    ENGLISH = "english"
+    HINDI = "hindi"
+
+
+class MultilingualTTSService:
+    """
+    Unified TTS service supporting multiple languages.
+
+    - English: Uses existing WaveRNN vocoder + Tacotron2 synthesizer + encoder
+    - Hindi: Uses XTTS (Coqui TTS) model
+    """
+
+    def __init__(self, models_dir: Path, hindi_model_dir: Optional[Path] = None):
+        """
+        Initialize multilingual TTS service.
+
+        Args:
+            models_dir: Directory with English models (encoder.pt, synthesizer.pt, vocoder.pt)
+            hindi_model_dir: Directory with XTTS Hindi model. If None, Hindi support disabled.
+        """
+        self.models_dir = Path(models_dir)
+        self.hindi_model_dir = Path(hindi_model_dir) if hindi_model_dir else None
+
+        # Track loaded models
+        self._encoder_model = None
+        self._synthesizer_model = None
+        self._vocoder_model = None
+        self._xtts_model = None
+
+        self.sr = 16000
+
+        print("[MultilingualTTSService] Initialized")
+        print(f"[MultilingualTTSService] English models dir: {self.models_dir}")
+        if self.hindi_model_dir:
+            print(f"[MultilingualTTSService] Hindi XTTS dir: {self.hindi_model_dir}")
+        else:
+            print("[MultilingualTTSService] Hindi support: DISABLED (no model path)")
+
+    def _load_english_models(self):
+        """Load English voice cloning models (lazy load)."""
+        if self._encoder_model is None:
+            print("[MultilingualTTSService] Loading English encoder...")
+            from encoder import inference as encoder_infer
+            enc_path = self.models_dir / "default" / "encoder.pt"
+            if not enc_path.exists():
+                raise RuntimeError(f"English encoder model missing: {enc_path}")
+            encoder_infer.load_model(enc_path)
+            self._encoder_model = True
+            print("[MultilingualTTSService] ✓ English encoder loaded")
+
+        if self._synthesizer_model is None:
+            print("[MultilingualTTSService] Loading English synthesizer...")
+            from synthesizer import inference as synthesizer_infer
+            syn_path = self.models_dir / "default" / "synthesizer.pt"
+            if not syn_path.exists():
+                raise RuntimeError(f"English synthesizer model missing: {syn_path}")
+            self._synthesizer_model = synthesizer_infer.Synthesizer(syn_path)
+            print("[MultilingualTTSService] ✓ English synthesizer loaded")
+
+        if self._vocoder_model is None:
+            print("[MultilingualTTSService] Loading English vocoder...")
+            from app.vocoder import inference as vocoder_infer
+            voc_path = self.models_dir / "default" / "vocoder.pt"
+            if not voc_path.exists():
+                raise RuntimeError(f"English vocoder model missing: {voc_path}")
+            vocoder_infer.load_model(voc_path)
+            self._vocoder_model = True
+            print("[MultilingualTTSService] ✓ English vocoder loaded")
+
+    def _load_hindi_models(self):
+        """Load Hindi XTTS model (lazy load)."""
+        if not self.hindi_model_dir:
+            raise RuntimeError("Hindi model not configured. Set hindi_model_dir path.")
+
+        if self._xtts_model is None:
+            print("[MultilingualTTSService] Loading Hindi XTTS model...")
+            try:
+                from TTS.api import TTS
+            except ImportError:
+                raise ImportError(
+                    "TTS library required for Hindi support. "
+                    "Install with: pip install TTS>=0.21.0"
+                )
+
+            config_path = self.hindi_model_dir / "config.json"
+            if not config_path.exists():
+                raise RuntimeError(f"Hindi model config missing: {config_path}")
+
+            # Load XTTS model
+            self._xtts_model = TTS(
+                model_path=str(self.hindi_model_dir.resolve().as_posix()),
+                config_path=str(config_path),
+                gpu=False  # Set to True if CUDA available and needed
+            )
+            print("[MultilingualTTSService] ✓ Hindi XTTS loaded")
+
+    def synthesize(self, text: str, voice_sample_path: Union[str, Path],
+                   language: str = "english") -> np.ndarray:
+        """
+        Synthesize speech in specified language.
+
+        Args:
+            text: Text to synthesize
+            voice_sample_path: Path to reference voice sample
+            language: "english" or "hindi"
+
+        Returns:
+            Audio waveform as numpy array
+        """
+        language = language.lower()
+
+        if language == Language.ENGLISH:
+            return self._synthesize_english(text, voice_sample_path)
+        elif language == Language.HINDI:
+            return self._synthesize_hindi(text, voice_sample_path)
+        else:
+            raise ValueError(f"Unsupported language: {language}")
+
+    def _synthesize_english(self, text: str, voice_sample_path: Union[str, Path]) -> np.ndarray:
+        """Synthesize English speech using WaveRNN + Tacotron2."""
+        from encoder import inference as encoder_infer
+        from app.vocoder import inference as vocoder_infer
+
+        self._load_english_models()
+
+        print(f"[MultilingualTTSService] Synthesizing English: {text[:50]}...")
+
+        # Embed voice
+        wav = encoder_infer.preprocess_wav(voice_sample_path)
+        embed = encoder_infer.embed_utterance(wav)
+
+        # Generate mel
+        mels = self._synthesizer_model.synthesize_spectrograms([text], [embed])
+        mel = mels[0]
+
+        # Vocalize
+        try:
+            synthesized = vocoder_infer.infer_waveform(
+                mel, normalize=True, batched=False, target=8000, overlap=800
+            ).astype(np.float32)
+        except Exception as e:
+            print(f"[MultilingualTTSService] Vocoder failed: {e}, using Griffin-Lim fallback")
+            synthesized = self._synthesizer_model.griffin_lim(mel).astype(np.float32)
+
+        # Normalize
+        max_val = np.max(np.abs(synthesized))
+        if max_val > 0:
+            target_level = 0.707
+            synthesized = synthesized * (target_level / max_val)
+
+        return np.clip(synthesized, -1.0, 1.0)
+
+    def _synthesize_hindi(self, text: str, voice_sample_path: Union[str, Path]) -> np.ndarray:
+        """Synthesize Hindi speech using XTTS model."""
+        self._load_hindi_models()
+
+        print(f"[MultilingualTTSService] Synthesizing Hindi: {text[:50]}...")
+
+        # XTTS synthesize
+        audio = self._xtts_model.tts(
+            text=text,
+            speaker_wav=str(voice_sample_path),
+            language="hi"
+        )
+
+        # Convert to float32 if needed
+        audio = np.asarray(audio, dtype=np.float32)
+
+        # Normalize
+        max_val = np.max(np.abs(audio))
+        if max_val > 0:
+            target_level = 0.707
+            audio = audio * (target_level / max_val)
+
+        return np.clip(audio, -1.0, 1.0)
+
+    def synthesize_and_save(self, text: str, voice_sample_path: Union[str, Path],
+                            output_path: Union[str, Path], language: str = "english") -> Path:
+        """
+        Synthesize and save to file.
+
+        Args:
+            text: Text to synthesize
+            voice_sample_path: Path to reference voice
+            output_path: Where to save audio
+            language: "english" or "hindi"
+
+        Returns:
+            Path to output file
+        """
+        import soundfile as sf
+
+        output_path = Path(output_path)
+
+        try:
+            audio = self.synthesize(text, voice_sample_path, language)
+
+            # Determine sample rate based on language
+            sr = 24000 if language.lower() == Language.HINDI else 16000
+
+            sf.write(output_path, audio, sr)
+            print(f"[MultilingualTTSService] Audio saved: {output_path}")
+            return output_path

+        except Exception as e:
+            print(f"[MultilingualTTSService] Error during synthesis: {e}")
+            raise
+
+    def cleanup(self):
+        """Release model memory."""
+        print("[MultilingualTTSService] Cleaning up models...")
+        try:
+            self._encoder_model = None
+            self._synthesizer_model = None
+            self._vocoder_model = None
+            self._xtts_model = None
+            gc.collect()
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+        except Exception as e:
+            print(f"[MultilingualTTSService] Cleanup warning: {e}")
backend/app/routes.py CHANGED
@@ -1,6 +1,7 @@
 """
-Flask API Backend for Voice Cloning
+"""Flask API Backend for Voice Cloning
 Integrates the Python voice cloning backend with the React frontend
+Supports multilingual synthesis: English (WaveRNN) and Hindi (XTTS)
 """
 
 from flask import Blueprint, request, jsonify, send_file
@@ -9,6 +10,7 @@ import uuid
 import json
 from datetime import datetime
 import sys
+import os
 
 from .voice_cloning import synthesize
 
@@ -22,6 +24,24 @@ OUTPUT_FOLDER = BASE_DIR / 'outputs'
 MODELS_DIR = BASE_DIR / 'models'
 VOICES_DB = UPLOAD_FOLDER / 'voices.json'
 
+# Hindi model directory (check multiple possible locations)
+HINDI_MODEL_DIR = None
+possible_hindi_dirs = [
+    Path(os.getenv('HINDI_MODEL_PATH', '')) if os.getenv('HINDI_MODEL_PATH') else None,
+    BASE_DIR.parent / 'Apoorv_hindi_model' / 'models' / 'xtts_hindi',  # Local development
+    BASE_DIR / 'models' / 'xtts_hindi',  # Alternative location
+]
+for path in possible_hindi_dirs:
+    if path and path.exists():
+        HINDI_MODEL_DIR = path
+        print(f"✓ Hindi model found at: {HINDI_MODEL_DIR}")
+        break
+
+if not HINDI_MODEL_DIR:
+    print("⚠ Hindi model not found. Hindi synthesis will be unavailable.")
+    print("  To enable Hindi support, set HINDI_MODEL_PATH environment variable")
+    print("  or place model at: Apoorv_hindi_model/models/xtts_hindi")
+
 # Create directories with parents
 try:
     UPLOAD_FOLDER.mkdir(parents=True, exist_ok=True)
@@ -152,8 +172,18 @@ def get_voices():
 @bp.route('/synthesize', methods=['POST'])
 def synthesize_speech():
     """
-    Synthesize speech from text using enrolled voice
-    Frontend sends: { "text": "...", "voiceId": "voice_xxx" }
+    Synthesize speech from text using enrolled voice (multilingual support).
+
+    Frontend sends JSON:
+    {
+        "text": "Your text here",
+        "voice_id": "voice_xxx",
+        "language": "english" or "hindi" (optional, defaults to english)
+    }
+
+    Supports:
+    - English: Uses WaveRNN vocoder (existing model)
+    - Hindi: Uses XTTS model (requires hindi_model_dir)
     """
     try:
         data = request.get_json()
@@ -162,7 +192,8 @@ def synthesize_speech():
            return jsonify({'error': 'No data provided'}), 400
 
        text = data.get('text', '').strip()
-        voice_id = data.get('voice_id', '')  # Changed from 'voiceId' to 'voice_id'
+        voice_id = data.get('voice_id', '')
+        language = data.get('language', 'english').lower()
 
        if not text:
            return jsonify({'error': 'No text provided'}), 400
@@ -170,6 +201,16 @@ def synthesize_speech():
        if not voice_id:
            return jsonify({'error': 'No voice selected'}), 400
 
+        if language not in ['english', 'hindi']:
+            return jsonify({'error': f'Unsupported language: {language}. Supported: english, hindi'}), 400
+
+        # Check if Hindi model is available for Hindi synthesis
+        if language == 'hindi' and not HINDI_MODEL_DIR:
+            return jsonify({
+                'error': 'Hindi synthesis unavailable. Hindi model not configured.',
+                'available_languages': ['english']
+            }), 503
+
        # Find the voice in database
        voices = load_voices_db()
        voice = next((v for v in voices if v['id'] == voice_id), None)
@@ -177,7 +218,7 @@ def synthesize_speech():
        if not voice:
            return jsonify({'error': 'Voice not found'}), 404
 
-        # Reconstruct path from UPLOAD_FOLDER (server-agnostic)
+        # Reconstruct path from UPLOAD_FOLDER
        voice_filepath = UPLOAD_FOLDER / voice['filename']
 
        if not voice_filepath.exists():
@@ -187,28 +228,43 @@ def synthesize_speech():
        output_filename = f"synthesis_{uuid.uuid4().hex[:8]}.wav"
        output_path = OUTPUT_FOLDER / output_filename
 
-        # Call the voice cloning synthesis function
-        print(f"Synthesizing: '{text}' with voice '{voice['name']}'")
-        print(f"Voice file: {voice_filepath}")
-        print(f"Output path: {output_path}")
-        print(f"Models dir: {MODELS_DIR}")
-        print("Starting synthesis... This may take 30-60 seconds...")
+        print(f"\n[API /synthesize]")
+        print(f" Language: {language.upper()}")
+        print(f" Text: '{text[:50]}...'")
+        print(f" Voice: '{voice['name']}'")
+        print(f" Voice file: {voice_filepath}")
+        print(f" Output: {output_path}")
+        sys.stdout.flush()
 
        try:
-            # Flush output to see logs immediately
-            sys.stdout.flush()
-
-            synthesize(
-                voice_path=voice_filepath,
-                text=text,
-                models_dir=MODELS_DIR,
-                out_path=output_path
-            )
+            if language == 'english':
+                # Use original English synthesis (WaveRNN)
+                synthesize(
+                    voice_path=voice_filepath,
+                    text=text,
+                    models_dir=MODELS_DIR,
+                    out_path=output_path
+                )
+            else:
+                # Use multilingual TTS for Hindi
+                from app.multilingual_tts import MultilingualTTSService
+                tts_service = MultilingualTTSService(
+                    models_dir=MODELS_DIR,
+                    hindi_model_dir=HINDI_MODEL_DIR
+                )
+                tts_service.synthesize_and_save(
+                    text=text,
+                    voice_sample_path=voice_filepath,
+                    output_path=output_path,
+                    language=language
+                )
+                tts_service.cleanup()
 
-            print(f"Synthesis completed! Output saved to: {output_path}")
+            print(f"[API /synthesize] ✓ Synthesis completed!")
            sys.stdout.flush()
+
        except Exception as synth_error:
-            print(f"Synthesis error: {synth_error}")
+            print(f"[API /synthesize] ✗ Synthesis error: {synth_error}")
            import traceback
            traceback.print_exc()
            sys.stdout.flush()
@@ -221,12 +277,13 @@ def synthesize_speech():
        # Return the audio file URL
        return jsonify({
            'success': True,
-            'message': 'Speech synthesized successfully',
-            'audio_url': f'/api/audio/{output_filename}'
+            'message': f'{language.capitalize()} speech synthesized successfully',
+            'audio_url': f'/api/audio/{output_filename}',
+            'language': language
        }), 200
 
    except Exception as e:
-        print(f"Error synthesizing speech: {e}")
+        print(f"[API /synthesize] Unexpected error: {e}")
        import traceback
        traceback.print_exc()
        return jsonify({'error': f'Failed to synthesize speech: {str(e)}'}), 500
@@ -469,17 +526,19 @@ def convert_song():
        print(f"[API] Language: {language}")
        print(f"[API] Add effects: {add_effects}")
 
-        # Import song processor
-        from app.song_conversion.song_processor import SongProcessor
+        # Import multilingual song processor
+        from app.multilingual_song_processor import MultilingualSongProcessor
 
-        processor = SongProcessor(MODELS_DIR)
+        processor = MultilingualSongProcessor(
+            models_dir=MODELS_DIR,
+            hindi_model_dir=HINDI_MODEL_DIR if language == 'hindi' else None
+        )
        result_path = processor.convert_song(
            song_path=song_path,
            voice_path=voice_filepath,
            output_path=output_path,
            language=language,
-            add_effects=add_effects,
-            models_dir=MODELS_DIR
+            add_effects=add_effects
        )
 
        print(f"[API] Song conversion complete: {result_path}")
@@ -489,7 +548,8 @@
            'success': True,
            'message': 'Song converted successfully',
            'audio_url': f'/api/audio/{output_filename}',
-            'filename': output_filename
+            'filename': output_filename,
+            'language': language
        }), 200
 
    except Exception as e:
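
Because HINDI_MODEL_DIR is resolved once at import time, the environment variable has to be set before routes.py is imported; a sketch (the path is hypothetical):

    # Sketch only: must run before importing the routes module / starting the app.
    import os
    os.environ["HINDI_MODEL_PATH"] = "/opt/models/xtts_hindi"  # directory containing config.json
    # Without a discovered model, POST /api/synthesize with {"language": "hindi"}
    # returns 503 with available_languages: ["english"].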
backend/requirements.txt CHANGED
@@ -14,3 +14,4 @@ unidecode>=1.2.0
 inflect>=6.0.0
 demucs>=4.0.0
 pydub>=0.25.1
+TTS>=0.21.0
frontend/.env.production CHANGED
@@ -1,2 +1,2 @@
 # Production deployment
-VITE_API_URL=https://voice-cloning-personalized-speech.onrender.com
+VITE_API_URL=https://aj50-voice-cloning-backend.hf.space
frontend/src/components/forms/SpeechSynthesis.tsx CHANGED
@@ -23,6 +23,8 @@ interface Voice {
 
 interface SpeechSynthesisProps {
   voices?: Voice[];
+  language?: 'english' | 'hindi';
+  onLanguageChange?: (language: 'english' | 'hindi') => void;
   onSynthesisComplete?: (audioUrl: string) => void;
   className?: string;
 }
@@ -36,6 +38,8 @@ const sampleTexts = {
 
 export default function SpeechSynthesis({
   voices: propVoices,
+  language = 'english',
+  onLanguageChange,
   onSynthesisComplete,
   className = ""
 }: SpeechSynthesisProps) {
@@ -113,8 +117,8 @@ export default function SpeechSynthesis({
     setSynthesizerStartTime(Date.now()); // Record synthesis start time
 
     try {
-      // Call backend API for synthesis
-      const result = await api.synthesize(selectedVoice, inputText);
+      // Call backend API for synthesis with language support
+      const result = await api.synthesize(selectedVoice, inputText, language);
 
       // Get the audio file URL from backend with cache busting
       const audioUrl = api.getAudioUrl(result.audio_url) + `?t=${Date.now()}`;
@@ -228,6 +232,30 @@
         </Button>
       </CardHeader>
       <CardContent className="space-y-6">
+        {/* Language Selector */}
+        <div className="flex gap-2">
+          <button
+            onClick={() => onLanguageChange?.('english')}
+            className={`flex-1 py-2 px-4 rounded-lg font-medium transition-all ${
+              language === 'english'
+                ? 'bg-blue-600 text-white shadow-lg'
+                : 'bg-gray-200 text-gray-700 hover:bg-gray-300'
+            }`}
+          >
+            🇬🇧 English
+          </button>
+          <button
+            onClick={() => onLanguageChange?.('hindi')}
+            className={`flex-1 py-2 px-4 rounded-lg font-medium transition-all ${
+              language === 'hindi'
+                ? 'bg-orange-600 text-white shadow-lg'
+                : 'bg-gray-200 text-gray-700 hover:bg-gray-300'
+            }`}
+          >
+            🇮🇳 हिन्दी
+          </button>
+        </div>
+
         {/* Voice Selection */}
         <div className="space-y-2">
           <Label htmlFor="voice-select">Select Voice</Label>
frontend/src/pages/Index.tsx CHANGED
@@ -251,6 +251,8 @@ const Index = () => {
           <TabsContent value="synthesize" className="space-y-6">
             <SpeechSynthesis
               voices={enrolledVoices.length ? enrolledVoices : undefined}
+              language={language}
+              onLanguageChange={setLanguage}
               onSynthesisComplete={handleSynthesisComplete}
             />
 
frontend/src/services/api.ts CHANGED
@@ -41,9 +41,9 @@ export const api = {
   },
 
   /**
-   * Synthesize speech from text
+   * Synthesize speech from text (supports multilingual: english, hindi)
    */
-  synthesize: async (voiceId: string, text: string) => {
+  synthesize: async (voiceId: string, text: string, language: string = 'english') => {
     const response = await fetch(api.getUrl('/synthesize'), {
       method: 'POST',
       headers: {
@@ -52,6 +52,7 @@ export const api = {
       body: JSON.stringify({
         voice_id: voiceId,
         text: text,
+        language: language,
       }),
     });
     if (!response.ok) {