AJ50's picture
Add multilingual support: English (WaveRNN) + Hindi (XTTS) [sync with pragyan]
03fe1d8
raw
history blame
21.5 kB
"""Flask API Backend for Voice Cloning
Integrates the Python voice cloning backend with the React frontend
Supports multilingual synthesis: English (WaveRNN) and Hindi (XTTS)
"""
from flask import Blueprint, request, jsonify, send_file
from pathlib import Path
import uuid
import json
from datetime import datetime
import sys
import os
from .voice_cloning import synthesize
# Blueprint carrying every route of this module; all URLs are prefixed with /api.
bp = Blueprint('voice_cloning', __name__, url_prefix='/api')

# Second ancestor of this file's resolved location (the directory above its folder).
BASE_DIR = Path(__file__).resolve().parents[1]

# Configuration: storage locations, all derived from BASE_DIR.
UPLOAD_FOLDER = BASE_DIR.joinpath('enrolled_voices')
OUTPUT_FOLDER = BASE_DIR.joinpath('outputs')
MODELS_DIR = BASE_DIR.joinpath('models')
# JSON file holding the enrolled-voice entries; lives next to the uploaded audio.
VOICES_DB = UPLOAD_FOLDER.joinpath('voices.json')
# Hindi model directory: the first candidate that exists on disk wins.
# HINDI_MODEL_DIR stays None when no candidate exists.
HINDI_MODEL_DIR = None
_hindi_env_override = os.getenv('HINDI_MODEL_PATH')
possible_hindi_dirs = [
    # Explicit override from the environment; skipped when unset or empty.
    Path(_hindi_env_override) if _hindi_env_override else None,
    BASE_DIR.parent / 'Apoorv_hindi_model' / 'models' / 'xtts_hindi',  # Local development
    BASE_DIR / 'models' / 'xtts_hindi',  # Alternative location
]
for path in possible_hindi_dirs:
    if path is None or not path.exists():
        continue
    HINDI_MODEL_DIR = path
    print(f"βœ“ Hindi model found at: {HINDI_MODEL_DIR}")
    break
else:
    # Reached only when the loop finished without finding an existing directory.
    print("⚠ Hindi model not found. Hindi synthesis will be unavailable.")
    print(" To enable Hindi support, set HINDI_MODEL_PATH environment variable")
    print(" or place model at: Apoorv_hindi_model/models/xtts_hindi")
# Create the storage directories (with any missing parents) at import time.
# Any failure is reported and terminates the process with exit status 1.
try:
    for _required_dir in (UPLOAD_FOLDER, OUTPUT_FOLDER, VOICES_DB.parent):
        _required_dir.mkdir(parents=True, exist_ok=True)
except Exception as e:
    print(f"Failed to create directories: {e}")
    sys.exit(1)
# Allowed audio extensions (compared case-insensitively, without the dot)
ALLOWED_EXTENSIONS = {'mp3', 'wav', 'm4a', 'flac', 'ogg', 'webm'}


def allowed_file(filename):
    """Return True when *filename* ends in one of ALLOWED_EXTENSIONS.

    The extension is the text after the last dot, compared case-insensitively.
    Names without a dot, empty names, and non-string values (for example the
    None an upload without a filename can carry) are rejected with False
    instead of raising.
    """
    if not isinstance(filename, str):
        return False
    _stem, dot, extension = filename.rpartition('.')
    # rpartition yields an empty separator when the name contains no dot.
    return bool(dot) and extension.lower() in ALLOWED_EXTENSIONS
def load_voices_db():
    """Return the parsed contents of VOICES_DB, or an empty list when the file is absent."""
    if not VOICES_DB.exists():
        return []
    with open(VOICES_DB, 'r') as db_file:
        raw_json = db_file.read()
    return json.loads(raw_json)
def save_voices_db(voices):
    """Overwrite VOICES_DB with *voices* serialized as 2-space-indented JSON.

    Raises whatever json.dumps raises for non-serializable data (in which case
    the existing file is left untouched) or whatever opening/writing the file
    raises.
    """
    # Serialize before opening: opening with 'w' truncates the file, so a
    # serialization error after that point would wipe the existing database.
    serialized = json.dumps(voices, indent=2)
    with open(VOICES_DB, 'w') as db_file:
        db_file.write(serialized)
@bp.route('/health', methods=['GET'])
def health_check():
    """Answer GET /health with a fixed JSON body signalling the API is up."""
    payload = {
        'status': 'healthy',
        'message': 'Voice Cloning API is running',
    }
    return jsonify(payload)
@bp.route('/enroll', methods=['POST'])
def enroll_voice():
    """
    Enroll a new voice by accepting audio file and voice name

    Frontend sends: FormData with 'audio' (File) and 'voice_name' (string).
    A missing or blank 'voice_name' is stored as 'Unnamed Voice'.

    Returns a (JSON response, status) pair:
    - 201 with 'voice_id', 'voice_name' and 'created_at' on success
    - 400 when the file is missing, unnamed, or has a disallowed extension
    - 500 when saving the audio or updating the voices database fails
    """
    try:
        # Check if audio file is present
        if 'audio' not in request.files:
            return jsonify({'error': 'No audio file provided'}), 400
        audio_file = request.files['audio']
        # Whitespace-only names collapse to '' after strip(); fall back to the
        # same default used when the field is missing.
        voice_name = request.form.get('voice_name', '').strip() or 'Unnamed Voice'
        # Covers both '' and None, so a nameless upload is a client error (400).
        if not audio_file.filename:
            return jsonify({'error': 'No file selected'}), 400
        if not allowed_file(audio_file.filename):
            return jsonify({'error': 'Invalid file type. Supported: mp3, wav, m4a, flac, ogg, webm'}), 400

        # Ensure upload folder exists
        UPLOAD_FOLDER.mkdir(parents=True, exist_ok=True)

        # Generate unique ID; the stored name uses only that ID plus the
        # lower-cased extension, never the client-supplied base name.
        voice_id = f"voice_{uuid.uuid4().hex[:8]}"
        file_extension = audio_file.filename.rsplit('.', 1)[1].lower()
        filename = f"{voice_id}.{file_extension}"
        filepath = UPLOAD_FOLDER / filename

        # Save the audio file with error handling
        try:
            audio_file.save(str(filepath))
            print(f"βœ“ Audio file saved: {filepath}")
        except Exception as file_err:
            print(f"βœ— Failed to save audio file: {file_err}")
            return jsonify({'error': f'Failed to save audio: {str(file_err)}'}), 500

        # Create voice entry
        voice_entry = {
            'id': voice_id,
            'name': voice_name,
            'filename': filename,
            'createdAt': datetime.now().isoformat()
        }

        # Update voices database with error handling
        try:
            VOICES_DB.parent.mkdir(parents=True, exist_ok=True)
            voices = load_voices_db()
            voices.append(voice_entry)
            save_voices_db(voices)
            print(f"βœ“ Voice '{voice_name}' (ID: {voice_id}) enrolled successfully")
        except Exception as db_err:
            print(f"βœ— Failed to update voices DB: {db_err}")
            # Best effort: do not leave behind an audio file that no database
            # entry points to.
            try:
                filepath.unlink()
            except OSError as cleanup_err:
                print(f"Failed to remove orphaned audio file {filepath}: {cleanup_err}")
            return jsonify({'error': f'Failed to save voice metadata: {str(db_err)}'}), 500

        return jsonify({
            'success': True,
            'message': f'Voice "{voice_name}" enrolled successfully',
            'voice_id': voice_id,
            'voice_name': voice_name,
            'created_at': voice_entry['createdAt']
        }), 201
    except Exception as e:
        print(f"βœ— Error enrolling voice: {e}")
        import traceback
        traceback.print_exc()
        return jsonify({'error': f'Failed to enroll voice: {str(e)}'}), 500
@bp.route('/voices', methods=['GET'])
def get_voices():
    """
    List every enrolled voice.

    Each entry is reduced to 'id', 'name' and 'createdAt' before it is sent.
    Responds 200 with {'voices': [...]}, or 500 with an 'error' message when
    loading or projecting the database raises.
    """
    public_fields = ('id', 'name', 'createdAt')
    try:
        # Project each stored entry onto the fields the frontend needs.
        voices_list = [
            {field: entry[field] for field in public_fields}
            for entry in load_voices_db()
        ]
        return jsonify({'voices': voices_list}), 200
    except Exception as e:
        print(f"Error getting voices: {e}")
        return jsonify({'error': f'Failed to get voices: {str(e)}'}), 500
@bp.route('/synthesize', methods=['POST'])
def synthesize_speech():
    """
    Synthesize speech from text using enrolled voice (multilingual support).

    Frontend sends JSON:
    {
        "text": "Your text here",
        "voice_id": "voice_xxx",
        "language": "english" or "hindi" (optional, defaults to english)
    }

    Supports:
    - English: Uses WaveRNN vocoder (existing model)
    - Hindi: Uses XTTS model (requires hindi_model_dir)

    Returns a (JSON response, status) pair:
    - 200 with 'audio_url' and 'language' on success
    - 400 for a missing, non-JSON or malformed payload, or an unsupported language
    - 404 when the voice or its audio file cannot be found
    - 503 when Hindi is requested but no Hindi model directory is configured
    - 500 when synthesis raises or produces no output file
    """
    try:
        # silent=True makes a non-JSON body come back as None, so the client
        # gets the 400 below instead of falling into the generic 500 handler.
        data = request.get_json(silent=True)
        if not data or not isinstance(data, dict):
            return jsonify({'error': 'No data provided'}), 400

        text = data.get('text', '')
        voice_id = data.get('voice_id', '')
        language = data.get('language', 'english')
        # Reject non-string fields up front; .strip()/.lower() below would
        # otherwise raise and surface as a 500.
        if not all(isinstance(value, str) for value in (text, voice_id, language)):
            return jsonify({'error': 'text, voice_id and language must be strings'}), 400
        text = text.strip()
        language = language.lower()

        if not text:
            return jsonify({'error': 'No text provided'}), 400
        if not voice_id:
            return jsonify({'error': 'No voice selected'}), 400
        if language not in ['english', 'hindi']:
            return jsonify({'error': f'Unsupported language: {language}. Supported: english, hindi'}), 400

        # Check if Hindi model is available for Hindi synthesis
        if language == 'hindi' and not HINDI_MODEL_DIR:
            return jsonify({
                'error': 'Hindi synthesis unavailable. Hindi model not configured.',
                'available_languages': ['english']
            }), 503

        # Find the voice in database
        voices = load_voices_db()
        voice = next((v for v in voices if v['id'] == voice_id), None)
        if not voice:
            return jsonify({'error': 'Voice not found'}), 404

        # Reconstruct path from UPLOAD_FOLDER
        voice_filepath = UPLOAD_FOLDER / voice['filename']
        if not voice_filepath.exists():
            return jsonify({'error': f'Voice file not found: {voice_filepath}'}), 404

        # Generate unique output filename
        output_filename = f"synthesis_{uuid.uuid4().hex[:8]}.wav"
        output_path = OUTPUT_FOLDER / output_filename

        print(f"\n[API /synthesize]")
        print(f" Language: {language.upper()}")
        print(f" Text: '{text[:50]}...'")
        print(f" Voice: '{voice['name']}'")
        print(f" Voice file: {voice_filepath}")
        print(f" Output: {output_path}")
        sys.stdout.flush()

        try:
            if language == 'english':
                # Use original English synthesis (WaveRNN)
                synthesize(
                    voice_path=voice_filepath,
                    text=text,
                    models_dir=MODELS_DIR,
                    out_path=output_path
                )
            else:
                # Use multilingual TTS for Hindi
                from app.multilingual_tts import MultilingualTTSService
                tts_service = MultilingualTTSService(
                    models_dir=MODELS_DIR,
                    hindi_model_dir=HINDI_MODEL_DIR
                )
                try:
                    tts_service.synthesize_and_save(
                        text=text,
                        voice_sample_path=voice_filepath,
                        output_path=output_path,
                        language=language
                    )
                finally:
                    # Release the service even when synthesis raises.
                    tts_service.cleanup()
            print(f"[API /synthesize] βœ“ Synthesis completed!")
            sys.stdout.flush()
        except Exception as synth_error:
            print(f"[API /synthesize] βœ— Synthesis error: {synth_error}")
            import traceback
            traceback.print_exc()
            sys.stdout.flush()
            return jsonify({'error': f'Synthesis failed: {str(synth_error)}'}), 500

        # The synthesis call returned without raising; make sure it really
        # produced the file the URL below will point at.
        if not output_path.exists():
            error_msg = 'Synthesis failed - output not generated'
            return jsonify({'error': error_msg}), 500

        # Return the audio file URL
        return jsonify({
            'success': True,
            'message': f'{language.capitalize()} speech synthesized successfully',
            'audio_url': f'/api/audio/{output_filename}',
            'language': language
        }), 200
    except Exception as e:
        print(f"[API /synthesize] Unexpected error: {e}")
        import traceback
        traceback.print_exc()
        return jsonify({'error': f'Failed to synthesize speech: {str(e)}'}), 500
@bp.route('/audio/<filename>', methods=['GET'])
def get_audio(filename):
    """
    Serve synthesized audio files
    Frontend uses this URL to play/download the generated audio

    Only a regular file located directly inside OUTPUT_FOLDER is served.
    Responds 404 when *filename* contains path components or does not name
    such a file, and 500 when sending the file raises.
    """
    try:
        filepath = OUTPUT_FOLDER / filename
        # The name comes straight from the URL. Reject anything that is not a
        # bare file name (e.g. backslash-separated components on Windows), and
        # require a regular file so directory entries such as '..' give 404
        # instead of failing inside send_file.
        if Path(filename).name != filename or not filepath.is_file():
            return jsonify({'error': 'Audio file not found'}), 404
        return send_file(
            str(filepath),
            mimetype='audio/wav',
            as_attachment=False,
            download_name=filename
        )
    except Exception as e:
        print(f"Error serving audio: {e}")
        return jsonify({'error': f'Failed to serve audio: {str(e)}'}), 500
@bp.route('/voices/<voice_id>', methods=['DELETE'])
def delete_voice(voice_id):
    """
    Remove an enrolled voice: its audio file (when present) and its database entry.

    Responds 200 with a confirmation message, 404 when no entry has the given
    id, and 500 when anything raises along the way.
    """
    try:
        all_voices = load_voices_db()
        target = next((entry for entry in all_voices if entry['id'] == voice_id), None)
        if not target:
            return jsonify({'error': 'Voice not found'}), 404

        # Delete the audio file
        audio_path = UPLOAD_FOLDER / target['filename']
        if audio_path.exists():
            audio_path.unlink()

        # Remove from database: keep every entry whose id differs.
        remaining = list(filter(lambda entry: entry['id'] != voice_id, all_voices))
        save_voices_db(remaining)

        confirmation = {
            'success': True,
            'message': f'Voice "{target["name"]}" deleted successfully'
        }
        return jsonify(confirmation), 200
    except Exception as e:
        print(f"Error deleting voice: {e}")
        return jsonify({'error': f'Failed to delete voice: {str(e)}'}), 500
@bp.route('/spectrogram/<audio_filename>', methods=['GET'])
def get_spectrogram(audio_filename):
    """
    Generate and return mel-spectrogram data for visualization
    Frontend can use this to display real-time mel-spectrogram

    Only a regular file located directly inside OUTPUT_FOLDER is analysed.
    Responds 200 with 'spectrogram' (a list of rows of 0-255 integers, one
    row per time step), 'n_mels' and 'shape'; 404 when *audio_filename*
    contains path components or does not name such a file; 500 on failure.
    """
    n_mels = 80  # number of mel bands, also reported back to the frontend
    try:
        print(f"[Spectrogram] Requested file: {audio_filename}")
        filepath = OUTPUT_FOLDER / audio_filename
        print(f"[Spectrogram] Full path: {filepath}")
        print(f"[Spectrogram] File exists: {filepath.exists()}")
        # The name comes straight from the URL: accept only a bare file name
        # that resolves to a regular file, never a directory or nested path.
        if Path(audio_filename).name != audio_filename or not filepath.is_file():
            print(f"[Spectrogram] ERROR: File not found: {filepath}")
            return jsonify({'error': f'Audio file {audio_filename} not found'}), 404

        # Import librosa for mel-spectrogram generation
        import librosa
        import numpy as np

        print(f"[Spectrogram] Loading audio file...")
        # Load audio file; sr=None keeps the file's own sample rate.
        y, sr = librosa.load(str(filepath), sr=None)
        print(f"[Spectrogram] Audio loaded: shape={y.shape}, sr={sr}")

        # Generate mel-spectrogram with a fixed hop of 512 samples.
        mel_spec = librosa.feature.melspectrogram(
            y=y,
            sr=sr,
            n_mels=n_mels,
            hop_length=512
        )
        print(f"[Spectrogram] Mel-spec generated: shape={mel_spec.shape}")

        # Convert to dB scale (log scale for better visualization)
        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

        # Normalize to 0-255 range for visualization: -80 dB maps to 0 and
        # 0 dB to 255; anything outside that span is clipped.
        mel_spec_normalized = np.clip(
            ((mel_spec_db + 80) / 80 * 255),
            0,
            255
        ).astype(np.uint8)

        # Convert to list for JSON serialization
        # Transpose to time x frequency format for frontend
        spectrogram_data = mel_spec_normalized.T.tolist()
        print(f"[Spectrogram] Successfully generated spectrogram: {len(spectrogram_data)} time steps")

        return jsonify({
            'spectrogram': spectrogram_data,
            'n_mels': n_mels,
            'shape': {
                'time_steps': len(spectrogram_data),
                'frequency_bins': n_mels
            }
        }), 200
    except Exception as e:
        print(f"[Spectrogram] ERROR: {str(e)}")
        import traceback
        traceback.print_exc()
        return jsonify({'error': f'Failed to generate spectrogram: {str(e)}'}), 500
@bp.route('/waveform/<audio_filename>', methods=['GET'])
def get_waveform(audio_filename):
    """
    Serve audio waveform as numeric array for real-time FFT visualization
    Frontend fetches this and computes FFT using Web Audio API

    Only a regular file located directly inside OUTPUT_FOLDER is read.
    Multi-channel audio is averaged to mono, and audio sampled above 8000 Hz
    is reduced to 8000 Hz by linear interpolation. Responds 200 with
    'waveform', 'sample_rate', 'duration' and 'samples'; 404 when
    *audio_filename* contains path components or does not name such a file;
    500 on failure.
    """
    try:
        filepath = OUTPUT_FOLDER / audio_filename
        # The name comes straight from the URL: accept only a bare file name
        # that resolves to a regular file, never a directory or nested path.
        if Path(audio_filename).name != audio_filename or not filepath.is_file():
            return jsonify({'error': 'Audio file not found'}), 404

        import soundfile as sf
        import numpy as np

        # Load audio file
        # soundfile returns (data, sample_rate)
        y, sr = sf.read(str(filepath))

        # If multi-channel, convert to mono by averaging the channels
        if len(y.shape) > 1:
            y = np.mean(y, axis=1)

        # Ensure float32 for compatibility
        y = np.asarray(y, dtype=np.float32)

        # Downsample to reduce JSON payload
        # Typical waveform for 60s at 22050Hz = 1.3M samples
        # For FFT we can use 8000 Hz safely (captures up to 4 kHz)
        target_sr = 8000
        # Skip resampling for zero-length audio: np.interp raises when the
        # array of sample points is empty.
        if sr > target_sr and len(y) > 0:
            # Calculate downsample factor
            resample_ratio = target_sr / sr
            new_length = int(len(y) * resample_ratio)
            # Simple linear interpolation for downsampling
            indices = np.linspace(0, len(y) - 1, new_length)
            y = np.interp(indices, np.arange(len(y)), y)
            sr = target_sr

        # Convert to list for JSON serialization
        waveform_data = y.tolist()

        return jsonify({
            'waveform': waveform_data,
            'sample_rate': sr,
            'duration': len(y) / sr,
            'samples': len(y)
        }), 200
    except ImportError as ie:
        err_msg = f'Soundfile library not available: {str(ie)}'
        return jsonify({'error': err_msg}), 500
    except Exception as e:
        print(f"Error serving waveform: {e}")
        import traceback
        traceback.print_exc()
        err_msg = f'Failed to generate waveform: {str(e)}'
        return jsonify({'error': err_msg}), 500
# ============================================================================
# SONG GENERATION ENDPOINTS
# ============================================================================
@bp.route('/convert_song', methods=['POST'])
def convert_song():
    """
    Convert a song to user's voice.

    Form data:
    - song: audio file (mp3, wav, etc.)
    - voice_id: ID of enrolled voice to use
    - language: 'english' or 'hindi' (case-insensitive, defaults to english)
    - add_effects: 'true' or 'false' to add reverb/compression

    Returns a (JSON response, status) pair:
    - 200 with 'audio_url', 'filename' and 'language' on success
    - 400 for a missing/disallowed song file, missing voice_id, or unsupported language
    - 404 when the voice or its audio file cannot be found
    - 503 when Hindi is requested but no Hindi model directory is configured
    - 500 when conversion raises
    """
    try:
        print("\n[API] POST /api/convert_song")

        # Validate input
        if 'song' not in request.files:
            return jsonify({'error': 'No song file provided'}), 400
        if 'voice_id' not in request.form:
            return jsonify({'error': 'No voice_id provided'}), 400

        song_file = request.files['song']
        voice_id = request.form.get('voice_id')
        # Normalize the language the same way the /synthesize endpoint does.
        language = request.form.get('language', 'english').strip().lower()
        add_effects = request.form.get('add_effects', 'true').lower() == 'true'

        # The explicit filename check keeps an upload without a name a 400
        # rather than an exception inside the extension check.
        if not song_file.filename or not allowed_file(song_file.filename):
            return jsonify({'error': f'File type not allowed. Allowed: {ALLOWED_EXTENSIONS}'}), 400
        if language not in ('english', 'hindi'):
            return jsonify({'error': f'Unsupported language: {language}. Supported: english, hindi'}), 400
        # Fail fast instead of starting a conversion that has no model to use.
        if language == 'hindi' and not HINDI_MODEL_DIR:
            return jsonify({
                'error': 'Hindi synthesis unavailable. Hindi model not configured.',
                'available_languages': ['english']
            }), 503

        # Load voices database
        voices_db = load_voices_db()
        voice_data = next((v for v in voices_db if v['id'] == voice_id), None)
        if not voice_data:
            return jsonify({'error': f'Voice {voice_id} not found'}), 404

        # Get voice file path; checked before the upload is written so a 404
        # does not leave an unused song file in OUTPUT_FOLDER.
        voice_filepath = UPLOAD_FOLDER / voice_data['filename']
        if not voice_filepath.exists():
            return jsonify({'error': 'Voice file not found'}), 404

        # Save uploaded song
        # NOTE(review): the upload is stored with a .wav suffix regardless of
        # its real format; confirm the processor detects the format from content.
        song_filename = f"song_{uuid.uuid4().hex}.wav"
        song_path = OUTPUT_FOLDER / song_filename
        song_file.save(song_path)
        print(f"[API] Song saved: {song_path}")

        # Output path
        output_filename = f"converted_song_{uuid.uuid4().hex}.wav"
        output_path = OUTPUT_FOLDER / output_filename

        print(f"[API] Starting song conversion...")
        print(f"[API] Language: {language}")
        print(f"[API] Add effects: {add_effects}")

        # Import multilingual song processor
        from app.multilingual_song_processor import MultilingualSongProcessor
        processor = MultilingualSongProcessor(
            models_dir=MODELS_DIR,
            hindi_model_dir=HINDI_MODEL_DIR if language == 'hindi' else None
        )
        result_path = processor.convert_song(
            song_path=song_path,
            voice_path=voice_filepath,
            output_path=output_path,
            language=language,
            add_effects=add_effects
        )
        print(f"[API] Song conversion complete: {result_path}")

        # Return download URL
        return jsonify({
            'success': True,
            'message': 'Song converted successfully',
            'audio_url': f'/api/audio/{output_filename}',
            'filename': output_filename,
            'language': language
        }), 200
    except Exception as e:
        print(f"[API] βœ— Error in convert_song: {e}")
        import traceback
        traceback.print_exc()
        return jsonify({'error': str(e)}), 500
@bp.route('/separate_vocals', methods=['POST'])
def separate_vocals():
    """
    Split an uploaded song into a vocals file and an instrumental file.

    Form data:
    - song: audio file

    Responds 200 with URLs and file names for both results, 400 when the
    song is missing or has a disallowed extension, and 500 when anything raises.
    """
    try:
        print("\n[API] POST /api/separate_vocals")

        uploads = request.files
        if 'song' not in uploads:
            return jsonify({'error': 'No song file provided'}), 400
        uploaded_song = uploads['song']
        if not allowed_file(uploaded_song.filename):
            return jsonify({'error': 'File type not allowed'}), 400

        # Store the upload under a fresh random name inside OUTPUT_FOLDER.
        stored_song = OUTPUT_FOLDER / f"song_{uuid.uuid4().hex}.wav"
        uploaded_song.save(stored_song)
        print(f"[API] Song saved: {stored_song}")
        print("[API] Separating vocals...")

        from app.song_conversion.vocal_separator import VocalSeparator
        stems = VocalSeparator().separate_and_save(
            stored_song,
            OUTPUT_FOLDER,
            sr=16000
        )
        vocals_file, instrumental_file = stems

        # Both results are exposed through the /api/audio/<filename> route.
        response_body = {
            'success': True,
            'vocals_url': f'/api/audio/{vocals_file.name}',
            'instrumental_url': f'/api/audio/{instrumental_file.name}',
            'vocals_file': vocals_file.name,
            'instrumental_file': instrumental_file.name
        }
        return jsonify(response_body), 200
    except Exception as e:
        print(f"[API] βœ— Error in separate_vocals: {e}")
        import traceback
        traceback.print_exc()
        return jsonify({'error': str(e)}), 500