Spaces:

AJ50
/

voice-cloning-backend

Sleeping

App Files Files Community

voice-cloning-backend / backend /app /routes.py

AJ50

Add multilingual support: English (WaveRNN) + Hindi (XTTS) [sync with pragyan]

03fe1d8 16 days ago

raw

history blame

21.5 kB

	"""Flask API Backend for Voice Cloning

	Integrates the Python voice cloning backend with the React frontend.
	Supports multilingual synthesis: English (WaveRNN) and Hindi (XTTS).
	"""

	from flask import Blueprint, request, jsonify, send_file
	from pathlib import Path
	import uuid
	import json
	from datetime import datetime
	import sys
	import os

	from .voice_cloning import synthesize

	# Blueprint carrying every route of this module; all URLs are prefixed with /api.
	bp = Blueprint('voice_cloning', __name__, url_prefix='/api')

	# Second ancestor directory of this file; every storage path hangs off it.
	BASE_DIR = Path(__file__).resolve().parents[1]

	# Storage layout
	UPLOAD_FOLDER = BASE_DIR.joinpath('enrolled_voices')   # enrolled voice samples
	OUTPUT_FOLDER = BASE_DIR.joinpath('outputs')           # generated audio files
	MODELS_DIR = BASE_DIR.joinpath('models')               # passed to the synthesis backends
	VOICES_DB = UPLOAD_FOLDER.joinpath('voices.json')      # JSON list of voice entries

	# Locate the Hindi model: the first candidate directory that exists wins.
	HINDI_MODEL_DIR = None
	possible_hindi_dirs = [
	    # Explicit override through the environment (skipped when unset or empty)
	    Path(os.environ['HINDI_MODEL_PATH']) if os.environ.get('HINDI_MODEL_PATH') else None,
	    BASE_DIR.parent / 'Apoorv_hindi_model' / 'models' / 'xtts_hindi',  # Local development
	    BASE_DIR / 'models' / 'xtts_hindi',  # Alternative location
	]
	for path in possible_hindi_dirs:
	    if path is None or not path.exists():
	        continue
	    HINDI_MODEL_DIR = path
	    print(f"✓ Hindi model found at: {HINDI_MODEL_DIR}")
	    break
	else:
	    # No candidate matched, so HINDI_MODEL_DIR stays None.
	    print("⚠ Hindi model not found. Hindi synthesis will be unavailable.")
	    print(" To enable Hindi support, set HINDI_MODEL_PATH environment variable")
	    print(" or place model at: Apoorv_hindi_model/models/xtts_hindi")

	# Make sure the storage directories exist at import time; abort the process otherwise.
	try:
	    for _required_dir in (UPLOAD_FOLDER, OUTPUT_FOLDER, VOICES_DB.parent):
	        _required_dir.mkdir(parents=True, exist_ok=True)
	except Exception as e:
	    print(f"Failed to create directories: {e}")
	    sys.exit(1)

	# Allowed audio extensions
	ALLOWED_EXTENSIONS = {'mp3', 'wav', 'm4a', 'flac', 'ogg', 'webm'}


	def allowed_file(filename):
	    """Return True when ``filename`` ends in one of ``ALLOWED_EXTENSIONS``.

	    The extension is the text after the last dot, compared case-insensitively.

	    Args:
	        filename: Client-supplied file name. ``None``, non-string values, the
	            empty string and names without a dot are rejected instead of
	            raising.

	    Returns:
	        bool: whether the extension is accepted.
	    """
	    if not filename or not isinstance(filename, str):
	        return False
	    _, dot, extension = filename.rpartition('.')
	    # `dot` is empty when the name contains no '.' at all.
	    return bool(dot) and extension.lower() in ALLOWED_EXTENSIONS

	def load_voices_db():
	    """Return the voice entries stored in ``VOICES_DB``.

	    An empty list is returned when the database file does not exist yet.
	    """
	    if not VOICES_DB.exists():
	        return []
	    with open(VOICES_DB, 'r') as db_file:
	        return json.load(db_file)

	def save_voices_db(voices):
	    """Save the voices database to ``VOICES_DB`` as indented JSON.

	    The data is first dumped to a uniquely named temporary file next to the
	    database and then moved into place with ``os.replace``. If the dump fails
	    part-way, the existing ``voices.json`` is left untouched instead of being
	    truncated.

	    Args:
	        voices: JSON-serialisable list of voice entries.

	    Raises:
	        TypeError, ValueError: if ``voices`` cannot be serialised to JSON.
	        OSError: if the temporary file cannot be written or moved.
	    """
	    tmp_path = VOICES_DB.with_name(f"{VOICES_DB.name}.{uuid.uuid4().hex}.tmp")
	    try:
	        with open(tmp_path, 'w') as f:
	            json.dump(voices, f, indent=2)
	        # Same directory, so the swap is a rename rather than a copy.
	        os.replace(tmp_path, VOICES_DB)
	    except Exception:
	        # Do not leave a stray temp file behind; the original error still propagates.
	        if tmp_path.exists():
	            tmp_path.unlink()
	        raise

	@bp.route('/health', methods=['GET'])
	def health_check():
	    """Liveness probe: always answers with a fixed 'healthy' payload."""
	    payload = {
	        'status': 'healthy',
	        'message': 'Voice Cloning API is running',
	    }
	    return jsonify(payload)

	@bp.route('/enroll', methods=['POST'])
	def enroll_voice():
	    """
	    Enroll a new voice by accepting audio file and voice name
	    Frontend sends: FormData with 'audio' (File) and 'voice_name' (string)

	    Returns:
	        201 with the new voice id, name and creation timestamp on success;
	        400 when the upload is missing, has no file name or has an
	        unsupported extension; 500 when the audio file or its metadata
	        cannot be stored.
	    """
	    try:
	        # Check if audio file is present
	        if 'audio' not in request.files:
	            return jsonify({'error': 'No audio file provided'}), 400

	        audio_file = request.files['audio']
	        # A blank or whitespace-only name gets the same default as a missing one.
	        voice_name = request.form.get('voice_name', '').strip() or 'Unnamed Voice'

	        # Covers both an empty file name and a missing (None) one.
	        if not audio_file.filename:
	            return jsonify({'error': 'No file selected'}), 400

	        if not allowed_file(audio_file.filename):
	            return jsonify({'error': 'Invalid file type. Supported: mp3, wav, m4a, flac, ogg, webm'}), 400

	        # Ensure upload folder exists
	        UPLOAD_FOLDER.mkdir(parents=True, exist_ok=True)

	        # The stored name is derived from a generated id; only the extension
	        # comes from the client-supplied file name.
	        voice_id = f"voice_{uuid.uuid4().hex[:8]}"
	        file_extension = audio_file.filename.rsplit('.', 1)[1].lower()
	        filename = f"{voice_id}.{file_extension}"
	        filepath = UPLOAD_FOLDER / filename

	        # Save the audio file with error handling
	        try:
	            audio_file.save(str(filepath))
	            print(f"✓ Audio file saved: {filepath}")
	        except Exception as file_err:
	            print(f"✗ Failed to save audio file: {file_err}")
	            return jsonify({'error': f'Failed to save audio: {str(file_err)}'}), 500

	        # Create voice entry
	        voice_entry = {
	            'id': voice_id,
	            'name': voice_name,
	            'filename': filename,
	            'createdAt': datetime.now().isoformat()
	        }

	        # Update voices database with error handling
	        try:
	            VOICES_DB.parent.mkdir(parents=True, exist_ok=True)
	            voices = load_voices_db()
	            voices.append(voice_entry)
	            save_voices_db(voices)
	            print(f"✓ Voice '{voice_name}' (ID: {voice_id}) enrolled successfully")
	        except Exception as db_err:
	            print(f"✗ Failed to update voices DB: {db_err}")
	            # Roll back the upload: without a DB entry the audio file is unreachable.
	            try:
	                if filepath.exists():
	                    filepath.unlink()
	            except OSError as cleanup_err:
	                print(f"✗ Failed to remove orphaned audio file {filepath}: {cleanup_err}")
	            return jsonify({'error': f'Failed to save voice metadata: {str(db_err)}'}), 500

	        return jsonify({
	            'success': True,
	            'message': f'Voice "{voice_name}" enrolled successfully',
	            'voice_id': voice_id,
	            'voice_name': voice_name,
	            'created_at': voice_entry['createdAt']
	        }), 201

	    except Exception as e:
	        print(f"✗ Error enrolling voice: {e}")
	        import traceback
	        traceback.print_exc()
	        return jsonify({'error': f'Failed to enroll voice: {str(e)}'}), 500

	@bp.route('/voices', methods=['GET'])
	def get_voices():
	    """
	    List every enrolled voice.

	    Only the id, name and creation timestamp of each entry are exposed;
	    the frontend uses them to populate the voice selection dropdown.
	    """
	    public_fields = ('id', 'name', 'createdAt')
	    try:
	        voices_list = [
	            {field: record[field] for field in public_fields}
	            for record in load_voices_db()
	        ]
	        return jsonify({'voices': voices_list}), 200
	    except Exception as e:
	        print(f"Error getting voices: {e}")
	        return jsonify({'error': f'Failed to get voices: {str(e)}'}), 500

	@bp.route('/synthesize', methods=['POST'])
	def synthesize_speech():
	    """
	    Synthesize speech from text using enrolled voice (multilingual support).

	    Frontend sends JSON:
	    {
	        "text": "Your text here",
	        "voice_id": "voice_xxx",
	        "language": "english" or "hindi" (optional, defaults to english)
	    }

	    Supports:
	    - English: Uses WaveRNN vocoder (existing model)
	    - Hindi: Uses XTTS model (requires hindi_model_dir)

	    Returns:
	        200 with the URL of the generated audio; 400 for a missing, malformed
	        or invalid request body; 404 when the voice or its audio file is
	        unknown; 503 when Hindi is requested but no Hindi model is
	        configured; 500 when synthesis fails.
	    """
	    try:
	        # silent=True turns an unparsable or non-JSON body into None, so the
	        # client receives the 400 below instead of the generic 500 handler.
	        data = request.get_json(silent=True)

	        if not data:
	            return jsonify({'error': 'No data provided'}), 400

	        # A JSON array or scalar has no .get(); reject it explicitly.
	        if not isinstance(data, dict):
	            return jsonify({'error': 'Request body must be a JSON object'}), 400

	        text = data.get('text', '')
	        voice_id = data.get('voice_id', '')
	        # str() keeps a non-string value (number, null) from raising on
	        # .lower(); it is then rejected by the language check below.
	        language = str(data.get('language', 'english')).lower()

	        # Non-string text (number, null, list) is treated like missing text.
	        if not isinstance(text, str) or not text.strip():
	            return jsonify({'error': 'No text provided'}), 400
	        text = text.strip()

	        if not voice_id:
	            return jsonify({'error': 'No voice selected'}), 400

	        if language not in ['english', 'hindi']:
	            return jsonify({'error': f'Unsupported language: {language}. Supported: english, hindi'}), 400

	        # Check if Hindi model is available for Hindi synthesis
	        if language == 'hindi' and not HINDI_MODEL_DIR:
	            return jsonify({
	                'error': 'Hindi synthesis unavailable. Hindi model not configured.',
	                'available_languages': ['english']
	            }), 503

	        # Find the voice in database
	        voices = load_voices_db()
	        voice = next((v for v in voices if v['id'] == voice_id), None)

	        if not voice:
	            return jsonify({'error': 'Voice not found'}), 404

	        # Reconstruct path from UPLOAD_FOLDER
	        voice_filepath = UPLOAD_FOLDER / voice['filename']

	        if not voice_filepath.exists():
	            return jsonify({'error': f'Voice file not found: {voice_filepath}'}), 404

	        # Generate unique output filename
	        output_filename = f"synthesis_{uuid.uuid4().hex[:8]}.wav"
	        output_path = OUTPUT_FOLDER / output_filename

	        print(f"\n[API /synthesize]")
	        print(f" Language: {language.upper()}")
	        print(f" Text: '{text[:50]}...'")
	        print(f" Voice: '{voice['name']}'")
	        print(f" Voice file: {voice_filepath}")
	        print(f" Output: {output_path}")
	        sys.stdout.flush()

	        try:
	            if language == 'english':
	                # Use original English synthesis (WaveRNN)
	                synthesize(
	                    voice_path=voice_filepath,
	                    text=text,
	                    models_dir=MODELS_DIR,
	                    out_path=output_path
	                )
	            else:
	                # Use multilingual TTS for Hindi
	                from app.multilingual_tts import MultilingualTTSService
	                tts_service = MultilingualTTSService(
	                    models_dir=MODELS_DIR,
	                    hindi_model_dir=HINDI_MODEL_DIR
	                )
	                try:
	                    tts_service.synthesize_and_save(
	                        text=text,
	                        voice_sample_path=voice_filepath,
	                        output_path=output_path,
	                        language=language
	                    )
	                finally:
	                    # Run cleanup() even when synthesis raises; previously it
	                    # was skipped on failure.
	                    tts_service.cleanup()

	            print(f"[API /synthesize] ✓ Synthesis completed!")
	            sys.stdout.flush()

	        except Exception as synth_error:
	            print(f"[API /synthesize] ✗ Synthesis error: {synth_error}")
	            import traceback
	            traceback.print_exc()
	            sys.stdout.flush()
	            return jsonify({'error': f'Synthesis failed: {str(synth_error)}'}), 500

	        # The backends are expected to have written the file by now.
	        if not output_path.exists():
	            error_msg = 'Synthesis failed - output not generated'
	            return jsonify({'error': error_msg}), 500

	        # Return the audio file URL
	        return jsonify({
	            'success': True,
	            'message': f'{language.capitalize()} speech synthesized successfully',
	            'audio_url': f'/api/audio/{output_filename}',
	            'language': language
	        }), 200

	    except Exception as e:
	        print(f"[API /synthesize] Unexpected error: {e}")
	        import traceback
	        traceback.print_exc()
	        return jsonify({'error': f'Failed to synthesize speech: {str(e)}'}), 500

	@bp.route('/audio/<filename>', methods=['GET'])
	def get_audio(filename):
	    """
	    Serve synthesized audio files
	    Frontend uses this URL to play/download the generated audio

	    Args:
	        filename: Name of a file located directly inside ``OUTPUT_FOLDER``.

	    Returns:
	        The file as ``audio/wav``; 404 when the name does not point at a
	        regular file inside the output folder; 500 on any other error.
	    """
	    try:
	        output_root = OUTPUT_FOLDER.resolve()
	        filepath = (output_root / filename).resolve()

	        # The name comes from the URL: refuse anything that resolves outside
	        # the output folder (e.g. '..') or that is not a regular file.
	        if filepath.parent != output_root or not filepath.is_file():
	            return jsonify({'error': 'Audio file not found'}), 404

	        return send_file(
	            str(filepath),
	            mimetype='audio/wav',
	            as_attachment=False,
	            download_name=filename
	        )
	    except Exception as e:
	        print(f"Error serving audio: {e}")
	        return jsonify({'error': f'Failed to serve audio: {str(e)}'}), 500

	@bp.route('/voices/<voice_id>', methods=['DELETE'])
	def delete_voice(voice_id):
	    """
	    Remove an enrolled voice: its audio sample and its database entry.

	    Optional endpoint the frontend can call to delete voices.
	    Responds 404 when no entry has the given id.
	    """
	    try:
	        voices = load_voices_db()

	        # First entry whose id matches, if any
	        target = None
	        for entry in voices:
	            if entry['id'] == voice_id:
	                target = entry
	                break

	        if target is None:
	            return jsonify({'error': 'Voice not found'}), 404

	        # Drop the stored sample when it is still on disk
	        sample_path = UPLOAD_FOLDER / target['filename']
	        if sample_path.exists():
	            sample_path.unlink()

	        # Persist the database without the deleted id
	        remaining = [entry for entry in voices if entry['id'] != voice_id]
	        save_voices_db(remaining)

	        response_body = {
	            'success': True,
	            'message': f'Voice "{target["name"]}" deleted successfully'
	        }
	        return jsonify(response_body), 200

	    except Exception as e:
	        print(f"Error deleting voice: {e}")
	        return jsonify({'error': f'Failed to delete voice: {str(e)}'}), 500

	@bp.route('/spectrogram/<audio_filename>', methods=['GET'])
	def get_spectrogram(audio_filename):
	    """
	    Generate and return mel-spectrogram data for visualization
	    Frontend can use this to display real-time mel-spectrogram

	    Args:
	        audio_filename: Name of a file located directly inside ``OUTPUT_FOLDER``.

	    Returns:
	        200 with ``spectrogram`` (list of time steps, each a list of values
	        in 0-255), ``n_mels`` and ``shape``; 404 when the name does not point
	        at a regular file inside the output folder; 500 on any other error.
	    """
	    try:
	        print(f"[Spectrogram] Requested file: {audio_filename}")
	        output_root = OUTPUT_FOLDER.resolve()
	        filepath = (output_root / audio_filename).resolve()
	        print(f"[Spectrogram] Full path: {filepath}")

	        # The name comes from the URL: only regular files that live directly
	        # in the output folder are eligible.
	        file_available = filepath.parent == output_root and filepath.is_file()
	        print(f"[Spectrogram] File exists: {file_available}")

	        if not file_available:
	            print(f"[Spectrogram] ERROR: File not found: {filepath}")
	            return jsonify({'error': f'Audio file {audio_filename} not found'}), 404

	        # Import librosa for mel-spectrogram generation
	        import librosa
	        import numpy as np

	        print(f"[Spectrogram] Loading audio file...")
	        # sr=None keeps the file's native sample rate
	        y, sr = librosa.load(str(filepath), sr=None)
	        print(f"[Spectrogram] Audio loaded: shape={y.shape}, sr={sr}")

	        # Generate mel-spectrogram: 80 mel bands, fixed hop of 512 samples
	        n_mels = 80
	        mel_spec = librosa.feature.melspectrogram(
	            y=y,
	            sr=sr,
	            n_mels=n_mels,
	            hop_length=512
	        )
	        print(f"[Spectrogram] Mel-spec generated: shape={mel_spec.shape}")

	        # Convert to dB scale (log scale for better visualization)
	        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

	        # Map -80 dB..0 dB onto 0..255 and clip anything outside that range
	        mel_spec_normalized = np.clip(
	            ((mel_spec_db + 80) / 80 * 255),
	            0,
	            255
	        ).astype(np.uint8)

	        # Convert to list for JSON serialization
	        # Transpose to time x frequency format for frontend
	        spectrogram_data = mel_spec_normalized.T.tolist()

	        print(f"[Spectrogram] Successfully generated spectrogram: {len(spectrogram_data)} time steps")

	        return jsonify({
	            'spectrogram': spectrogram_data,
	            'n_mels': n_mels,
	            'shape': {
	                'time_steps': len(spectrogram_data),
	                'frequency_bins': n_mels
	            }
	        }), 200

	    except Exception as e:
	        print(f"[Spectrogram] ERROR: {str(e)}")
	        import traceback
	        traceback.print_exc()
	        return jsonify({'error': f'Failed to generate spectrogram: {str(e)}'}), 500

	@bp.route('/waveform/<audio_filename>', methods=['GET'])
	def get_waveform(audio_filename):
	    """
	    Serve audio waveform as numeric array for real-time FFT visualization
	    Frontend fetches this and computes FFT using Web Audio API

	    Args:
	        audio_filename: Name of a file located directly inside ``OUTPUT_FOLDER``.

	    Returns:
	        200 with ``waveform`` (list of samples), ``sample_rate``, ``duration``
	        in seconds and ``samples``; 404 when the name does not point at a
	        regular file inside the output folder; 500 on any other error.
	    """
	    try:
	        output_root = OUTPUT_FOLDER.resolve()
	        filepath = (output_root / audio_filename).resolve()

	        # The name comes from the URL: only regular files that live directly
	        # in the output folder are eligible.
	        if filepath.parent != output_root or not filepath.is_file():
	            return jsonify({'error': 'Audio file not found'}), 404

	        import soundfile as sf
	        import numpy as np

	        # Load audio file
	        # soundfile returns (data, sample_rate)
	        y, sr = sf.read(str(filepath))

	        # Multi-channel audio is reduced to mono by averaging the channels
	        if len(y.shape) > 1:
	            y = np.mean(y, axis=1)

	        # Ensure float32 for compatibility
	        y = np.asarray(y, dtype=np.float32)

	        # Downsample if very long to reduce JSON payload
	        # Typical waveform for 60s at 22050Hz = 1.3M samples
	        # For FFT we can use 8000 Hz safely (captures up to 4 kHz)
	        target_sr = 8000
	        # Zero-length audio is skipped: np.interp rejects an empty sample grid.
	        if sr > target_sr and len(y) > 0:
	            # Calculate downsample factor
	            resample_ratio = target_sr / sr
	            new_length = int(len(y) * resample_ratio)
	            # Simple linear interpolation for downsampling
	            indices = np.linspace(0, len(y) - 1, new_length)
	            y = np.interp(indices, np.arange(len(y)), y)
	            sr = target_sr

	        # Convert to list for JSON serialization
	        waveform_data = y.tolist()

	        return jsonify({
	            'waveform': waveform_data,
	            'sample_rate': sr,
	            'duration': len(y) / sr,
	            'samples': len(y)
	        }), 200

	    except ImportError as ie:
	        err_msg = f'Soundfile library not available: {str(ie)}'
	        return jsonify({'error': err_msg}), 500
	    except Exception as e:
	        print(f"Error serving waveform: {e}")
	        import traceback
	        traceback.print_exc()
	        err_msg = f'Failed to generate waveform: {str(e)}'
	        return jsonify({'error': err_msg}), 500


	# ============================================================================
	# SONG GENERATION ENDPOINTS
	# ============================================================================

	@bp.route('/convert_song', methods=['POST'])
	def convert_song():
	    """
	    Convert a song to user's voice.

	    Form data:
	    - song: audio file (mp3, wav, etc.)
	    - voice_id: ID of enrolled voice to use
	    - language: 'english' or 'hindi' (case-insensitive, defaults to english)
	    - add_effects: 'true' or 'false' to add reverb/compression

	    Returns:
	        200 with the URL and file name of the converted song; 400 for a
	        missing or invalid form field; 404 when the voice or its audio file
	        is unknown; 503 when Hindi is requested but no Hindi model is
	        configured; 500 when conversion fails.
	    """
	    try:
	        print("\n[API] POST /api/convert_song")

	        # Validate input
	        if 'song' not in request.files:
	            return jsonify({'error': 'No song file provided'}), 400

	        if 'voice_id' not in request.form:
	            return jsonify({'error': 'No voice_id provided'}), 400

	        song_file = request.files['song']
	        voice_id = request.form.get('voice_id')
	        # Normalised the same way as /synthesize so 'Hindi' and 'hindi' match.
	        language = request.form.get('language', 'english').lower()
	        add_effects = request.form.get('add_effects', 'true').lower() == 'true'

	        if not song_file.filename or not allowed_file(song_file.filename):
	            return jsonify({'error': f'File type not allowed. Allowed: {ALLOWED_EXTENSIONS}'}), 400

	        if language not in ['english', 'hindi']:
	            return jsonify({'error': f'Unsupported language: {language}. Supported: english, hindi'}), 400

	        # Fail early instead of handing the processor a missing Hindi model.
	        if language == 'hindi' and not HINDI_MODEL_DIR:
	            return jsonify({
	                'error': 'Hindi synthesis unavailable. Hindi model not configured.',
	                'available_languages': ['english']
	            }), 503

	        # Load voices database
	        voices_db = load_voices_db()
	        voice_data = next((v for v in voices_db if v['id'] == voice_id), None)

	        if not voice_data:
	            return jsonify({'error': f'Voice {voice_id} not found'}), 404

	        # Verify the voice sample BEFORE storing the upload, so a 404 does
	        # not leave an orphaned song file in the output folder.
	        voice_filepath = UPLOAD_FOLDER / voice_data['filename']
	        if not voice_filepath.exists():
	            return jsonify({'error': 'Voice file not found'}), 404

	        # Save uploaded song
	        # NOTE(review): the upload is stored under a .wav name whatever its
	        # real format is - confirm the processor does not rely on the extension.
	        song_filename = f"song_{uuid.uuid4().hex}.wav"
	        song_path = OUTPUT_FOLDER / song_filename
	        song_file.save(song_path)
	        print(f"[API] Song saved: {song_path}")

	        # Output path
	        output_filename = f"converted_song_{uuid.uuid4().hex}.wav"
	        output_path = OUTPUT_FOLDER / output_filename

	        print(f"[API] Starting song conversion...")
	        print(f"[API] Language: {language}")
	        print(f"[API] Add effects: {add_effects}")

	        # Import multilingual song processor
	        from app.multilingual_song_processor import MultilingualSongProcessor

	        processor = MultilingualSongProcessor(
	            models_dir=MODELS_DIR,
	            hindi_model_dir=HINDI_MODEL_DIR if language == 'hindi' else None
	        )
	        result_path = processor.convert_song(
	            song_path=song_path,
	            voice_path=voice_filepath,
	            output_path=output_path,
	            language=language,
	            add_effects=add_effects
	        )

	        print(f"[API] Song conversion complete: {result_path}")

	        # Return download URL
	        return jsonify({
	            'success': True,
	            'message': 'Song converted successfully',
	            'audio_url': f'/api/audio/{output_filename}',
	            'filename': output_filename,
	            'language': language
	        }), 200

	    except Exception as e:
	        print(f"[API] ✗ Error in convert_song: {e}")
	        import traceback
	        traceback.print_exc()
	        return jsonify({'error': str(e)}), 500


	@bp.route('/separate_vocals', methods=['POST'])
	def separate_vocals():
	    """
	    Split an uploaded song into a vocal track and an instrumental track.

	    Form data:
	    - song: audio file

	    Returns: JSON with the URLs and file names of both generated tracks
	    """
	    try:
	        print("\n[API] POST /api/separate_vocals")

	        if 'song' not in request.files:
	            return jsonify({'error': 'No song file provided'}), 400

	        upload = request.files['song']

	        if not allowed_file(upload.filename):
	            return jsonify({'error': f'File type not allowed'}), 400

	        # Store the upload in the output folder under a fresh random name
	        stored_song = OUTPUT_FOLDER / f"song_{uuid.uuid4().hex}.wav"
	        upload.save(stored_song)

	        print(f"[API] Song saved: {stored_song}")
	        print(f"[API] Separating vocals...")

	        from app.song_conversion.vocal_separator import VocalSeparator

	        # Both result files are written into the output folder
	        vocals_path, instrumental_path = VocalSeparator().separate_and_save(
	            stored_song,
	            OUTPUT_FOLDER,
	            sr=16000
	        )

	        vocals_name = vocals_path.name
	        instrumental_name = instrumental_path.name
	        response_body = {
	            'success': True,
	            'vocals_url': f'/api/audio/{vocals_name}',
	            'instrumental_url': f'/api/audio/{instrumental_name}',
	            'vocals_file': vocals_name,
	            'instrumental_file': instrumental_name
	        }
	        return jsonify(response_body), 200

	    except Exception as e:
	        print(f"[API] ✗ Error in separate_vocals: {e}")
	        import traceback
	        traceback.print_exc()
	        return jsonify({'error': str(e)}), 500