""" """Flask API Backend for Voice Cloning Integrates the Python voice cloning backend with the React frontend Supports multilingual synthesis: English (WaveRNN) and Hindi (XTTS) """ from flask import Blueprint, request, jsonify, send_file from pathlib import Path import uuid import json from datetime import datetime import sys import os from .voice_cloning import synthesize bp = Blueprint('voice_cloning', __name__, url_prefix='/api') BASE_DIR = Path(__file__).resolve().parents[1] # Configuration UPLOAD_FOLDER = BASE_DIR / 'enrolled_voices' OUTPUT_FOLDER = BASE_DIR / 'outputs' MODELS_DIR = BASE_DIR / 'models' VOICES_DB = UPLOAD_FOLDER / 'voices.json' # Hindi model directory (check multiple possible locations) HINDI_MODEL_DIR = None possible_hindi_dirs = [ Path(os.getenv('HINDI_MODEL_PATH', '')) if os.getenv('HINDI_MODEL_PATH') else None, BASE_DIR.parent / 'Apoorv_hindi_model' / 'models' / 'xtts_hindi', # Local development BASE_DIR / 'models' / 'xtts_hindi', # Alternative location ] for path in possible_hindi_dirs: if path and path.exists(): HINDI_MODEL_DIR = path print(f"✓ Hindi model found at: {HINDI_MODEL_DIR}") break if not HINDI_MODEL_DIR: print("⚠ Hindi model not found. Hindi synthesis will be unavailable.") print(" To enable Hindi support, set HINDI_MODEL_PATH environment variable") print(" or place model at: Apoorv_hindi_model/models/xtts_hindi") # Create directories with parents try: UPLOAD_FOLDER.mkdir(parents=True, exist_ok=True) OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True) VOICES_DB.parent.mkdir(parents=True, exist_ok=True) except Exception as e: print(f"Failed to create directories: {e}") sys.exit(1) # Allowed audio extensions ALLOWED_EXTENSIONS = {'mp3', 'wav', 'm4a', 'flac', 'ogg', 'webm'} def allowed_file(filename): return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS def load_voices_db(): """Load the voices database""" if VOICES_DB.exists(): with open(VOICES_DB, 'r') as f: return json.load(f) return [] def save_voices_db(voices): """Save the voices database""" with open(VOICES_DB, 'w') as f: json.dump(voices, f, indent=2) @bp.route('/health', methods=['GET']) def health_check(): """Health check endpoint""" return jsonify({ 'status': 'healthy', 'message': 'Voice Cloning API is running' }) @bp.route('/enroll', methods=['POST']) def enroll_voice(): """ Enroll a new voice by accepting audio file and voice name Frontend sends: FormData with 'audio' (File) and 'voice_name' (string) """ try: # Check if audio file is present if 'audio' not in request.files: return jsonify({'error': 'No audio file provided'}), 400 audio_file = request.files['audio'] voice_name = request.form.get('voice_name', 'Unnamed Voice').strip() if audio_file.filename == '': return jsonify({'error': 'No file selected'}), 400 if not allowed_file(audio_file.filename): return jsonify({'error': 'Invalid file type. 
@bp.route('/health', methods=['GET'])
def health_check():
    """Health check endpoint"""
    return jsonify({
        'status': 'healthy',
        'message': 'Voice Cloning API is running'
    })


@bp.route('/enroll', methods=['POST'])
def enroll_voice():
    """
    Enroll a new voice by accepting an audio file and a voice name.
    Frontend sends: FormData with 'audio' (File) and 'voice_name' (string).
    """
    try:
        # Check if audio file is present
        if 'audio' not in request.files:
            return jsonify({'error': 'No audio file provided'}), 400

        audio_file = request.files['audio']
        voice_name = request.form.get('voice_name', 'Unnamed Voice').strip()

        if audio_file.filename == '':
            return jsonify({'error': 'No file selected'}), 400

        if not allowed_file(audio_file.filename):
            return jsonify({'error': 'Invalid file type. Supported: mp3, wav, m4a, flac, ogg, webm'}), 400

        # Ensure upload folder exists
        UPLOAD_FOLDER.mkdir(parents=True, exist_ok=True)

        # Generate unique ID and secure filename
        voice_id = f"voice_{uuid.uuid4().hex[:8]}"
        file_extension = audio_file.filename.rsplit('.', 1)[1].lower()
        filename = f"{voice_id}.{file_extension}"
        filepath = UPLOAD_FOLDER / filename

        # Save the audio file with error handling
        try:
            audio_file.save(str(filepath))
            print(f"✓ Audio file saved: {filepath}")
        except Exception as file_err:
            print(f"✗ Failed to save audio file: {file_err}")
            return jsonify({'error': f'Failed to save audio: {str(file_err)}'}), 500

        # Create voice entry
        voice_entry = {
            'id': voice_id,
            'name': voice_name,
            'filename': filename,
            'createdAt': datetime.now().isoformat()
        }

        # Update voices database with error handling
        try:
            VOICES_DB.parent.mkdir(parents=True, exist_ok=True)
            voices = load_voices_db()
            voices.append(voice_entry)
            save_voices_db(voices)
            print(f"✓ Voice '{voice_name}' (ID: {voice_id}) enrolled successfully")
        except Exception as db_err:
            print(f"✗ Failed to update voices DB: {db_err}")
            return jsonify({'error': f'Failed to save voice metadata: {str(db_err)}'}), 500

        return jsonify({
            'success': True,
            'message': f'Voice "{voice_name}" enrolled successfully',
            'voice_id': voice_id,
            'voice_name': voice_name,
            'created_at': voice_entry['createdAt']
        }), 201

    except Exception as e:
        print(f"✗ Error enrolling voice: {e}")
        import traceback
        traceback.print_exc()
        return jsonify({'error': f'Failed to enroll voice: {str(e)}'}), 500


@bp.route('/voices', methods=['GET'])
def get_voices():
    """
    Get the list of all enrolled voices.
    Frontend uses this to populate the voice selection dropdown.
    """
    try:
        voices = load_voices_db()

        # Return only the info the frontend needs
        voices_list = [
            {
                'id': v['id'],
                'name': v['name'],
                'createdAt': v['createdAt']
            }
            for v in voices
        ]

        return jsonify({'voices': voices_list}), 200

    except Exception as e:
        print(f"Error getting voices: {e}")
        return jsonify({'error': f'Failed to get voices: {str(e)}'}), 500

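# Example client call for the synthesis endpoint below (illustrative; the
# host and port depend on how the Flask app is served):
#   curl -X POST -H "Content-Type: application/json" \
#        -d '{"text": "Hello there", "voice_id": "voice_1a2b3c4d", "language": "english"}' \
#        http://localhost:5000/api/synthesize
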
@bp.route('/synthesize', methods=['POST'])
def synthesize_speech():
    """
    Synthesize speech from text using an enrolled voice (multilingual support).

    Frontend sends JSON:
    {
        "text": "Your text here",
        "voice_id": "voice_xxx",
        "language": "english" or "hindi" (optional, defaults to english)
    }

    Supports:
    - English: uses the WaveRNN vocoder (existing model)
    - Hindi: uses the XTTS model (requires hindi_model_dir)
    """
    try:
        data = request.get_json()

        if not data:
            return jsonify({'error': 'No data provided'}), 400

        text = data.get('text', '').strip()
        voice_id = data.get('voice_id', '')
        language = data.get('language', 'english').lower()

        if not text:
            return jsonify({'error': 'No text provided'}), 400

        if not voice_id:
            return jsonify({'error': 'No voice selected'}), 400

        if language not in ['english', 'hindi']:
            return jsonify({'error': f'Unsupported language: {language}. Supported: english, hindi'}), 400

        # Check if the Hindi model is available for Hindi synthesis
        if language == 'hindi' and not HINDI_MODEL_DIR:
            return jsonify({
                'error': 'Hindi synthesis unavailable. Hindi model not configured.',
                'available_languages': ['english']
            }), 503

        # Find the voice in the database
        voices = load_voices_db()
        voice = next((v for v in voices if v['id'] == voice_id), None)

        if not voice:
            return jsonify({'error': 'Voice not found'}), 404

        # Reconstruct the path from UPLOAD_FOLDER
        voice_filepath = UPLOAD_FOLDER / voice['filename']
        if not voice_filepath.exists():
            return jsonify({'error': f'Voice file not found: {voice_filepath}'}), 404

        # Generate a unique output filename
        output_filename = f"synthesis_{uuid.uuid4().hex[:8]}.wav"
        output_path = OUTPUT_FOLDER / output_filename

        print("\n[API /synthesize]")
        print(f"  Language: {language.upper()}")
        print(f"  Text: '{text[:50]}...'")
        print(f"  Voice: '{voice['name']}'")
        print(f"  Voice file: {voice_filepath}")
        print(f"  Output: {output_path}")
        sys.stdout.flush()

        try:
            if language == 'english':
                # Use the original English synthesis (WaveRNN)
                synthesize(
                    voice_path=voice_filepath,
                    text=text,
                    models_dir=MODELS_DIR,
                    out_path=output_path
                )
            else:
                # Use the multilingual TTS service for Hindi
                from app.multilingual_tts import MultilingualTTSService
                tts_service = MultilingualTTSService(
                    models_dir=MODELS_DIR,
                    hindi_model_dir=HINDI_MODEL_DIR
                )
                tts_service.synthesize_and_save(
                    text=text,
                    voice_sample_path=voice_filepath,
                    output_path=output_path,
                    language=language
                )
                tts_service.cleanup()

            print("[API /synthesize] ✓ Synthesis completed!")
            sys.stdout.flush()

        except Exception as synth_error:
            print(f"[API /synthesize] ✗ Synthesis error: {synth_error}")
            import traceback
            traceback.print_exc()
            sys.stdout.flush()
            return jsonify({'error': f'Synthesis failed: {str(synth_error)}'}), 500

        if not output_path.exists():
            error_msg = 'Synthesis failed - output not generated'
            return jsonify({'error': error_msg}), 500

        # Return the audio file URL
        return jsonify({
            'success': True,
            'message': f'{language.capitalize()} speech synthesized successfully',
            'audio_url': f'/api/audio/{output_filename}',
            'language': language
        }), 200

    except Exception as e:
        print(f"[API /synthesize] Unexpected error: {e}")
        import traceback
        traceback.print_exc()
        return jsonify({'error': f'Failed to synthesize speech: {str(e)}'}), 500


@bp.route('/audio/<filename>', methods=['GET'])
def get_audio(filename):
    """
    Serve synthesized audio files.
    Frontend uses this URL to play/download the generated audio.
    """
    try:
        filepath = OUTPUT_FOLDER / filename

        if not filepath.exists():
            return jsonify({'error': 'Audio file not found'}), 404

        return send_file(
            str(filepath),
            mimetype='audio/wav',
            as_attachment=False,
            download_name=filename
        )

    except Exception as e:
        print(f"Error serving audio: {e}")
        return jsonify({'error': f'Failed to serve audio: {str(e)}'}), 500


@bp.route('/voices/<voice_id>', methods=['DELETE'])
def delete_voice(voice_id):
    """
    Delete an enrolled voice.
    Optional: frontend can call this to remove voices.
    """
    try:
        voices = load_voices_db()
        voice = next((v for v in voices if v['id'] == voice_id), None)

        if not voice:
            return jsonify({'error': 'Voice not found'}), 404

        # Delete the audio file
        voice_filepath = UPLOAD_FOLDER / voice['filename']
        if voice_filepath.exists():
            voice_filepath.unlink()

        # Remove from the database
        voices = [v for v in voices if v['id'] != voice_id]
        save_voices_db(voices)

        return jsonify({
            'success': True,
            'message': f'Voice "{voice["name"]}" deleted successfully'
        }), 200

    except Exception as e:
        print(f"Error deleting voice: {e}")
        return jsonify({'error': f'Failed to delete voice: {str(e)}'}), 500

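# The /spectrogram endpoint below returns JSON shaped roughly like this
# (illustrative; actual values depend on the audio file):
#   {
#     "spectrogram": [[...80 uint8 values...], ...],  # one row per time step
#     "n_mels": 80,
#     "shape": {"time_steps": <int>, "frequency_bins": 80}
#   }
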
@bp.route('/spectrogram/<audio_filename>', methods=['GET'])
def get_spectrogram(audio_filename):
    """
    Generate and return mel-spectrogram data for visualization.
    Frontend can use this to display a real-time mel-spectrogram.
    """
    try:
        print(f"[Spectrogram] Requested file: {audio_filename}")
        filepath = OUTPUT_FOLDER / audio_filename
        print(f"[Spectrogram] Full path: {filepath}")
        print(f"[Spectrogram] File exists: {filepath.exists()}")

        if not filepath.exists():
            print(f"[Spectrogram] ERROR: File not found: {filepath}")
            return jsonify({'error': f'Audio file {audio_filename} not found'}), 404

        # Import librosa for mel-spectrogram generation
        import librosa
        import numpy as np

        print("[Spectrogram] Loading audio file...")

        # Load audio file
        y, sr = librosa.load(str(filepath), sr=None)
        print(f"[Spectrogram] Audio loaded: shape={y.shape}, sr={sr}")

        # Generate mel-spectrogram
        # 80 mel bands (common for Tacotron2), hop_length varies with sample rate
        mel_spec = librosa.feature.melspectrogram(
            y=y,
            sr=sr,
            n_mels=80,
            hop_length=512
        )
        print(f"[Spectrogram] Mel-spec generated: shape={mel_spec.shape}")

        # Convert to dB scale (log scale for better visualization)
        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

        # Normalize to 0-255 range for visualization
        mel_spec_normalized = np.clip(
            ((mel_spec_db + 80) / 80 * 255),
            0, 255
        ).astype(np.uint8)

        # Convert to list for JSON serialization
        # Transpose to time x frequency format for the frontend
        spectrogram_data = mel_spec_normalized.T.tolist()

        print(f"[Spectrogram] Successfully generated spectrogram: {len(spectrogram_data)} time steps")

        return jsonify({
            'spectrogram': spectrogram_data,
            'n_mels': 80,
            'shape': {
                'time_steps': len(spectrogram_data),
                'frequency_bins': 80
            }
        }), 200

    except Exception as e:
        print(f"[Spectrogram] ERROR: {str(e)}")
        import traceback
        traceback.print_exc()
        return jsonify({'error': f'Failed to generate spectrogram: {str(e)}'}), 500


@bp.route('/waveform/<audio_filename>', methods=['GET'])
def get_waveform(audio_filename):
    """
    Serve the audio waveform as a numeric array for real-time FFT visualization.
    Frontend fetches this and computes the FFT using the Web Audio API.
    """
    try:
        filepath = OUTPUT_FOLDER / audio_filename

        if not filepath.exists():
            return jsonify({'error': 'Audio file not found'}), 404

        import soundfile as sf
        import numpy as np

        # Load audio file
        # soundfile returns (data, sample_rate)
        y, sr = sf.read(str(filepath))

        # If stereo, convert to mono by averaging the channels
        if len(y.shape) > 1:
            y = np.mean(y, axis=1)

        # Ensure float32 for compatibility
        y = np.asarray(y, dtype=np.float32)

        # Downsample if very long to reduce the JSON payload
        # Typical waveform for 60 s at 22050 Hz = 1.3M samples
        # For FFT we can use 8000 Hz safely (captures up to 4 kHz)
        target_sr = 8000
        if sr > target_sr:
            # Calculate downsample factor
            resample_ratio = target_sr / sr
            new_length = int(len(y) * resample_ratio)
            # Simple linear interpolation for downsampling
            indices = np.linspace(0, len(y) - 1, new_length)
            y = np.interp(indices, np.arange(len(y)), y)
            sr = target_sr

        # Convert to list for JSON serialization
        waveform_data = y.tolist()

        return jsonify({
            'waveform': waveform_data,
            'sample_rate': sr,
            'duration': len(y) / sr,
            'samples': len(y)
        }), 200

    except ImportError as ie:
        err_msg = f'Soundfile library not available: {str(ie)}'
        return jsonify({'error': err_msg}), 500
    except Exception as e:
        print(f"Error serving waveform: {e}")
        import traceback
        traceback.print_exc()
        err_msg = f'Failed to generate waveform: {str(e)}'
        return jsonify({'error': err_msg}), 500


# ============================================================================
# SONG GENERATION ENDPOINTS
# ============================================================================
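# Example client call for the conversion endpoint below (illustrative; the
# host and port depend on how the Flask app is served):
#   curl -F "song=@my_song.mp3" -F "voice_id=voice_1a2b3c4d" \
#        -F "language=english" -F "add_effects=true" \
#        http://localhost:5000/api/convert_song
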
@bp.route('/convert_song', methods=['POST'])
def convert_song():
    """
    Convert a song to the user's voice.

    Form data:
    - song: audio file (mp3, wav, etc.)
    - voice_id: ID of the enrolled voice to use
    - language: 'english' or 'hindi'
    - add_effects: 'true' or 'false' to add reverb/compression

    Returns: generated song audio file
    """
    try:
        print("\n[API] POST /api/convert_song")

        # Validate input
        if 'song' not in request.files:
            return jsonify({'error': 'No song file provided'}), 400

        if 'voice_id' not in request.form:
            return jsonify({'error': 'No voice_id provided'}), 400

        song_file = request.files['song']
        voice_id = request.form.get('voice_id')
        language = request.form.get('language', 'english')
        add_effects = request.form.get('add_effects', 'true').lower() == 'true'

        if not allowed_file(song_file.filename):
            return jsonify({'error': f'File type not allowed. Allowed: {ALLOWED_EXTENSIONS}'}), 400

        # Load voices database
        voices_db = load_voices_db()
        voice_data = next((v for v in voices_db if v['id'] == voice_id), None)

        if not voice_data:
            return jsonify({'error': f'Voice {voice_id} not found'}), 404

        # Save the uploaded song
        song_filename = f"song_{uuid.uuid4().hex}.wav"
        song_path = OUTPUT_FOLDER / song_filename
        song_file.save(song_path)
        print(f"[API] Song saved: {song_path}")

        # Get voice file path
        voice_filepath = UPLOAD_FOLDER / voice_data['filename']
        if not voice_filepath.exists():
            return jsonify({'error': 'Voice file not found'}), 404

        # Output path
        output_filename = f"converted_song_{uuid.uuid4().hex}.wav"
        output_path = OUTPUT_FOLDER / output_filename

        print("[API] Starting song conversion...")
        print(f"[API] Language: {language}")
        print(f"[API] Add effects: {add_effects}")

        # Import the multilingual song processor
        from app.multilingual_song_processor import MultilingualSongProcessor

        processor = MultilingualSongProcessor(
            models_dir=MODELS_DIR,
            hindi_model_dir=HINDI_MODEL_DIR if language == 'hindi' else None
        )

        result_path = processor.convert_song(
            song_path=song_path,
            voice_path=voice_filepath,
            output_path=output_path,
            language=language,
            add_effects=add_effects
        )

        print(f"[API] Song conversion complete: {result_path}")

        # Return the download URL
        return jsonify({
            'success': True,
            'message': 'Song converted successfully',
            'audio_url': f'/api/audio/{output_filename}',
            'filename': output_filename,
            'language': language
        }), 200

    except Exception as e:
        print(f"[API] ✗ Error in convert_song: {e}")
        import traceback
        traceback.print_exc()
        return jsonify({'error': str(e)}), 500

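# Example client call for the separation endpoint below (illustrative; the
# host and port depend on how the Flask app is served):
#   curl -F "song=@my_song.mp3" http://localhost:5000/api/separate_vocals
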
@bp.route('/separate_vocals', methods=['POST'])
def separate_vocals():
    """
    Separate vocals from a song file.

    Form data:
    - song: audio file

    Returns: JSON with vocal and instrumental file URLs
    """
    try:
        print("\n[API] POST /api/separate_vocals")

        if 'song' not in request.files:
            return jsonify({'error': 'No song file provided'}), 400

        song_file = request.files['song']

        if not allowed_file(song_file.filename):
            return jsonify({'error': 'File type not allowed'}), 400

        # Save the uploaded song
        song_filename = f"song_{uuid.uuid4().hex}.wav"
        song_path = OUTPUT_FOLDER / song_filename
        song_file.save(song_path)
        print(f"[API] Song saved: {song_path}")

        print("[API] Separating vocals...")

        from app.song_conversion.vocal_separator import VocalSeparator

        separator = VocalSeparator()
        vocals_path, instrumental_path = separator.separate_and_save(
            song_path,
            OUTPUT_FOLDER,
            sr=16000
        )

        return jsonify({
            'success': True,
            'vocals_url': f'/api/audio/{vocals_path.name}',
            'instrumental_url': f'/api/audio/{instrumental_path.name}',
            'vocals_file': vocals_path.name,
            'instrumental_file': instrumental_path.name
        }), 200

    except Exception as e:
        print(f"[API] ✗ Error in separate_vocals: {e}")
        import traceback
        traceback.print_exc()
        return jsonify({'error': str(e)}), 500
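
# Minimal sketch of wiring this blueprint into an application (illustrative;
# the actual app factory and the import path of this module are assumptions):
#
#   from flask import Flask
#   from app.flask_api import bp as voice_cloning_bp  # hypothetical import path
#
#   app = Flask(__name__)
#   app.register_blueprint(voice_cloning_bp)  # serves the routes under /api
#   app.run(host="0.0.0.0", port=5000)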