Spaces:

AJ50
/

voice-cloning-backend

Sleeping

File size: 21,511 Bytes

"""Flask API Backend for Voice Cloning
Integrates the Python voice cloning backend with the React frontend
Supports multilingual synthesis: English (WaveRNN) and Hindi (XTTS)
"""

from flask import Blueprint, request, jsonify, send_file
from pathlib import Path
import uuid
import json
from datetime import datetime
import sys
import os

from .voice_cloning import synthesize

# Blueprint that carries every route in this module; all URLs are prefixed with /api.
bp = Blueprint('voice_cloning', __name__, url_prefix='/api')

# Directory one level above the one containing this file; all data folders hang off it.
BASE_DIR = Path(__file__).resolve().parents[1]

# Configuration
UPLOAD_FOLDER = BASE_DIR / 'enrolled_voices'  # enrolled voice samples
OUTPUT_FOLDER = BASE_DIR / 'outputs'          # generated audio, served via /api/audio/<filename>
MODELS_DIR = BASE_DIR / 'models'              # passed as models_dir to the synthesis backends
VOICES_DB = UPLOAD_FOLDER / 'voices.json'     # JSON list of enrolled-voice entries

# Locate the Hindi XTTS model. Candidates are probed in order and the first
# one that exists on disk wins; HINDI_MODEL_DIR stays None when none exists.
HINDI_MODEL_DIR = None
possible_hindi_dirs = [
    # Explicit override from the environment (skipped when unset or empty)
    Path(os.environ['HINDI_MODEL_PATH']) if os.environ.get('HINDI_MODEL_PATH') else None,
    # Local development
    BASE_DIR.parent.joinpath('Apoorv_hindi_model', 'models', 'xtts_hindi'),
    # Alternative location
    BASE_DIR.joinpath('models', 'xtts_hindi'),
]
for path in possible_hindi_dirs:
    if path is None or not path.exists():
        continue
    HINDI_MODEL_DIR = path
    print(f"✓ Hindi model found at: {HINDI_MODEL_DIR}")
    break
else:
    # Reached only when no candidate matched, i.e. the loop never hit `break`.
    print("⚠ Hindi model not found. Hindi synthesis will be unavailable.")
    print("  To enable Hindi support, set HINDI_MODEL_PATH environment variable")
    print("  or place model at: Apoorv_hindi_model/models/xtts_hindi")

# Create the data directories at import time (parents included; already-existing
# directories are fine). VOICES_DB lives inside UPLOAD_FOLDER, so the third call
# targets the same directory as the first.
try:
    UPLOAD_FOLDER.mkdir(parents=True, exist_ok=True)
    OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True)
    VOICES_DB.parent.mkdir(parents=True, exist_ok=True)
except Exception as e:
    # Without these folders no endpoint can store anything, so abort the process.
    print(f"Failed to create directories: {e}")
    sys.exit(1)

# Allowed audio extensions (compared case-insensitively, without the dot)
ALLOWED_EXTENSIONS = {'mp3', 'wav', 'm4a', 'flac', 'ogg', 'webm'}

def allowed_file(filename):
    """Return True when *filename* ends in one of ALLOWED_EXTENSIONS.

    Only the text after the last dot is considered, so 'a.tar.flac' is
    accepted and 'song.' is rejected. A missing, empty or non-string
    filename is rejected instead of raising, because an upload may arrive
    without a filename at all.
    """
    if not filename or not isinstance(filename, str):
        return False
    _, dot, extension = filename.rpartition('.')
    # `dot` is empty when the name contains no '.' at all
    return bool(dot) and extension.lower() in ALLOWED_EXTENSIONS

def load_voices_db():
    """Return the parsed contents of VOICES_DB, or an empty list if the file is absent."""
    if not VOICES_DB.exists():
        return []
    with open(VOICES_DB, 'r') as db_file:
        stored_voices = json.load(db_file)
    return stored_voices

def save_voices_db(voices):
    """Write *voices* to VOICES_DB as indented JSON, replacing the file atomically.

    The data is first written to a temporary file in the same directory and
    then moved over VOICES_DB with os.replace. If serialization or the write
    fails, the previous database file is left untouched instead of being
    truncated.

    Raises:
        TypeError / ValueError: if *voices* is not JSON-serializable.
        OSError: if the temporary file cannot be written or moved into place.
    """
    # Same directory as the target so the final rename stays on one filesystem.
    tmp_path = VOICES_DB.with_name(f"{VOICES_DB.name}.{uuid.uuid4().hex}.tmp")
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(voices, f, indent=2)
        os.replace(tmp_path, VOICES_DB)
    finally:
        # After a successful replace the temporary name no longer exists;
        # after a failure this removes the partial file.
        try:
            tmp_path.unlink()
        except FileNotFoundError:
            pass

@bp.route('/health', methods=['GET'])
def health_check():
    """Liveness probe: always answers with a fixed 'healthy' JSON payload."""
    payload = {
        'status': 'healthy',
        'message': 'Voice Cloning API is running'
    }
    return jsonify(payload)

@bp.route('/enroll', methods=['POST'])
def enroll_voice():
    """
    Enroll a new voice by accepting audio file and voice name
    Frontend sends: FormData with 'audio' (File) and 'voice_name' (string)

    The sample is stored in UPLOAD_FOLDER under a generated voice ID and a
    matching entry is appended to the voices database. A missing or blank
    'voice_name' falls back to 'Unnamed Voice'.

    Returns:
        201 with voice_id, voice_name and created_at on success;
        400 when the upload is missing, has no filename or an unsupported extension;
        500 when the audio file or its metadata cannot be stored.
    """
    try:
        # Check if audio file is present
        if 'audio' not in request.files:
            return jsonify({'error': 'No audio file provided'}), 400

        audio_file = request.files['audio']
        # `or` also replaces a whitespace-only name, which would otherwise be stored as ''
        voice_name = request.form.get('voice_name', '').strip() or 'Unnamed Voice'

        # `not` covers both '' and a missing (None) filename
        if not audio_file.filename:
            return jsonify({'error': 'No file selected'}), 400

        if not allowed_file(audio_file.filename):
            return jsonify({'error': 'Invalid file type. Supported: mp3, wav, m4a, flac, ogg, webm'}), 400

        # Ensure upload folder exists
        UPLOAD_FOLDER.mkdir(parents=True, exist_ok=True)

        # Generate unique ID; the stored name uses only the ID and the validated
        # extension, never the client-supplied filename
        voice_id = f"voice_{uuid.uuid4().hex[:8]}"
        file_extension = audio_file.filename.rsplit('.', 1)[1].lower()
        filename = f"{voice_id}.{file_extension}"
        filepath = UPLOAD_FOLDER / filename

        # Save the audio file with error handling
        try:
            audio_file.save(str(filepath))
            print(f"✓ Audio file saved: {filepath}")
        except Exception as file_err:
            print(f"✗ Failed to save audio file: {file_err}")
            return jsonify({'error': f'Failed to save audio: {str(file_err)}'}), 500

        # Create voice entry
        voice_entry = {
            'id': voice_id,
            'name': voice_name,
            'filename': filename,
            'createdAt': datetime.now().isoformat()
        }

        # Update voices database with error handling
        try:
            VOICES_DB.parent.mkdir(parents=True, exist_ok=True)
            voices = load_voices_db()
            voices.append(voice_entry)
            save_voices_db(voices)
            print(f"✓ Voice '{voice_name}' (ID: {voice_id}) enrolled successfully")
        except Exception as db_err:
            print(f"✗ Failed to update voices DB: {db_err}")
            # Do not leave behind an audio file that no database entry points to.
            try:
                filepath.unlink()
            except OSError as cleanup_err:
                print(f"✗ Failed to remove orphaned audio file: {cleanup_err}")
            return jsonify({'error': f'Failed to save voice metadata: {str(db_err)}'}), 500

        return jsonify({
            'success': True,
            'message': f'Voice "{voice_name}" enrolled successfully',
            'voice_id': voice_id,
            'voice_name': voice_name,
            'created_at': voice_entry['createdAt']
        }), 201

    except Exception as e:
        print(f"✗ Error enrolling voice: {e}")
        import traceback
        traceback.print_exc()
        return jsonify({'error': f'Failed to enroll voice: {str(e)}'}), 500

@bp.route('/voices', methods=['GET'])
def get_voices():
    """
    List every enrolled voice.

    Each database entry is reduced to its 'id', 'name' and 'createdAt'
    fields; the frontend uses the result to fill the voice dropdown.
    Responds 500 with an error message if the database cannot be read.
    """
    exposed_keys = ('id', 'name', 'createdAt')
    try:
        voices_list = []
        for entry in load_voices_db():
            # Copy only the public fields of each stored entry
            voices_list.append({key: entry[key] for key in exposed_keys})
        return jsonify({'voices': voices_list}), 200
    except Exception as e:
        print(f"Error getting voices: {e}")
        return jsonify({'error': f'Failed to get voices: {str(e)}'}), 500

@bp.route('/synthesize', methods=['POST'])
def synthesize_speech():
    """
    Synthesize speech from text using enrolled voice (multilingual support).

    Frontend sends JSON:
    {
        "text": "Your text here",
        "voice_id": "voice_xxx",
        "language": "english" or "hindi"  (optional, defaults to english)
    }

    Supports:
    - English: Uses WaveRNN vocoder (existing model)
    - Hindi: Uses XTTS model (requires hindi_model_dir)

    Returns:
        200 with 'audio_url' pointing at the generated file under /api/audio/;
        400 for a missing/malformed body or invalid fields;
        404 when the voice or its audio file does not exist;
        503 when Hindi is requested but no Hindi model is configured;
        500 when synthesis fails or produces no output file.
    """
    try:
        # silent=True turns an unparsable body into None, so the client gets
        # the 400 below instead of the parse error being reported as a 500.
        data = request.get_json(silent=True)

        if not data:
            return jsonify({'error': 'No data provided'}), 400

        if not isinstance(data, dict):
            return jsonify({'error': 'Request body must be a JSON object'}), 400

        text = data.get('text')
        voice_id = data.get('voice_id')
        language = data.get('language')

        # Explicit JSON null is treated like an absent key
        if text is None:
            text = ''
        if voice_id is None:
            voice_id = ''
        if language is None:
            language = 'english'

        # Reject non-string values up front; .strip()/.lower() would raise on them
        if not all(isinstance(value, str) for value in (text, voice_id, language)):
            return jsonify({'error': "'text', 'voice_id' and 'language' must be strings"}), 400

        text = text.strip()
        language = language.lower()

        if not text:
            return jsonify({'error': 'No text provided'}), 400

        if not voice_id:
            return jsonify({'error': 'No voice selected'}), 400

        if language not in ['english', 'hindi']:
            return jsonify({'error': f'Unsupported language: {language}. Supported: english, hindi'}), 400

        # Check if Hindi model is available for Hindi synthesis
        if language == 'hindi' and not HINDI_MODEL_DIR:
            return jsonify({
                'error': 'Hindi synthesis unavailable. Hindi model not configured.',
                'available_languages': ['english']
            }), 503

        # Find the voice in database
        voices = load_voices_db()
        voice = next((v for v in voices if v['id'] == voice_id), None)

        if not voice:
            return jsonify({'error': 'Voice not found'}), 404

        # Reconstruct path from UPLOAD_FOLDER
        voice_filepath = UPLOAD_FOLDER / voice['filename']

        if not voice_filepath.exists():
            return jsonify({'error': f'Voice file not found: {voice_filepath}'}), 404

        # Generate unique output filename
        output_filename = f"synthesis_{uuid.uuid4().hex[:8]}.wav"
        output_path = OUTPUT_FOLDER / output_filename

        print(f"\n[API /synthesize]")
        print(f"  Language: {language.upper()}")
        print(f"  Text: '{text[:50]}...'")
        print(f"  Voice: '{voice['name']}'")
        print(f"  Voice file: {voice_filepath}")
        print(f"  Output: {output_path}")
        sys.stdout.flush()

        try:
            if language == 'english':
                # Use original English synthesis (WaveRNN)
                synthesize(
                    voice_path=voice_filepath,
                    text=text,
                    models_dir=MODELS_DIR,
                    out_path=output_path
                )
            else:
                # Use multilingual TTS for Hindi
                from app.multilingual_tts import MultilingualTTSService
                tts_service = MultilingualTTSService(
                    models_dir=MODELS_DIR,
                    hindi_model_dir=HINDI_MODEL_DIR
                )
                try:
                    tts_service.synthesize_and_save(
                        text=text,
                        voice_sample_path=voice_filepath,
                        output_path=output_path,
                        language=language
                    )
                finally:
                    # Run cleanup even when synthesis raises
                    tts_service.cleanup()

            print(f"[API /synthesize] ✓ Synthesis completed!")
            sys.stdout.flush()

        except Exception as synth_error:
            print(f"[API /synthesize] ✗ Synthesis error: {synth_error}")
            import traceback
            traceback.print_exc()
            sys.stdout.flush()
            return jsonify({'error': f'Synthesis failed: {str(synth_error)}'}), 500

        if not output_path.exists():
            error_msg = 'Synthesis failed - output not generated'
            return jsonify({'error': error_msg}), 500

        # Return the audio file URL
        return jsonify({
            'success': True,
            'message': f'{language.capitalize()} speech synthesized successfully',
            'audio_url': f'/api/audio/{output_filename}',
            'language': language
        }), 200

    except Exception as e:
        print(f"[API /synthesize] Unexpected error: {e}")
        import traceback
        traceback.print_exc()
        return jsonify({'error': f'Failed to synthesize speech: {str(e)}'}), 500

@bp.route('/audio/<filename>', methods=['GET'])
def get_audio(filename):
    """
    Serve synthesized audio files
    Frontend uses this URL to play/download the generated audio

    Only regular files located inside OUTPUT_FOLDER are served. Anything
    else (a name that resolves outside the folder, a directory such as '..',
    or a missing file) is answered with 404.
    """
    try:
        # `filename` is client-controlled: resolve it and make sure the result
        # is still inside the output folder before touching the file system.
        output_root = OUTPUT_FOLDER.resolve()
        filepath = (output_root / filename).resolve()
        if output_root not in filepath.parents or not filepath.is_file():
            return jsonify({'error': 'Audio file not found'}), 404

        return send_file(
            str(filepath),
            mimetype='audio/wav',
            as_attachment=False,
            download_name=filename
        )
    except Exception as e:
        print(f"Error serving audio: {e}")
        return jsonify({'error': f'Failed to serve audio: {str(e)}'}), 500

@bp.route('/voices/<voice_id>', methods=['DELETE'])
def delete_voice(voice_id):
    """
    Remove an enrolled voice: its audio sample (if still on disk) and its
    database entry. Responds 404 when no entry has the given ID and 500 when
    removal fails. Optional for the frontend.
    """
    try:
        all_voices = load_voices_db()

        # First entry whose ID matches, if any
        target = None
        for entry in all_voices:
            if entry['id'] == voice_id:
                target = entry
                break

        if target is None:
            return jsonify({'error': 'Voice not found'}), 404

        # Remove the stored sample before touching the database
        sample_path = UPLOAD_FOLDER / target['filename']
        if sample_path.exists():
            sample_path.unlink()

        # Persist the database without the deleted entry
        remaining = [entry for entry in all_voices if entry['id'] != voice_id]
        save_voices_db(remaining)

        deleted_name = target["name"]
        response_body = {
            'success': True,
            'message': f'Voice "{deleted_name}" deleted successfully'
        }
        return jsonify(response_body), 200

    except Exception as e:
        print(f"Error deleting voice: {e}")
        return jsonify({'error': f'Failed to delete voice: {str(e)}'}), 500

@bp.route('/spectrogram/<audio_filename>', methods=['GET'])
def get_spectrogram(audio_filename):
    """
    Generate and return mel-spectrogram data for visualization
    Frontend can use this to display real-time mel-spectrogram

    The audio file must be a regular file inside OUTPUT_FOLDER. The response
    holds the spectrogram as a list of time steps, each a list of 80 integer
    values in 0-255 (dB scale relative to the loudest bin).

    Returns:
        200 with 'spectrogram', 'n_mels' and 'shape';
        404 when the file does not exist inside OUTPUT_FOLDER;
        500 when loading or analysis fails.
    """
    try:
        print(f"[Spectrogram] Requested file: {audio_filename}")
        # `audio_filename` is client-controlled: resolve it and require the
        # result to stay inside the output folder.
        output_root = OUTPUT_FOLDER.resolve()
        filepath = (output_root / audio_filename).resolve()
        print(f"[Spectrogram] Full path: {filepath}")
        print(f"[Spectrogram] File exists: {filepath.exists()}")

        if output_root not in filepath.parents or not filepath.is_file():
            print(f"[Spectrogram] ERROR: File not found: {filepath}")
            return jsonify({'error': f'Audio file {audio_filename} not found'}), 404

        # Import librosa for mel-spectrogram generation
        import librosa
        import numpy as np

        n_mels = 80     # 80 mel bands (common for Tacotron2)
        top_db = 80.0   # dynamic range kept below the peak; also the scaling range below

        print(f"[Spectrogram] Loading audio file...")
        # Load audio file at its native sample rate (sr=None disables resampling)
        y, sr = librosa.load(str(filepath), sr=None)
        print(f"[Spectrogram] Audio loaded: shape={y.shape}, sr={sr}")

        # Generate mel-spectrogram with a fixed hop of 512 samples
        mel_spec = librosa.feature.melspectrogram(
            y=y,
            sr=sr,
            n_mels=n_mels,
            hop_length=512
        )
        print(f"[Spectrogram] Mel-spec generated: shape={mel_spec.shape}")

        # Convert to dB scale (log scale for better visualization).
        # With ref=np.max the peak is 0 dB and top_db bounds the result to [-top_db, 0].
        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max, top_db=top_db)

        # Normalize to 0-255 range for visualization
        mel_spec_normalized = np.clip(
            ((mel_spec_db + top_db) / top_db * 255),
            0,
            255
        ).astype(np.uint8)

        # Convert to list for JSON serialization
        # Transpose to time x frequency format for frontend
        spectrogram_data = mel_spec_normalized.T.tolist()

        print(f"[Spectrogram] Successfully generated spectrogram: {len(spectrogram_data)} time steps")

        return jsonify({
            'spectrogram': spectrogram_data,
            'n_mels': n_mels,
            'shape': {
                'time_steps': len(spectrogram_data),
                'frequency_bins': n_mels
            }
        }), 200

    except Exception as e:
        print(f"[Spectrogram] ERROR: {str(e)}")
        import traceback
        traceback.print_exc()
        return jsonify({'error': f'Failed to generate spectrogram: {str(e)}'}), 500

@bp.route('/waveform/<audio_filename>', methods=['GET'])
def get_waveform(audio_filename):
    """
    Serve audio waveform as numeric array for real-time FFT visualization
    Frontend fetches this and computes FFT using Web Audio API

    The audio file must be a regular file inside OUTPUT_FOLDER. Multi-channel
    audio is averaged to mono, and audio sampled above 8000 Hz is reduced to
    8000 Hz by linear interpolation to keep the JSON payload small.

    Returns:
        200 with 'waveform', 'sample_rate', 'duration' (seconds) and 'samples';
        404 when the file does not exist inside OUTPUT_FOLDER;
        500 when a required library is missing or processing fails.
    """
    try:
        # `audio_filename` is client-controlled: resolve it and require the
        # result to stay inside the output folder.
        output_root = OUTPUT_FOLDER.resolve()
        filepath = (output_root / audio_filename).resolve()
        if output_root not in filepath.parents or not filepath.is_file():
            return jsonify({'error': 'Audio file not found'}), 404

        import soundfile as sf
        import numpy as np

        # Load audio file
        # soundfile returns (data, sample_rate)
        y, sr = sf.read(str(filepath))

        # Multi-channel data comes back 2-D; average the channels to mono
        if len(y.shape) > 1:
            y = np.mean(y, axis=1)

        # Ensure float32 for compatibility
        y = np.asarray(y, dtype=np.float32)

        # Downsample if very long to reduce JSON payload
        # Typical waveform for 60s at 22050Hz = 1.3M samples
        # For FFT we can use 8000 Hz safely (captures up to 4 kHz)
        target_sr = 8000
        # np.interp rejects an empty sample array, so zero-length audio is
        # returned as-is instead of failing with a 500.
        if sr > target_sr and len(y) > 0:
            # Calculate downsample factor
            resample_ratio = target_sr / sr
            new_length = int(len(y) * resample_ratio)
            # Simple linear interpolation for downsampling
            indices = np.linspace(0, len(y) - 1, new_length)
            y = np.interp(indices, np.arange(len(y)), y)
            sr = target_sr

        # Convert to list for JSON serialization
        waveform_data = y.tolist()

        return jsonify({
            'waveform': waveform_data,
            'sample_rate': sr,
            'duration': len(y) / sr,
            'samples': len(y)
        }), 200

    except ImportError as ie:
        err_msg = f'Soundfile library not available: {str(ie)}'
        return jsonify({'error': err_msg}), 500
    except Exception as e:
        print(f"Error serving waveform: {e}")
        import traceback
        traceback.print_exc()
        err_msg = f'Failed to generate waveform: {str(e)}'
        return jsonify({'error': err_msg}), 500


# ============================================================================
# SONG GENERATION ENDPOINTS
# ============================================================================

@bp.route('/convert_song', methods=['POST'])
def convert_song():
    """
    Convert a song to user's voice.

    Form data:
    - song: audio file (mp3, wav, etc.)
    - voice_id: ID of enrolled voice to use
    - language: 'english' or 'hindi' (case-insensitive, defaults to 'english')
    - add_effects: 'true' or 'false' to add reverb/compression

    The uploaded song is stored temporarily in OUTPUT_FOLDER for the
    conversion and removed again once the request finishes.

    Returns:
        200 with JSON containing 'audio_url' and 'filename' of the converted song;
        400 for a missing/unsupported upload, missing voice_id or unsupported language;
        404 when the voice or its audio file does not exist;
        503 when Hindi is requested but no Hindi model is configured;
        500 when conversion fails.
    """
    song_path = None  # set once the upload has a location on disk; used for cleanup
    try:
        print("\n[API] POST /api/convert_song")

        # Validate input
        if 'song' not in request.files:
            return jsonify({'error': 'No song file provided'}), 400

        if 'voice_id' not in request.form:
            return jsonify({'error': 'No voice_id provided'}), 400

        song_file = request.files['song']
        voice_id = request.form.get('voice_id')
        # Normalized like /synthesize so 'Hindi' and 'hindi' behave the same
        language = request.form.get('language', 'english').lower()
        add_effects = request.form.get('add_effects', 'true').lower() == 'true'

        # The filename check comes first so a missing filename never reaches allowed_file
        if not song_file.filename or not allowed_file(song_file.filename):
            return jsonify({'error': f'File type not allowed. Allowed: {ALLOWED_EXTENSIONS}'}), 400

        if language not in ['english', 'hindi']:
            return jsonify({'error': f'Unsupported language: {language}. Supported: english, hindi'}), 400

        if language == 'hindi' and not HINDI_MODEL_DIR:
            return jsonify({
                'error': 'Hindi synthesis unavailable. Hindi model not configured.',
                'available_languages': ['english']
            }), 503

        # Load voices database
        voices_db = load_voices_db()
        voice_data = next((v for v in voices_db if v['id'] == voice_id), None)

        if not voice_data:
            return jsonify({'error': f'Voice {voice_id} not found'}), 404

        # Get voice file path (checked before the upload is written to disk)
        voice_filepath = UPLOAD_FOLDER / voice_data['filename']
        if not voice_filepath.exists():
            return jsonify({'error': 'Voice file not found'}), 404

        # Save uploaded song under its real (validated) extension rather than
        # always '.wav', so the name matches the actual container format
        song_extension = song_file.filename.rsplit('.', 1)[1].lower()
        song_filename = f"song_{uuid.uuid4().hex}.{song_extension}"
        song_path = OUTPUT_FOLDER / song_filename
        song_file.save(song_path)
        print(f"[API] Song saved: {song_path}")

        # Output path
        output_filename = f"converted_song_{uuid.uuid4().hex}.wav"
        output_path = OUTPUT_FOLDER / output_filename

        print(f"[API] Starting song conversion...")
        print(f"[API] Language: {language}")
        print(f"[API] Add effects: {add_effects}")

        # Import multilingual song processor
        from app.multilingual_song_processor import MultilingualSongProcessor

        processor = MultilingualSongProcessor(
            models_dir=MODELS_DIR,
            hindi_model_dir=HINDI_MODEL_DIR if language == 'hindi' else None
        )
        result_path = processor.convert_song(
            song_path=song_path,
            voice_path=voice_filepath,
            output_path=output_path,
            language=language,
            add_effects=add_effects
        )

        print(f"[API] Song conversion complete: {result_path}")

        # Return download URL
        return jsonify({
            'success': True,
            'message': 'Song converted successfully',
            'audio_url': f'/api/audio/{output_filename}',
            'filename': output_filename,
            'language': language
        }), 200

    except Exception as e:
        print(f"[API] ✗ Error in convert_song: {e}")
        import traceback
        traceback.print_exc()
        return jsonify({'error': str(e)}), 500
    finally:
        # Best-effort removal of the temporary upload; OUTPUT_FOLDER is served
        # publicly, so uploads must not pile up there.
        if song_path is not None:
            try:
                song_path.unlink()
            except OSError:
                pass


@bp.route('/separate_vocals', methods=['POST'])
def separate_vocals():
    """
    Separate vocals from a song file.

    Form data:
    - song: audio file

    The uploaded song is stored temporarily in OUTPUT_FOLDER for the
    separation and removed again once the request finishes.

    Returns: JSON with vocal and instrumental file URLs (200),
    400 for a missing or unsupported upload, 500 when separation fails.
    """
    song_path = None      # set once the upload has a location on disk; used for cleanup
    produced_paths = ()   # files returned by the separator; never deleted during cleanup
    try:
        print("\n[API] POST /api/separate_vocals")

        if 'song' not in request.files:
            return jsonify({'error': 'No song file provided'}), 400

        song_file = request.files['song']

        # The filename check comes first so a missing filename never reaches allowed_file
        if not song_file.filename or not allowed_file(song_file.filename):
            return jsonify({'error': f'File type not allowed'}), 400

        # Save uploaded song under its real (validated) extension rather than
        # always '.wav', so the name matches the actual container format
        song_extension = song_file.filename.rsplit('.', 1)[1].lower()
        song_filename = f"song_{uuid.uuid4().hex}.{song_extension}"
        song_path = OUTPUT_FOLDER / song_filename
        song_file.save(song_path)

        print(f"[API] Song saved: {song_path}")
        print(f"[API] Separating vocals...")

        from app.song_conversion.vocal_separator import VocalSeparator

        separator = VocalSeparator()
        vocals_path, instrumental_path = separator.separate_and_save(
            song_path,
            OUTPUT_FOLDER,
            sr=16000
        )
        produced_paths = (vocals_path, instrumental_path)

        return jsonify({
            'success': True,
            'vocals_url': f'/api/audio/{vocals_path.name}',
            'instrumental_url': f'/api/audio/{instrumental_path.name}',
            'vocals_file': vocals_path.name,
            'instrumental_file': instrumental_path.name
        }), 200

    except Exception as e:
        print(f"[API] ✗ Error in separate_vocals: {e}")
        import traceback
        traceback.print_exc()
        return jsonify({'error': str(e)}), 500
    finally:
        # Best-effort removal of the temporary upload; OUTPUT_FOLDER is served
        # publicly, so uploads must not pile up there. The guard keeps a result
        # file intact should the separator ever return the input path itself.
        if song_path is not None and song_path not in produced_paths:
            try:
                song_path.unlink()
            except OSError:
                pass