| """ | |
| Voice Service for optional Text-to-Speech (TTS) and Automatic Speech Recognition (ASR) | |
| Provides voice interaction capabilities when enabled by user. | |
| """ | |
| import asyncio | |
| import logging | |
| import tempfile | |
| import os | |
| from typing import Optional, Dict, Any | |
| from pathlib import Path | |
| from config import ( | |
| ENABLE_VOICE_FEATURES, TTS_PROVIDER, ASR_PROVIDER, | |
| VOICE_LANGUAGE, DEFAULT_VOICE_SPEED | |
| ) | |
| logger = logging.getLogger("voicebot") | |


class VoiceService:
    def __init__(self):
        self.voice_enabled = ENABLE_VOICE_FEATURES
        self.tts_provider = TTS_PROVIDER
        self.asr_provider = ASR_PROVIDER
        self.language = VOICE_LANGUAGE
        self.voice_speed = DEFAULT_VOICE_SPEED
        # Default to unavailable; the _init_* helpers flip these flags on success
        self.tts_available = False
        self.asr_available = False

        # Initialize providers only if voice features are enabled
        if self.voice_enabled:
            self._init_tts_service()
            self._init_asr_service()
            logger.info(f"🎤 Voice Service initialized - TTS: {self.tts_provider}, ASR: {self.asr_provider}")
        else:
            logger.info("🔇 Voice features disabled")

    def _init_tts_service(self):
        """Initialize the Text-to-Speech service."""
        try:
            if self.tts_provider == "edge-tts":
                import edge_tts  # noqa: F401 -- imported only to verify the dependency is installed
                self.tts_available = True
                logger.info("✅ Edge TTS initialized")
            elif self.tts_provider == "openai-tts":
                # OpenAI TTS would require an OpenAI API key
                self.tts_available = False
                logger.info("⚠️ OpenAI TTS not configured")
            else:
                self.tts_available = False
                logger.warning(f"⚠️ Unknown TTS provider: {self.tts_provider}")
        except ImportError as e:
            self.tts_available = False
            logger.warning(f"⚠️ TTS dependencies not available: {e}")

    def _init_asr_service(self):
        """Initialize the Automatic Speech Recognition service."""
        try:
            if self.asr_provider == "whisper":
                import whisper
                # The base model balances speed against accuracy
                self.whisper_model = whisper.load_model("base")
                self.asr_available = True
                logger.info("✅ Whisper ASR initialized (base model)")
            elif self.asr_provider == "browser-native":
                # Browser-based ASR needs no server-side setup
                self.asr_available = True
                logger.info("✅ Browser ASR configured")
            else:
                self.asr_available = False
                logger.warning(f"⚠️ Unknown ASR provider: {self.asr_provider}")
        except ImportError as e:
            self.asr_available = False
            logger.warning(f"⚠️ ASR dependencies not available: {e}")

    def _get_language_code(self, user_language: Optional[str] = None) -> str:
        """
        Convert a user language preference to a Whisper language code.

        Args:
            user_language: User's language preference ('english', 'hindi', 'hi-IN', etc.)

        Returns:
            Two-letter language code for Whisper (e.g., 'en', 'hi')
        """
        if not user_language:
            # Fall back to the configured default language
            return self.language.split('-')[0] if self.language else 'en'

        user_lang_lower = user_language.lower()

        # Map common language names and codes to Whisper codes
        language_mapping = {
            'english': 'en',
            'hindi': 'hi',
            'hinglish': 'hi',  # Treat Hinglish as Hindi for better results
            'en': 'en',
            'hi': 'hi',
            'en-in': 'en',
            'hi-in': 'hi',
            'en-us': 'en'
        }

        # Extract the base language from a locale code (e.g., 'hi-IN' -> 'hi')
        if '-' in user_lang_lower:
            base_lang = user_lang_lower.split('-')[0]
            return language_mapping.get(base_lang, 'en')

        return language_mapping.get(user_lang_lower, 'en')
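
    # Illustration of how the mapping above resolves (hypothetical inputs):
    #   _get_language_code('hi-IN')    -> 'hi'  (locale code reduced to its base language)
    #   _get_language_code('Hinglish') -> 'hi'  (name lookup is case-insensitive)
    #   _get_language_code('pt-BR')    -> 'en'  (unmapped languages fall back to English)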

    def _get_default_voice(self) -> str:
        """Get the default voice for the configured language."""
        language_voices = {
            'hi-IN': 'hi-IN-SwaraNeural',    # Hindi (India) female voice
            'en-IN': 'en-IN-NeerjaNeural',   # English (India) female voice
            'en-US': 'en-US-AriaNeural',     # English (US) female voice
            'es-ES': 'es-ES-ElviraNeural',   # Spanish (Spain) female voice
            'fr-FR': 'fr-FR-DeniseNeural',   # French (France) female voice
            'de-DE': 'de-DE-KatjaNeural',    # German (Germany) female voice
            'ja-JP': 'ja-JP-NanamiNeural',   # Japanese female voice
            'ko-KR': 'ko-KR-SunHiNeural',    # Korean female voice
            'zh-CN': 'zh-CN-XiaoxiaoNeural'  # Chinese (Simplified) female voice
        }
        return language_voices.get(self.language, 'en-US-AriaNeural')

    async def text_to_speech(self, text: str, voice: Optional[str] = None) -> Optional[bytes]:
        """
        Convert text to speech audio.

        Returns audio bytes, or None if TTS is not available.
        """
        if not self.voice_enabled or not self.tts_available:
            return None

        # Use the default voice for the configured language if none is specified
        if voice is None:
            voice = self._get_default_voice()

        try:
            if self.tts_provider == "edge-tts":
                import edge_tts

                # voice_speed is a multiplier (1.0 = normal speed), converted here to
                # the signed percentage string edge-tts expects, e.g. '+0%' or '-25%'
                communicate = edge_tts.Communicate(text, voice, rate=f"{int((self.voice_speed - 1) * 100):+d}%")

                # Collect the streamed audio chunks
                audio_data = b""
                async for chunk in communicate.stream():
                    if chunk["type"] == "audio":
                        audio_data += chunk["data"]
                return audio_data
        except Exception as e:
            logger.error(f"❌ TTS Error: {e}")
        return None
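
    # Usage sketch (hypothetical caller code), e.g. inside an async request handler;
    # edge-tts streams MP3-encoded audio by default:
    #   audio = await voice_service.text_to_speech("Hello!", voice="en-IN-NeerjaNeural")
    #   if audio:
    #       with open("reply.mp3", "wb") as f:
    #           f.write(audio)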

    async def speech_to_text(self, audio_file_path: str, user_language: Optional[str] = None) -> Optional[str]:
        """
        Convert speech audio to text.

        Returns the transcribed text, or None if ASR is not available.

        Args:
            audio_file_path: Path to the audio file
            user_language: User's preferred language (e.g., 'english', 'hindi', 'hi-IN')
        """
        if not self.voice_enabled or not self.asr_available:
            return None

        try:
            if self.asr_provider == "whisper":
                # Determine the language code from the user preference or the default
                language_code = self._get_language_code(user_language)
                logger.info(f"🎤 Using Whisper with language: {language_code} (user_pref: {user_language})")

                transcribe_options = {
                    "fp16": False,       # FP32 avoids FP16 accuracy loss on CPU
                    "temperature": 0.0,  # Deterministic decoding
                    "best_of": 1,        # Candidate count (only used when sampling with temperature > 0)
                    "beam_size": 5,      # Beam search width at temperature 0
                    "patience": 1.0,     # Standard beam search patience
                }
                if language_code:
                    # Pin the language instead of relying on auto-detection
                    transcribe_options["language"] = language_code

                result = self.whisper_model.transcribe(audio_file_path, **transcribe_options)
                logger.info(f"🎤 {language_code.upper()} transcription result: {result.get('text', '')}")

                transcribed_text = result["text"].strip()

                # Log a rough quality proxy: no_speech_prob is the probability that a
                # segment contains no speech, so a lower average means higher confidence
                if "segments" in result and result["segments"]:
                    avg_no_speech = sum(seg.get("no_speech_prob", 0) for seg in result["segments"]) / len(result["segments"])
                    logger.info(f"🎤 Average confidence: {1 - avg_no_speech:.2f}")
                return transcribed_text
        except Exception as e:
            logger.error(f"❌ ASR Error: {e}")
        return None
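
    # Usage sketch (hypothetical file path saved from an upload):
    #   text = await voice_service.speech_to_text("/tmp/query.wav", user_language="hindi")
    # Transcribes with the language pinned to 'hi'; returns None when ASR is unavailable.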

    def get_available_voices(self) -> Dict[str, Any]:
        """Get the list of available TTS voices."""
        if not self.voice_enabled or self.tts_provider != "edge-tts":
            return {}

        # Common Edge TTS voices
        voices = {
            "english": {
                "female": ["en-US-AriaNeural", "en-US-JennyNeural", "en-GB-SoniaNeural"],
                "male": ["en-US-GuyNeural", "en-US-DavisNeural", "en-GB-RyanNeural"]
            },
            "multilingual": {
                "spanish": ["es-ES-ElviraNeural", "es-MX-DaliaNeural"],
                "french": ["fr-FR-DeniseNeural", "fr-CA-SylvieNeural"],
                "german": ["de-DE-KatjaNeural", "de-AT-IngridNeural"],
                "italian": ["it-IT-ElsaNeural", "it-IT-IsabellaNeural"],
                "hindi": ["hi-IN-SwaraNeural", "hi-IN-MadhurNeural"]
            }
        }
        return voices

    def create_voice_response_with_guidance(self,
                                            answer: str,
                                            suggested_resources: Optional[list] = None,
                                            redirect_info: Optional[str] = None) -> str:
        """
        Create a comprehensive voice response with guidance and redirection.
        """
        response_parts = [answer]

        # Point the user to further reading
        if suggested_resources:
            response_parts.append("For more detailed information, I recommend checking:")
            response_parts.extend(f"• {resource}" for resource in suggested_resources)

        # Add redirection information
        if redirect_info:
            response_parts.append(f"You can also {redirect_info}")

        # Invite follow-up questions
        response_parts.append("Is there anything specific you'd like me to explain further? Just ask!")

        return "\n".join(response_parts)

    def generate_redirect_suggestions(self, topic: str, query_type: str) -> Dict[str, Any]:
        """
        Generate contextual redirect suggestions based on the topic and query type.
        """
        suggestions = {
            "documents": [],
            "websites": [],
            "departments": [],
            "redirect_text": ""
        }

        topic_lower = topic.lower()

        # Government policy topics
        if "digital india" in topic_lower:
            suggestions["documents"] = [
                "Digital India Policy Framework 2023",
                "E-Governance Implementation Guidelines"
            ]
            suggestions["websites"] = ["digitalindia.gov.in", "meity.gov.in"]
            suggestions["departments"] = ["Ministry of Electronics & IT"]
            suggestions["redirect_text"] = "visit the official Digital India portal or contact your local e-governance center"
        elif "education" in topic_lower:
            suggestions["documents"] = [
                "National Education Policy 2020",
                "Sarva Shiksha Abhiyan Guidelines"
            ]
            suggestions["websites"] = ["education.gov.in", "mhrd.gov.in"]
            suggestions["departments"] = ["Ministry of Education"]
            suggestions["redirect_text"] = "contact your District Education Officer or visit the nearest education department office"
        elif "health" in topic_lower:
            suggestions["documents"] = [
                "National Health Policy 2017",
                "Ayushman Bharat Implementation Guide"
            ]
            suggestions["websites"] = ["mohfw.gov.in", "pmjay.gov.in"]
            suggestions["departments"] = ["Ministry of Health & Family Welfare"]
            suggestions["redirect_text"] = "visit your nearest Primary Health Center or call the health helpline"
        elif "employment" in topic_lower or "job" in topic_lower:
            suggestions["documents"] = [
                "Employment Generation Schemes",
                "Skill Development Programs Guide"
            ]
            suggestions["websites"] = ["nrega.nic.in", "msde.gov.in"]
            suggestions["departments"] = ["Ministry of Rural Development", "Ministry of Skill Development"]
            suggestions["redirect_text"] = "visit your local employment exchange or skill development center"

        # Default for topics without a specific mapping
        if not suggestions["redirect_text"]:
            suggestions["redirect_text"] = "contact the relevant government department or visit your local district collector's office"

        return suggestions
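
    # Example (hypothetical query): generate_redirect_suggestions("health insurance", "policy")
    # matches the "health" branch and returns:
    #   {"documents": ["National Health Policy 2017", "Ayushman Bharat Implementation Guide"],
    #    "websites": ["mohfw.gov.in", "pmjay.gov.in"],
    #    "departments": ["Ministry of Health & Family Welfare"],
    #    "redirect_text": "visit your nearest Primary Health Center or call the health helpline"}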

    def is_voice_enabled(self) -> bool:
        """Check whether voice features are enabled."""
        return self.voice_enabled

    def get_voice_status(self) -> Dict[str, Any]:
        """Get the current voice service status."""
        return {
            "voice_enabled": self.voice_enabled,
            "tts_available": self.tts_available,
            "asr_available": self.asr_available,
            "tts_provider": self.tts_provider,
            "asr_provider": self.asr_provider,
            "language": self.language,
            "voice_speed": self.voice_speed
        }


# Global instance shared across the application
voice_service = VoiceService()
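

# Minimal smoke-test sketch; "sample.wav" is a hypothetical file name, and the calls
# only produce audio/text when ENABLE_VOICE_FEATURES is set and the edge-tts/whisper
# dependencies are installed:
if __name__ == "__main__":
    async def _demo():
        print(f"Voice status: {voice_service.get_voice_status()}")

        audio = await voice_service.text_to_speech("Namaste! How can I help you today?")
        if audio:
            with open("demo_reply.mp3", "wb") as f:
                f.write(audio)
            print(f"Wrote {len(audio)} bytes of TTS audio to demo_reply.mp3")

        text = await voice_service.speech_to_text("sample.wav", user_language="hindi")
        print(f"Transcription: {text!r}")

    asyncio.run(_demo())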