Spaces:
Sleeping
Sleeping
File size: 13,824 Bytes
cf02b2b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 |
"""
Voice Service for optional Text-to-Speech (TTS) and Automatic Speech Recognition (ASR)
Provides voice interaction capabilities when enabled by user.
"""
import asyncio
import logging
import tempfile
import os
from typing import Optional, Dict, Any
from pathlib import Path
from config import (
ENABLE_VOICE_FEATURES, TTS_PROVIDER, ASR_PROVIDER,
VOICE_LANGUAGE, DEFAULT_VOICE_SPEED
)
logger = logging.getLogger("voicebot")
class VoiceService:
    """
    Optional Text-to-Speech (TTS) and Automatic Speech Recognition (ASR) service.

    Provider choices and language come from config; when ENABLE_VOICE_FEATURES
    is off every public method degrades gracefully (returns None / empty values).
    """

    def __init__(self):
        # Snapshot configuration at construction time.
        self.voice_enabled = ENABLE_VOICE_FEATURES
        self.tts_provider = TTS_PROVIDER
        self.asr_provider = ASR_PROVIDER
        self.language = VOICE_LANGUAGE
        self.voice_speed = DEFAULT_VOICE_SPEED
        # Always define availability flags so later checks can't hit AttributeError,
        # even when voice features are disabled and the init helpers never run.
        self.tts_available = False
        self.asr_available = False
        # Initialize provider back-ends only when voice features are enabled.
        if self.voice_enabled:
            self._init_tts_service()
            self._init_asr_service()
            logger.info(f"🎤 Voice Service initialized - TTS: {self.tts_provider}, ASR: {self.asr_provider}")
        else:
            logger.info("🔇 Voice features disabled")

    def _init_tts_service(self):
        """Initialize the Text-to-Speech back-end and set self.tts_available."""
        try:
            if self.tts_provider == "edge-tts":
                import edge_tts  # noqa: F401 -- availability probe only
                self.tts_available = True
                logger.info("✅ Edge TTS initialized")
            elif self.tts_provider == "openai-tts":
                # OpenAI TTS would require an OpenAI API key; not wired up yet.
                self.tts_available = False
                logger.info("⚠️ OpenAI TTS not configured")
            else:
                self.tts_available = False
                logger.warning(f"⚠️ Unknown TTS provider: {self.tts_provider}")
        except ImportError as e:
            self.tts_available = False
            logger.warning(f"⚠️ TTS dependencies not available: {e}")

    def _init_asr_service(self):
        """Initialize the Automatic Speech Recognition back-end and set self.asr_available."""
        try:
            if self.asr_provider == "whisper":
                import whisper
                # "base" model balances speed and accuracy.
                self.whisper_model = whisper.load_model("base")
                self.asr_available = True
                logger.info("✅ Whisper ASR initialized (base model for accuracy)")
            elif self.asr_provider == "browser-native":
                # Browser-based ASR doesn't require server-side setup.
                self.asr_available = True
                logger.info("✅ Browser ASR configured")
            else:
                self.asr_available = False
                logger.warning(f"⚠️ Unknown ASR provider: {self.asr_provider}")
        except ImportError as e:
            self.asr_available = False
            logger.warning(f"⚠️ ASR dependencies not available: {e}")

    def _get_language_code(self, user_language: str = None) -> str:
        """
        Convert a user language preference to a two-letter Whisper language code.

        Args:
            user_language: User's preference ('english', 'hindi', 'hi-IN', ...).
                           When None, falls back to the configured default.

        Returns:
            Two-letter language code for Whisper (e.g., 'en', 'hi').
            Unknown languages fall back to 'en'.
        """
        if not user_language:
            # Fall back to the configured default language (e.g. 'hi-IN' -> 'hi').
            return self.language.split('-')[0] if self.language else 'en'
        user_lang_lower = user_language.lower()
        # Map common language names / codes to Whisper codes.
        language_mapping = {
            'english': 'en',
            'hindi': 'hi',
            'hinglish': 'hi',  # Treat Hinglish as Hindi for better results
            'en': 'en',
            'hi': 'hi',
            'en-in': 'en',
            'hi-in': 'hi',
            'en-us': 'en'
        }
        # Extract the base language from a locale code (e.g., 'hi-IN' -> 'hi').
        if '-' in user_lang_lower:
            base_lang = user_lang_lower.split('-')[0]
            return language_mapping.get(base_lang, 'en')
        return language_mapping.get(user_lang_lower, 'en')

    def _get_default_voice(self) -> str:
        """Return the default Edge TTS voice for the configured language."""
        language_voices = {
            'hi-IN': 'hi-IN-SwaraNeural',     # Hindi (India) female voice
            'en-IN': 'en-IN-NeerjaNeural',    # English (India) female voice
            'en-US': 'en-US-AriaNeural',      # English (US) female voice
            'es-ES': 'es-ES-ElviraNeural',    # Spanish (Spain) female voice
            'fr-FR': 'fr-FR-DeniseNeural',    # French (France) female voice
            'de-DE': 'de-DE-KatjaNeural',     # German (Germany) female voice
            'ja-JP': 'ja-JP-NanamiNeural',    # Japanese female voice
            'ko-KR': 'ko-KR-SunHiNeural',     # Korean female voice
            'zh-CN': 'zh-CN-XiaoxiaoNeural'   # Chinese (Simplified) female voice
        }
        return language_voices.get(self.language, 'en-US-AriaNeural')

    async def text_to_speech(self, text: str, voice: str = None) -> Optional[bytes]:
        """
        Convert text to speech audio.

        Args:
            text: Text to synthesize.
            voice: Edge TTS voice name; defaults to the configured language's voice.

        Returns:
            Audio bytes, or None if TTS is unavailable or synthesis fails.
        """
        if not self.voice_enabled or not self.tts_available:
            return None
        # Use the default voice for the configured language if none specified.
        if voice is None:
            voice = self._get_default_voice()
        try:
            if self.tts_provider == "edge-tts":
                import edge_tts
                # Map voice_speed (e.g. 1.25) to an Edge TTS rate string (e.g. "+25%").
                communicate = edge_tts.Communicate(text, voice, rate=f"{int((self.voice_speed - 1) * 100):+d}%")
                # Accumulate streamed audio chunks.
                audio_data = b""
                async for chunk in communicate.stream():
                    if chunk["type"] == "audio":
                        audio_data += chunk["data"]
                return audio_data
            # No server-side synthesis for other providers.
            return None
        except Exception as e:
            logger.error(f"❌ TTS Error: {e}")
            return None

    async def speech_to_text(self, audio_file_path: str, user_language: str = None) -> Optional[str]:
        """
        Convert speech audio to text.

        Args:
            audio_file_path: Path to the audio file.
            user_language: User's preferred language (e.g., 'english', 'hindi', 'hi-IN').

        Returns:
            Transcribed text, or None if ASR is unavailable or transcription fails.
        """
        if not self.voice_enabled or not self.asr_available:
            return None
        try:
            if self.asr_provider != "whisper":
                # Browser-native ASR happens client-side; nothing to do here.
                return None
            # Determine language code from the user preference or config default.
            language_code = self._get_language_code(user_language)
            logger.info(f"🎤 Using Whisper with language: {language_code} (user_pref: {user_language})")
            # Enhanced transcription options for better accuracy.
            transcribe_options = {
                "fp16": False,        # Use FP32 for better accuracy on CPU
                "temperature": 0.0,   # Deterministic output
                "best_of": 1,         # Use best transcription
                "beam_size": 5,       # Better beam search
                "patience": 1.0,      # Wait for better results
            }
            # Only force a language for non-English; English is Whisper's default.
            if language_code and language_code != 'en':
                transcribe_options["language"] = language_code
                label = language_code.upper()
            else:
                label = "English"
            result = self.whisper_model.transcribe(audio_file_path, **transcribe_options)
            logger.info(f"🎤 {label} transcription result: {result.get('text', '')}")
            transcribed_text = result["text"].strip()
            # Log a rough confidence metric derived from no_speech_prob if present.
            if "segments" in result and result["segments"]:
                avg_no_speech = sum(seg.get("no_speech_prob", 0) for seg in result["segments"]) / len(result["segments"])
                logger.info(f"🎤 Average confidence: {1 - avg_no_speech:.2f}")
            return transcribed_text
        except Exception as e:
            logger.error(f"❌ ASR Error: {e}")
            return None

    def get_available_voices(self) -> Dict[str, Any]:
        """Return a catalog of common Edge TTS voices, or {} if unavailable."""
        if not self.voice_enabled or self.tts_provider != "edge-tts":
            return {}
        # Common Edge TTS voices, grouped by language and gender.
        voices = {
            "english": {
                "female": ["en-US-AriaNeural", "en-US-JennyNeural", "en-GB-SoniaNeural"],
                "male": ["en-US-GuyNeural", "en-US-DavisNeural", "en-GB-RyanNeural"]
            },
            "multilingual": {
                "spanish": ["es-ES-ElviraNeural", "es-MX-DaliaNeural"],
                "french": ["fr-FR-DeniseNeural", "fr-CA-SylvieNeural"],
                "german": ["de-DE-KatjaNeural", "de-AT-IngridNeural"],
                "italian": ["it-IT-ElsaNeural", "it-IT-IsabellaNeural"],
                "hindi": ["hi-IN-SwaraNeural", "hi-IN-MadhurNeural"]
            }
        }
        return voices

    def create_voice_response_with_guidance(self,
                                            answer: str,
                                            suggested_resources: list = None,
                                            redirect_info: str = None) -> str:
        """
        Create a comprehensive voice response with guidance and redirection.

        Args:
            answer: The main answer text.
            suggested_resources: Optional list of resources to recommend.
            redirect_info: Optional "you can also ..." continuation text.

        Returns:
            A single space-joined response string ready for TTS.
        """
        response_parts = []
        # Main answer first.
        response_parts.append(answer)
        # Guidance for further information.
        if suggested_resources:
            response_parts.append("\nFor more detailed information, I recommend checking:")
            for resource in suggested_resources:
                response_parts.append(f"• {resource}")
        # Redirection information.
        if redirect_info:
            response_parts.append(f"\nYou can also {redirect_info}")
        # Helpful voice interaction closer.
        response_parts.append("\nIs there anything specific you'd like me to explain further? Just ask!")
        return " ".join(response_parts)

    def generate_redirect_suggestions(self, topic: str, query_type: str) -> Dict[str, Any]:
        """
        Generate contextual redirect suggestions based on the topic.

        Args:
            topic: Topic text matched against known government policy areas.
            query_type: Reserved for future use; currently unused.

        Returns:
            Dict with 'documents', 'websites', 'departments' and 'redirect_text'.
        """
        suggestions = {
            "documents": [],
            "websites": [],
            "departments": [],
            "redirect_text": ""
        }
        # Hoist the lowercased topic out of the branch chain.
        topic_lower = topic.lower()
        # Government policy topics.
        if "digital india" in topic_lower:
            suggestions["documents"] = [
                "Digital India Policy Framework 2023",
                "E-Governance Implementation Guidelines"
            ]
            suggestions["websites"] = ["digitalindia.gov.in", "meity.gov.in"]
            suggestions["departments"] = ["Ministry of Electronics & IT"]
            suggestions["redirect_text"] = "visit the official Digital India portal or contact your local e-governance center"
        elif "education" in topic_lower:
            suggestions["documents"] = [
                "National Education Policy 2020",
                "Sarva Shiksha Abhiyan Guidelines"
            ]
            suggestions["websites"] = ["education.gov.in", "mhrd.gov.in"]
            suggestions["departments"] = ["Ministry of Education"]
            suggestions["redirect_text"] = "contact your District Education Officer or visit the nearest education department office"
        elif "health" in topic_lower:
            suggestions["documents"] = [
                "National Health Policy 2017",
                "Ayushman Bharat Implementation Guide"
            ]
            suggestions["websites"] = ["mohfw.gov.in", "pmjay.gov.in"]
            suggestions["departments"] = ["Ministry of Health & Family Welfare"]
            suggestions["redirect_text"] = "visit your nearest Primary Health Center or call the health helpline"
        elif "employment" in topic_lower or "job" in topic_lower:
            suggestions["documents"] = [
                "Employment Generation Schemes",
                "Skill Development Programs Guide"
            ]
            suggestions["websites"] = ["nrega.nic.in", "msde.gov.in"]
            suggestions["departments"] = ["Ministry of Rural Development", "Ministry of Skill Development"]
            suggestions["redirect_text"] = "visit your local employment exchange or skill development center"
        # Default for topics not matched above.
        if not suggestions["redirect_text"]:
            suggestions["redirect_text"] = "contact the relevant government department or visit your local district collector's office"
        return suggestions

    def is_voice_enabled(self) -> bool:
        """Check if voice features are enabled."""
        return self.voice_enabled

    def get_voice_status(self) -> Dict[str, Any]:
        """Return the current voice service status as a plain dict."""
        return {
            "voice_enabled": self.voice_enabled,
            # getattr keeps this safe even on partially-constructed instances.
            "tts_available": getattr(self, 'tts_available', False),
            "asr_available": getattr(self, 'asr_available', False),
            "tts_provider": self.tts_provider,
            "asr_provider": self.asr_provider,
            "language": self.language,
            "voice_speed": self.voice_speed
        }
# Global instance
# Module-level singleton, constructed at import time from config values;
# importing this module therefore triggers TTS/ASR initialization when enabled.
voice_service = VoiceService()
|