File size: 13,824 Bytes
cf02b2b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
"""
Voice Service for optional Text-to-Speech (TTS) and Automatic Speech Recognition (ASR)
Provides voice interaction capabilities when enabled by user.
"""

import asyncio
import logging
import tempfile
import os
from typing import Optional, Dict, Any
from pathlib import Path

from config import (
    ENABLE_VOICE_FEATURES, TTS_PROVIDER, ASR_PROVIDER, 
    VOICE_LANGUAGE, DEFAULT_VOICE_SPEED
)

# Module-wide logger; the "voicebot" name groups voice-related diagnostics
# under one logger shared with the rest of the application.
logger = logging.getLogger("voicebot")

class VoiceService:
    """
    Optional Text-to-Speech (TTS) and Automatic Speech Recognition (ASR) service.

    All capabilities are gated on the ENABLE_VOICE_FEATURES config flag. When
    voice is disabled, or a provider's dependencies are missing, the public
    methods degrade gracefully (return None / empty results) instead of raising.
    """

    def __init__(self):
        # Snapshot configuration at construction time.
        self.voice_enabled = ENABLE_VOICE_FEATURES
        self.tts_provider = TTS_PROVIDER
        self.asr_provider = ASR_PROVIDER
        self.language = VOICE_LANGUAGE
        self.voice_speed = DEFAULT_VOICE_SPEED

        # Default both capabilities to unavailable so the attributes always
        # exist. Previously they were only assigned inside the _init_* helpers,
        # whose except clauses caught ImportError only — a non-ImportError
        # failure (e.g. whisper.load_model raising at download time) would have
        # left them undefined and broken later attribute access.
        self.tts_available = False
        self.asr_available = False

        # Initialize services if voice is enabled
        if self.voice_enabled:
            self._init_tts_service()
            self._init_asr_service()
            logger.info(f"🎀 Voice Service initialized - TTS: {self.tts_provider}, ASR: {self.asr_provider}")
        else:
            logger.info("πŸ”‡ Voice features disabled")

    def _init_tts_service(self):
        """Initialize the Text-to-Speech backend and set self.tts_available."""
        try:
            if self.tts_provider == "edge-tts":
                # Import check only; the module is re-imported lazily in
                # text_to_speech() when actually synthesizing.
                import edge_tts
                self.tts_available = True
                logger.info("βœ… Edge TTS initialized")
            elif self.tts_provider == "openai-tts":
                # OpenAI TTS would require OpenAI API key
                self.tts_available = False
                logger.info("⚠️ OpenAI TTS not configured")
            else:
                self.tts_available = False
                logger.warning(f"⚠️ Unknown TTS provider: {self.tts_provider}")
        except ImportError as e:
            self.tts_available = False
            logger.warning(f"⚠️ TTS dependencies not available: {e}")

    def _init_asr_service(self):
        """Initialize the Automatic Speech Recognition backend and set self.asr_available."""
        try:
            if self.asr_provider == "whisper":
                import whisper
                # Use base model for balance between speed and accuracy
                self.whisper_model = whisper.load_model("base")
                self.asr_available = True
                logger.info("βœ… Whisper ASR initialized (base model for accuracy)")
            elif self.asr_provider == "browser-native":
                # Browser-based ASR doesn't require server-side setup
                self.asr_available = True
                logger.info("βœ… Browser ASR configured")
            else:
                self.asr_available = False
                logger.warning(f"⚠️ Unknown ASR provider: {self.asr_provider}")
        except ImportError as e:
            self.asr_available = False
            logger.warning(f"⚠️ ASR dependencies not available: {e}")

    def _get_language_code(self, user_language: Optional[str] = None) -> str:
        """
        Convert a user language preference to a Whisper language code.

        Args:
            user_language: User's language preference ('english', 'hindi',
                'hi-IN', etc.). Falls back to the configured default when None.

        Returns:
            Two-letter language code for Whisper (e.g., 'en', 'hi'); 'en' when
            the input is unrecognized.
        """
        if not user_language:
            # Fallback to default config language (e.g. 'hi-IN' -> 'hi').
            return self.language.split('-')[0] if self.language else 'en'

        user_lang_lower = user_language.lower()

        # Map common language names and codes to Whisper codes.
        language_mapping = {
            'english': 'en',
            'hindi': 'hi',
            'hinglish': 'hi',  # Treat Hinglish as Hindi for better results
            'en': 'en',
            'hi': 'hi',
            'en-in': 'en',
            'hi-in': 'hi',
            'en-us': 'en'
        }

        # Extract base language if it's a locale code (e.g., 'hi-IN' -> 'hi')
        if '-' in user_lang_lower:
            base_lang = user_lang_lower.split('-')[0]
            return language_mapping.get(base_lang, 'en')

        return language_mapping.get(user_lang_lower, 'en')

    def _get_default_voice(self) -> str:
        """Return the default Edge TTS voice name for the configured language."""
        language_voices = {
            'hi-IN': 'hi-IN-SwaraNeural',  # Hindi (India) female voice
            'en-IN': 'en-IN-NeerjaNeural',  # English (India) female voice
            'en-US': 'en-US-AriaNeural',   # English (US) female voice
            'es-ES': 'es-ES-ElviraNeural', # Spanish (Spain) female voice
            'fr-FR': 'fr-FR-DeniseNeural', # French (France) female voice
            'de-DE': 'de-DE-KatjaNeural',  # German (Germany) female voice
            'ja-JP': 'ja-JP-NanamiNeural', # Japanese female voice
            'ko-KR': 'ko-KR-SunHiNeural',  # Korean female voice
            'zh-CN': 'zh-CN-XiaoxiaoNeural' # Chinese (Simplified) female voice
        }
        return language_voices.get(self.language, 'en-US-AriaNeural')

    async def text_to_speech(self, text: str, voice: str = None) -> Optional[bytes]:
        """
        Convert text to speech audio.

        Args:
            text: Text to synthesize.
            voice: Edge TTS voice name; defaults to the language-appropriate
                voice from _get_default_voice().

        Returns:
            Audio bytes, or None if TTS is disabled/unavailable or fails.
        """
        if not self.voice_enabled or not self.tts_available:
            return None

        # Use default voice for the configured language if no voice specified
        if voice is None:
            voice = self._get_default_voice()

        try:
            if self.tts_provider == "edge-tts":
                import edge_tts

                # Edge TTS expects rate as a signed percent delta, e.g. speed
                # 1.25 -> "+25%".
                communicate = edge_tts.Communicate(text, voice, rate=f"{int((self.voice_speed - 1) * 100):+d}%")

                # Collect the streamed audio chunks into one buffer.
                audio_data = b""
                async for chunk in communicate.stream():
                    if chunk["type"] == "audio":
                        audio_data += chunk["data"]

                return audio_data

            # Unsupported/unconfigured provider: previously fell off the end of
            # the try block; make the None result explicit.
            return None

        except Exception as e:
            logger.error(f"❌ TTS Error: {e}")
            return None

    async def speech_to_text(self, audio_file_path: str, user_language: str = None) -> Optional[str]:
        """
        Convert speech audio to text.

        Args:
            audio_file_path: Path to the audio file.
            user_language: User's preferred language (e.g., 'english', 'hindi',
                'hi-IN'); non-English languages are forced, English/unknown let
                Whisper auto-detect.

        Returns:
            Transcribed text, or None if ASR is disabled/unavailable or fails.
        """
        if not self.voice_enabled or not self.asr_available:
            return None

        try:
            if self.asr_provider == "whisper":
                # Determine language code based on user preference or default
                language_code = self._get_language_code(user_language)

                logger.info(f"🎀 Using Whisper with language: {language_code} (user_pref: {user_language})")

                # Use enhanced transcription options for better accuracy
                transcribe_options = {
                    "fp16": False,  # Use FP32 for better accuracy on CPU
                    "temperature": 0.0,  # Deterministic output
                    "best_of": 1,  # Single candidate
                    "beam_size": 5,  # Better beam search
                    "patience": 1.0,  # Wait for better results
                }

                force_language = bool(language_code) and language_code != 'en'
                if force_language:
                    transcribe_options["language"] = language_code

                # Single transcribe call (previously duplicated in both
                # branches); the per-branch log messages are preserved.
                result = self.whisper_model.transcribe(audio_file_path, **transcribe_options)
                if force_language:
                    logger.info(f"🎀 {language_code.upper()} transcription result: {result.get('text', '')}")
                else:
                    logger.info(f"🎀 English transcription result: {result.get('text', '')}")

                transcribed_text = result["text"].strip()

                # Log a rough quality metric derived from Whisper's
                # per-segment no_speech_prob (1 - mean no_speech_prob); this is
                # not a true confidence score, logging only.
                if "segments" in result and result["segments"]:
                    avg_confidence = sum(seg.get("no_speech_prob", 0) for seg in result["segments"]) / len(result["segments"])
                    logger.info(f"🎀 Average confidence: {1-avg_confidence:.2f}")

                return transcribed_text

            # Non-whisper providers (e.g. browser-native) have no server-side
            # transcription path; preserve the implicit None result explicitly.
            return None

        except Exception as e:
            logger.error(f"❌ ASR Error: {e}")
            return None

    def get_available_voices(self) -> Dict[str, Any]:
        """Return a static catalogue of common Edge TTS voices, or {} when unavailable."""
        if not self.voice_enabled or self.tts_provider != "edge-tts":
            return {}

        # Common Edge TTS voices
        voices = {
            "english": {
                "female": ["en-US-AriaNeural", "en-US-JennyNeural", "en-GB-SoniaNeural"],
                "male": ["en-US-GuyNeural", "en-US-DavisNeural", "en-GB-RyanNeural"]
            },
            "multilingual": {
                "spanish": ["es-ES-ElviraNeural", "es-MX-DaliaNeural"],
                "french": ["fr-FR-DeniseNeural", "fr-CA-SylvieNeural"],
                "german": ["de-DE-KatjaNeural", "de-AT-IngridNeural"],
                "italian": ["it-IT-ElsaNeural", "it-IT-IsabellaNeural"],
                "hindi": ["hi-IN-SwaraNeural", "hi-IN-MadhurNeural"]
            }
        }
        return voices

    def create_voice_response_with_guidance(self,
                                          answer: str,
                                          suggested_resources: list = None,
                                          redirect_info: str = None) -> str:
        """
        Create a comprehensive voice response with guidance and redirection.

        Args:
            answer: The main answer text.
            suggested_resources: Optional list of resource names to recommend.
            redirect_info: Optional "you can also ..." continuation text.

        Returns:
            The assembled response string (parts joined with spaces; some
            parts embed leading newlines for readability).
        """
        response_parts = []

        # Main answer
        response_parts.append(answer)

        # Add guidance for further information
        if suggested_resources:
            response_parts.append("\nFor more detailed information, I recommend checking:")
            for resource in suggested_resources:
                response_parts.append(f"β€’ {resource}")

        # Add redirection information
        if redirect_info:
            response_parts.append(f"\nYou can also {redirect_info}")

        # Add helpful voice interaction tips
        response_parts.append("\nIs there anything specific you'd like me to explain further? Just ask!")

        return " ".join(response_parts)

    def generate_redirect_suggestions(self, topic: str, query_type: str) -> Dict[str, Any]:
        """
        Generate contextual redirect suggestions based on the topic.

        Args:
            topic: Free-text topic; matched case-insensitively by substring.
            query_type: Accepted for interface compatibility but currently
                unused by the matching logic.

        Returns:
            Dict with 'documents', 'websites', 'departments' lists and a
            'redirect_text' string (always non-empty via the default).
        """
        suggestions = {
            "documents": [],
            "websites": [],
            "departments": [],
            "redirect_text": ""
        }

        topic_lower = topic.lower()

        # Government policy topics
        if "digital india" in topic_lower:
            suggestions["documents"] = [
                "Digital India Policy Framework 2023",
                "E-Governance Implementation Guidelines"
            ]
            suggestions["websites"] = ["digitalindia.gov.in", "meity.gov.in"]
            suggestions["departments"] = ["Ministry of Electronics & IT"]
            suggestions["redirect_text"] = "visit the official Digital India portal or contact your local e-governance center"

        elif "education" in topic_lower:
            suggestions["documents"] = [
                "National Education Policy 2020",
                "Sarva Shiksha Abhiyan Guidelines"
            ]
            suggestions["websites"] = ["education.gov.in", "mhrd.gov.in"]
            suggestions["departments"] = ["Ministry of Education"]
            suggestions["redirect_text"] = "contact your District Education Officer or visit the nearest education department office"

        elif "health" in topic_lower:
            suggestions["documents"] = [
                "National Health Policy 2017",
                "Ayushman Bharat Implementation Guide"
            ]
            suggestions["websites"] = ["mohfw.gov.in", "pmjay.gov.in"]
            suggestions["departments"] = ["Ministry of Health & Family Welfare"]
            suggestions["redirect_text"] = "visit your nearest Primary Health Center or call the health helpline"

        elif "employment" in topic_lower or "job" in topic_lower:
            suggestions["documents"] = [
                "Employment Generation Schemes",
                "Skill Development Programs Guide"
            ]
            suggestions["websites"] = ["nrega.nic.in", "msde.gov.in"]
            suggestions["departments"] = ["Ministry of Rural Development", "Ministry of Skill Development"]
            suggestions["redirect_text"] = "visit your local employment exchange or skill development center"

        # Default for other topics
        if not suggestions["redirect_text"]:
            suggestions["redirect_text"] = "contact the relevant government department or visit your local district collector's office"

        return suggestions

    def is_voice_enabled(self) -> bool:
        """Check if voice features are enabled."""
        return self.voice_enabled

    def get_voice_status(self) -> Dict[str, Any]:
        """Get current voice service status as a plain dict (for status endpoints)."""
        return {
            "voice_enabled": self.voice_enabled,
            # getattr defaults kept for robustness with partially-built instances.
            "tts_available": getattr(self, 'tts_available', False),
            "asr_available": getattr(self, 'asr_available', False),
            "tts_provider": self.tts_provider,
            "asr_provider": self.asr_provider,
            "language": self.language,
            "voice_speed": self.voice_speed
        }

# Module-level singleton: constructed at import time so every consumer shares
# one VoiceService (and, when ASR is enabled, the Whisper model loads once).
voice_service = VoiceService()