Commit 8a78e3e
Parent(s): 102aa44
🎤 Enhance voice bot with multi-language support and fix MediaRecorder issues
- Fix WebSocket message handling to properly process voice messages with type field
- Add comprehensive language support (English, Hindi, Hinglish) for ASR and TTS
- Enhance error handling for voice processing
- Fix KeyError: 'text' by improving message validation
- Add language parameter support in voice_service.py speech_to_text method
- Improve audio data processing with base64 encoding support (see the example payload below)
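
As a reference for the new message format, here is a minimal sketch of the client-side payload the updated handler expects. The base64 audio_data and lang fields follow the new handler code; the "voice" type value used for dispatch is not shown in this diff and is an assumption.

# Hypothetical client-side payload for the new voice handler; "type": "voice" is an
# assumption (the dispatch value is outside this diff), audio_data/lang match the handler.
import base64
import json

with open("question.wav", "rb") as f:
    voice_message = json.dumps({
        "type": "voice",                                    # assumed message type
        "audio_data": base64.b64encode(f.read()).decode(),  # base64-encoded WAV bytes
        "lang": "hinglish",                                 # or "english" / "hindi"
    })
# voice_message can then be sent over the open WebSocket connection.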
Files changed: enhanced_websocket_handler.py (+56, -16)

enhanced_websocket_handler.py (CHANGED)
@@ -265,7 +265,7 @@ async def handle_text_message(websocket: WebSocket, data: dict, session_data: di
 
 async def handle_voice_message(websocket: WebSocket, data: dict, session_data: dict,
                                 use_hybrid: bool, config: dict, knowledge_base: str, graph=None):
-    """Handle voice message with ASR and TTS"""
+    """Handle voice message with enhanced multi-language ASR and TTS"""
 
     if not voice_service.is_voice_enabled():
         await websocket.send_json({
@@ -275,16 +275,31 @@ async def handle_voice_message(websocket: WebSocket, data: dict, session_data: d
         return
 
     try:
-        #
-        audio_data
+        # Get audio data - handle both old and new format
+        if "audio_data" in data:
+            audio_data = base64.b64decode(data["audio_data"])
+        else:
+            # Handle old format or direct binary data
+            logger.error("❌ No audio_data field found in voice message")
+            await websocket.send_json({
+                "type": "error",
+                "message": "No audio data provided"
+            })
+            return
+
+        # Extract user language preference
+        user_language = data.get("lang") or data.get("language") or session_data.get("language") or session_data["user_preferences"].get("language") or "english"
+        logger.info(f"🌐 Processing voice with language: {user_language}")
 
         # Save to temporary file
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
             temp_file.write(audio_data)
             temp_file_path = temp_file.name
 
-
-
+        logger.info(f"🎤 Processing audio with language preference: {user_language}")
+
+        # Convert speech to text with language support
+        transcribed_text = await voice_service.speech_to_text(temp_file_path, user_language)
 
         # Clean up temp file
         Path(temp_file_path).unlink()
@@ -292,25 +307,36 @@ async def handle_voice_message(websocket: WebSocket, data: dict, session_data: d
         if not transcribed_text:
             await websocket.send_json({
                 "type": "error",
-                "message": "Could not transcribe audio"
+                "message": "Could not transcribe audio. Please try speaking clearly or check your microphone."
             })
             return
 
-        logger.info(f"🎤 Transcribed: {transcribed_text}")
+        logger.info(f"🎤 Transcribed ({user_language}): {transcribed_text}")
 
-        # Send transcription
+        # Send transcription with detected language info
         await websocket.send_json({
             "type": "transcription",
-            "text": transcribed_text
+            "text": transcribed_text,
+            "language": user_language or "auto-detected",
+            "confidence": "high"  # Could be dynamic based on Whisper confidence
         })
 
-        #
+        # Add language context to the prompt for better responses
+        language_context = ""
+        if user_language and user_language.lower() in ['hindi', 'hi', 'hi-in']:
+            language_context = " (User is speaking in Hindi, so you may include Hindi terms where appropriate for government policies in India)"
+        elif user_language and user_language.lower() in ['hinglish']:
+            language_context = " (User is speaking in Hinglish - Hindi-English mix, so feel free to use both languages in your response)"
+
+        enhanced_message = transcribed_text + language_context
+
+        # Process as text message with language context
         if use_hybrid:
             response_text, provider_used = await get_hybrid_response(
-
+                enhanced_message, session_data["context"], config, knowledge_base
             )
         else:
-            session_data["messages"].append(HumanMessage(content=
+            session_data["messages"].append(HumanMessage(content=enhanced_message))
             result = await graph.ainvoke({"messages": session_data["messages"]}, config)
             response_text = result["messages"][-1].content
             provider_used = "traditional"
@@ -320,29 +346,43 @@ async def handle_voice_message(websocket: WebSocket, data: dict, session_data: d
 
         # Send voice response if enabled
        if session_data["user_preferences"]["response_mode"] in ["voice", "both"]:
+            # Choose appropriate voice based on user's language
+            voice_preference = session_data["user_preferences"]["preferred_voice"]
+            if not voice_preference and user_language:
+                if user_language.lower() in ['hindi', 'hi', 'hi-in']:
+                    voice_preference = "hi-IN-SwaraNeural"  # Hindi female voice
+                elif user_language.lower() in ['english', 'en', 'en-in']:
+                    voice_preference = "en-IN-NeerjaNeural"  # Indian English female voice
+                else:
+                    voice_preference = "en-US-AriaNeural"  # Default English
+
             voice_text = voice_service.create_voice_response_with_guidance(
                 response_text,
-                suggested_resources=["Government portal", "Local offices"],
+                suggested_resources=["Government portal", "Local offices", "Helpline numbers"],
                 redirect_info="contact your local government office for personalized assistance"
             )
 
             audio_response = await voice_service.text_to_speech(
                 voice_text,
-
+                voice_preference
             )
 
             if audio_response:
                 await websocket.send_json({
                     "type": "audio_response",
                     "audio_data": base64.b64encode(audio_response).decode(),
-                    "format": "mp3"
+                    "format": "mp3",
+                    "voice_used": voice_preference,
+                    "language": user_language or "en"
                 })
+            else:
+                logger.warning("⚠️ Could not generate audio response")
 
     except Exception as e:
         logger.error(f"❌ Error processing voice message: {e}")
         await websocket.send_json({
             "type": "error",
-            "message": f"Error processing voice message: {str(e)}"
+            "message": f"Error processing voice message: {str(e)}. Please try again or switch to text mode."
         })
 
 async def get_hybrid_response(user_message: str, context: str, config: dict, knowledge_base: str):
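
The matching change to voice_service.py is not part of this diff, so the speech_to_text method called above is only referenced here. A minimal sketch of how the new language parameter might be passed through to Whisper (the class name, model size, and language mapping are assumptions, not the actual implementation):

# Sketch only: voice_service.py is not shown in this commit, so the names and
# defaults below are assumptions for illustration.
import asyncio
import whisper  # openai-whisper

# Map the handler's language preferences ("english", "hindi", "hinglish") to Whisper codes.
LANGUAGE_CODES = {"english": "en", "hindi": "hi", "hinglish": "hi"}

class VoiceService:
    def __init__(self, model_size: str = "base"):
        self.model = whisper.load_model(model_size)

    async def speech_to_text(self, audio_path: str, language: str = "english") -> str:
        """Transcribe a WAV file, biasing Whisper toward the caller's language."""
        lang_code = LANGUAGE_CODES.get((language or "english").lower(), "en")
        # transcribe() is blocking, so run it off the event loop.
        result = await asyncio.to_thread(self.model.transcribe, audio_path, language=lang_code)
        return result.get("text", "").strip()

With a shape like this, the handler's call await voice_service.speech_to_text(temp_file_path, user_language) works unchanged.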