Commit 8a78e3e
Parent(s): 102aa44
🎤 Enhance voice bot with multi-language support and fix MediaRecorder issues
- Fix WebSocket message handling to properly process voice messages with type field
- Add comprehensive language support (English, Hindi, Hinglish) for ASR and TTS
- Enhance error handling for voice processing
- Fix KeyError: 'text' by improving message validation
- Add language parameter support in voice_service.py speech_to_text method
- Improve audio data processing with base64 encoding support (see the example payload below)
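
As a reference for the new message format, here is a minimal sketch of the client-side payload the updated handler expects. The base64 audio_data and lang fields follow the new handler code; the "voice" type value used for dispatch is not shown in this diff and is an assumption.

# Hypothetical client-side payload for the new voice handler; "type": "voice" is an
# assumption (the dispatch value is outside this diff), audio_data/lang match the handler.
import base64
import json

with open("question.wav", "rb") as f:
    voice_message = json.dumps({
        "type": "voice",                                    # assumed message type
        "audio_data": base64.b64encode(f.read()).decode(),  # base64-encoded WAV bytes
        "lang": "hinglish",                                 # or "english" / "hindi"
    })
# voice_message can then be sent over the open WebSocket connection.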
Files changed: enhanced_websocket_handler.py (+56, -16)

enhanced_websocket_handler.py (CHANGED)
@@ -265,7 +265,7 @@ async def handle_text_message(websocket: WebSocket, data: dict, session_data: di
 
 async def handle_voice_message(websocket: WebSocket, data: dict, session_data: dict,
                                 use_hybrid: bool, config: dict, knowledge_base: str, graph=None):
-    """Handle voice message with ASR and TTS"""
+    """Handle voice message with enhanced multi-language ASR and TTS"""
 
     if not voice_service.is_voice_enabled():
         await websocket.send_json({
@@ -275,16 +275,31 @@ async def handle_voice_message(websocket: WebSocket, data: dict, session_data: d
         return
 
     try:
-        #
-        audio_data
+        # Get audio data - handle both old and new format
+        if "audio_data" in data:
+            audio_data = base64.b64decode(data["audio_data"])
+        else:
+            # Handle old format or direct binary data
+            logger.error("❌ No audio_data field found in voice message")
+            await websocket.send_json({
+                "type": "error",
+                "message": "No audio data provided"
+            })
+            return
+
+        # Extract user language preference
+        user_language = data.get("lang") or data.get("language") or session_data.get("language") or session_data["user_preferences"].get("language") or "english"
+        logger.info(f"🌐 Processing voice with language: {user_language}")
 
         # Save to temporary file
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
             temp_file.write(audio_data)
             temp_file_path = temp_file.name
 
-
-
+        logger.info(f"🎤 Processing audio with language preference: {user_language}")
+
+        # Convert speech to text with language support
+        transcribed_text = await voice_service.speech_to_text(temp_file_path, user_language)
 
         # Clean up temp file
         Path(temp_file_path).unlink()
@@ -292,25 +307,36 @@ async def handle_voice_message(websocket: WebSocket, data: dict, session_data: d
         if not transcribed_text:
             await websocket.send_json({
                 "type": "error",
-                "message": "Could not transcribe audio"
+                "message": "Could not transcribe audio. Please try speaking clearly or check your microphone."
             })
             return
 
-        logger.info(f"🎤 Transcribed: {transcribed_text}")
+        logger.info(f"🎤 Transcribed ({user_language}): {transcribed_text}")
 
-        # Send transcription
+        # Send transcription with detected language info
         await websocket.send_json({
             "type": "transcription",
-            "text": transcribed_text
+            "text": transcribed_text,
+            "language": user_language or "auto-detected",
+            "confidence": "high"  # Could be dynamic based on Whisper confidence
         })
 
-        #
+        # Add language context to the prompt for better responses
+        language_context = ""
+        if user_language and user_language.lower() in ['hindi', 'hi', 'hi-in']:
+            language_context = " (User is speaking in Hindi, so you may include Hindi terms where appropriate for government policies in India)"
+        elif user_language and user_language.lower() in ['hinglish']:
+            language_context = " (User is speaking in Hinglish - Hindi-English mix, so feel free to use both languages in your response)"
+
+        enhanced_message = transcribed_text + language_context
+
+        # Process as text message with language context
         if use_hybrid:
             response_text, provider_used = await get_hybrid_response(
-
+                enhanced_message, session_data["context"], config, knowledge_base
             )
         else:
-            session_data["messages"].append(HumanMessage(content=
+            session_data["messages"].append(HumanMessage(content=enhanced_message))
             result = await graph.ainvoke({"messages": session_data["messages"]}, config)
             response_text = result["messages"][-1].content
             provider_used = "traditional"
@@ -320,29 +346,43 @@ async def handle_voice_message(websocket: WebSocket, data: dict, session_data: d
 
         # Send voice response if enabled
        if session_data["user_preferences"]["response_mode"] in ["voice", "both"]:
+            # Choose appropriate voice based on user's language
+            voice_preference = session_data["user_preferences"]["preferred_voice"]
+            if not voice_preference and user_language:
+                if user_language.lower() in ['hindi', 'hi', 'hi-in']:
+                    voice_preference = "hi-IN-SwaraNeural"  # Hindi female voice
+                elif user_language.lower() in ['english', 'en', 'en-in']:
+                    voice_preference = "en-IN-NeerjaNeural"  # Indian English female voice
+                else:
+                    voice_preference = "en-US-AriaNeural"  # Default English
+
             voice_text = voice_service.create_voice_response_with_guidance(
                 response_text,
-                suggested_resources=["Government portal", "Local offices"],
+                suggested_resources=["Government portal", "Local offices", "Helpline numbers"],
                 redirect_info="contact your local government office for personalized assistance"
             )
 
             audio_response = await voice_service.text_to_speech(
                 voice_text,
-
+                voice_preference
             )
 
             if audio_response:
                 await websocket.send_json({
                     "type": "audio_response",
                     "audio_data": base64.b64encode(audio_response).decode(),
-                    "format": "mp3"
+                    "format": "mp3",
+                    "voice_used": voice_preference,
+                    "language": user_language or "en"
                 })
+            else:
+                logger.warning("⚠️ Could not generate audio response")
 
     except Exception as e:
         logger.error(f"❌ Error processing voice message: {e}")
         await websocket.send_json({
             "type": "error",
-            "message": f"Error processing voice message: {str(e)}"
+            "message": f"Error processing voice message: {str(e)}. Please try again or switch to text mode."
         })
 
 async def get_hybrid_response(user_message: str, context: str, config: dict, knowledge_base: str):
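
The matching change to voice_service.py is not part of this diff, so the speech_to_text method called above is only referenced here. A minimal sketch of how the new language parameter might be passed through to Whisper (the class name, model size, and language mapping are assumptions, not the actual implementation):

# Sketch only: voice_service.py is not shown in this commit, so the names and
# defaults below are assumptions for illustration.
import asyncio
import whisper  # openai-whisper

# Map the handler's language preferences ("english", "hindi", "hinglish") to Whisper codes.
LANGUAGE_CODES = {"english": "en", "hindi": "hi", "hinglish": "hi"}

class VoiceService:
    def __init__(self, model_size: str = "base"):
        self.model = whisper.load_model(model_size)

    async def speech_to_text(self, audio_path: str, language: str = "english") -> str:
        """Transcribe a WAV file, biasing Whisper toward the caller's language."""
        lang_code = LANGUAGE_CODES.get((language or "english").lower(), "en")
        # transcribe() is blocking, so run it off the event loop.
        result = await asyncio.to_thread(self.model.transcribe, audio_path, language=lang_code)
        return result.get("text", "").strip()

With a shape like this, the handler's call await voice_service.speech_to_text(temp_file_path, user_language) works unchanged.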