ChAbhishek28 committed
Commit 8a78e3e · 1 Parent(s): 102aa44

🎀 Enhance voice bot with multi-language support and fix MediaRecorder issues


- Fix WebSocket message handling to properly process voice messages with type field
- Add comprehensive language support (English, Hindi, Hinglish) for ASR and TTS
- Enhance error handling for voice processing
- Fix KeyError: 'text' by improving message validation
- Add language parameter support in voice_service.py speech_to_text method
- Improve audio data processing with base64 encoding support
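
For reference, the updated handler reads a base64 `audio_data` field plus an optional `lang`/`language` hint from each incoming voice message. A minimal client-side payload sketch follows; the `"type": "voice"` value, file name, and surrounding transport code are illustrative assumptions, and only the `audio_data` and language fields are taken from this diff:

import base64
import json

# Hypothetical payload shape; the actual "type" value and WebSocket client code live outside this diff.
with open("question.wav", "rb") as f:
    audio_b64 = base64.b64encode(f.read()).decode()

voice_message = json.dumps({
    "type": "voice",          # assumed routing value; the router dispatches on the message type field
    "audio_data": audio_b64,  # base64-encoded audio, decoded by handle_voice_message
    "language": "hinglish",   # or "english" / "hindi"; falls back to session preferences, then "english"
})
# The server replies with a "transcription" message (text, language, confidence) and, when the
# response mode is "voice" or "both", an "audio_response" message carrying base64-encoded MP3.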

Files changed (1)
  1. enhanced_websocket_handler.py +56 -16
enhanced_websocket_handler.py CHANGED
@@ -265,7 +265,7 @@ async def handle_text_message(websocket: WebSocket, data: dict, session_data: dict
 
 async def handle_voice_message(websocket: WebSocket, data: dict, session_data: dict,
                                use_hybrid: bool, config: dict, knowledge_base: str, graph=None):
-    """Handle voice message with ASR and TTS"""
+    """Handle voice message with enhanced multi-language ASR and TTS"""
 
     if not voice_service.is_voice_enabled():
         await websocket.send_json({
@@ -275,16 +275,31 @@ async def handle_voice_message(websocket: WebSocket, data: dict, session_data: dict
         return
 
     try:
-        # Decode audio data
-        audio_data = base64.b64decode(data["audio_data"])
+        # Get audio data - handle both old and new format
+        if "audio_data" in data:
+            audio_data = base64.b64decode(data["audio_data"])
+        else:
+            # Handle old format or direct binary data
+            logger.error("❌ No audio_data field found in voice message")
+            await websocket.send_json({
+                "type": "error",
+                "message": "No audio data provided"
+            })
+            return
+
+        # Extract user language preference
+        user_language = data.get("lang") or data.get("language") or session_data.get("language") or session_data["user_preferences"].get("language") or "english"
+        logger.info(f"🌍 Processing voice with language: {user_language}")
 
         # Save to temporary file
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
             temp_file.write(audio_data)
             temp_file_path = temp_file.name
 
-        # Convert speech to text
-        transcribed_text = await voice_service.speech_to_text(temp_file_path)
+        logger.info(f"🎀 Processing audio with language preference: {user_language}")
+
+        # Convert speech to text with language support
+        transcribed_text = await voice_service.speech_to_text(temp_file_path, user_language)
 
         # Clean up temp file
         Path(temp_file_path).unlink()
@@ -292,25 +307,36 @@ async def handle_voice_message(websocket: WebSocket, data: dict, session_data: dict
         if not transcribed_text:
             await websocket.send_json({
                 "type": "error",
-                "message": "Could not transcribe audio"
+                "message": "Could not transcribe audio. Please try speaking clearly or check your microphone."
             })
             return
 
-        logger.info(f"🎀 Transcribed: {transcribed_text}")
+        logger.info(f"🎀 Transcribed ({user_language}): {transcribed_text}")
 
-        # Send transcription
+        # Send transcription with detected language info
        await websocket.send_json({
             "type": "transcription",
-            "text": transcribed_text
+            "text": transcribed_text,
+            "language": user_language or "auto-detected",
+            "confidence": "high"  # Could be dynamic based on Whisper confidence
         })
 
-        # Process as text message
+        # Add language context to the prompt for better responses
+        language_context = ""
+        if user_language and user_language.lower() in ['hindi', 'hi', 'hi-in']:
+            language_context = " (User is speaking in Hindi, so you may include Hindi terms where appropriate for government policies in India)"
+        elif user_language and user_language.lower() in ['hinglish']:
+            language_context = " (User is speaking in Hinglish - Hindi-English mix, so feel free to use both languages in your response)"
+
+        enhanced_message = transcribed_text + language_context
+
+        # Process as text message with language context
         if use_hybrid:
             response_text, provider_used = await get_hybrid_response(
-                transcribed_text, session_data["context"], config, knowledge_base
+                enhanced_message, session_data["context"], config, knowledge_base
             )
         else:
-            session_data["messages"].append(HumanMessage(content=transcribed_text))
+            session_data["messages"].append(HumanMessage(content=enhanced_message))
             result = await graph.ainvoke({"messages": session_data["messages"]}, config)
             response_text = result["messages"][-1].content
             provider_used = "traditional"
@@ -320,29 +346,43 @@ async def handle_voice_message(websocket: WebSocket, data: dict, session_data: dict
 
         # Send voice response if enabled
         if session_data["user_preferences"]["response_mode"] in ["voice", "both"]:
+            # Choose appropriate voice based on user's language
+            voice_preference = session_data["user_preferences"]["preferred_voice"]
+            if not voice_preference and user_language:
+                if user_language.lower() in ['hindi', 'hi', 'hi-in']:
+                    voice_preference = "hi-IN-SwaraNeural"  # Hindi female voice
+                elif user_language.lower() in ['english', 'en', 'en-in']:
+                    voice_preference = "en-IN-NeerjaNeural"  # Indian English female voice
+                else:
+                    voice_preference = "en-US-AriaNeural"  # Default English
+
             voice_text = voice_service.create_voice_response_with_guidance(
                 response_text,
-                suggested_resources=["Government portal", "Local offices"],
+                suggested_resources=["Government portal", "Local offices", "Helpline numbers"],
                 redirect_info="contact your local government office for personalized assistance"
             )
 
             audio_response = await voice_service.text_to_speech(
                 voice_text,
-                session_data["user_preferences"]["preferred_voice"]
+                voice_preference
             )
 
             if audio_response:
                 await websocket.send_json({
                     "type": "audio_response",
                     "audio_data": base64.b64encode(audio_response).decode(),
-                    "format": "mp3"
+                    "format": "mp3",
+                    "voice_used": voice_preference,
+                    "language": user_language or "en"
                 })
+            else:
+                logger.warning("⚠️ Could not generate audio response")
 
     except Exception as e:
         logger.error(f"❌ Error processing voice message: {e}")
         await websocket.send_json({
             "type": "error",
-            "message": f"Error processing voice message: {str(e)}"
+            "message": f"Error processing voice message: {str(e)}. Please try again or switch to text mode."
         })
 
 async def get_hybrid_response(user_message: str, context: str, config: dict, knowledge_base: str):
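
voice_service.py itself is not included in this file's diff, so the new language-aware speech_to_text(path, language) is only visible from its call site above. As a hedged sketch, assuming a Whisper-based backend (the "Whisper confidence" comment in the diff hints at one), the language hint might be threaded through roughly like this; the model size, mapping table, and helper names are illustrative assumptions rather than the repository's actual code:

import asyncio
import whisper

# Map the user-facing preference ("english", "hindi", "hinglish") to a Whisper language hint.
# Hinglish is code-switched speech, so it maps to None and lets Whisper auto-detect.
LANGUAGE_HINTS = {
    "english": "en", "en": "en", "en-in": "en",
    "hindi": "hi", "hi": "hi", "hi-in": "hi",
    "hinglish": None,
}

_model = whisper.load_model("base")  # placeholder model size

async def speech_to_text(audio_path: str, language: str = "english") -> str:
    """Transcribe an audio file, passing a language hint to Whisper when one is known."""
    hint = LANGUAGE_HINTS.get((language or "").lower())
    # Run the blocking transcription off the event loop; language=None triggers auto-detection.
    result = await asyncio.to_thread(_model.transcribe, audio_path, language=hint)
    return result["text"].strip()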