ChAbhishek28 commited on
Commit
75546b0
·
1 Parent(s): fbf0654

🔧 Improve FFmpeg installation and browser ASR handling

Browse files

- Update Dockerfile to explicitly install FFmpeg system dependency
- Enhance packages.txt with additional build dependencies
- Add intelligent browser-native ASR support in WebSocket handler
- Handle transcription from browser when server ASR is unavailable
- Improve error messaging for different ASR modes

Files changed (3) hide show
  1. Dockerfile +2 -1
  2. enhanced_websocket_handler.py +31 -13
  3. packages.txt +3 -1
Dockerfile CHANGED
@@ -1,9 +1,10 @@
1
  # Use Python 3.12 as specified
2
  FROM python:3.12-slim
3
 
4
- # Install system dependencies
5
  RUN apt-get update && apt-get install -y \
6
  curl \
 
7
  && rm -rf /var/lib/apt/lists/*
8
 
9
  # Create a non-root user
 
1
  # Use Python 3.12 as specified
2
  FROM python:3.12-slim
3
 
4
+ # Install system dependencies including FFmpeg
5
  RUN apt-get update && apt-get install -y \
6
  curl \
7
+ ffmpeg \
8
  && rm -rf /var/lib/apt/lists/*
9
 
10
  # Create a non-root user
enhanced_websocket_handler.py CHANGED
@@ -296,20 +296,38 @@ async def handle_voice_message(websocket: WebSocket, data: dict, session_data: d
296
  temp_file.write(audio_data)
297
  temp_file_path = temp_file.name
298
 
299
- logger.info(f"🎀 Processing audio with language preference: {user_language}")
300
-
301
- # Convert speech to text with language support
302
- transcribed_text = await voice_service.speech_to_text(temp_file_path, user_language)
303
-
304
- # Clean up temp file
305
- Path(temp_file_path).unlink()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
306
 
307
- if not transcribed_text:
308
- await websocket.send_json({
309
- "type": "error",
310
- "message": "Could not transcribe audio. Please try speaking clearly or check your microphone."
311
- })
312
- return
313
 
314
  logger.info(f"🎀 Transcribed ({user_language}): {transcribed_text}")
315
 
 
296
  temp_file.write(audio_data)
297
  temp_file_path = temp_file.name
298
 
299
+ # Check if we should use server-side ASR or expect browser transcription
300
+ if voice_service.asr_provider == "browser-native":
301
+ # Expect transcription to come from browser, not from audio processing
302
+ logger.info("🎀 Using browser-native ASR - expecting transcription from client")
303
+
304
+ # Clean up temp file since we won't process it
305
+ Path(temp_file_path).unlink()
306
+
307
+ # Check if transcription was provided in the message
308
+ if "transcription" in data:
309
+ transcribed_text = data["transcription"]
310
+ logger.info(f"🎀 Browser transcription ({user_language}): {transcribed_text}")
311
+ else:
312
+ await websocket.send_json({
313
+ "type": "info",
314
+ "message": "Browser ASR mode - please ensure your browser supports speech recognition"
315
+ })
316
+ return
317
+ else:
318
+ # Use server-side ASR (Whisper)
319
+ logger.info(f"🎀 Processing audio with language preference: {user_language}")
320
+ transcribed_text = await voice_service.speech_to_text(temp_file_path, user_language)
321
+
322
+ # Clean up temp file
323
+ Path(temp_file_path).unlink()
324
 
325
+ if not transcribed_text:
326
+ await websocket.send_json({
327
+ "type": "error",
328
+ "message": "Could not transcribe audio. Please try speaking clearly or check your microphone."
329
+ })
330
+ return
331
 
332
  logger.info(f"🎀 Transcribed ({user_language}): {transcribed_text}")
333
 
packages.txt CHANGED
@@ -1 +1,3 @@
1
- ffmpeg
 
 
 
1
+ ffmpeg
2
+ libffi-dev
3
+ build-essential