"""
Enhanced Voice Service with Groq ASR for superior transcription accuracy.

Based on a proven implementation that achieves much better transcription
quality than local Whisper.
"""
import asyncio
import logging
import tempfile
import os
import aiohttp
import base64
from typing import Optional, Dict, Any
from pathlib import Path

from groq import Groq

from config import (
    ENABLE_VOICE_FEATURES,
    TTS_PROVIDER,
    ASR_PROVIDER,
    VOICE_LANGUAGE,
    DEFAULT_VOICE_SPEED,
    GROQ_API_KEY,
)

logger = logging.getLogger("voicebot")


class GroqVoiceService:
    """Voice service pairing a configurable TTS backend (edge-tts or Murf)
    with Groq's hosted whisper-large-v3 model for speech recognition.

    The ASR provider is forced to Groq regardless of ``ASR_PROVIDER`` because
    it produced markedly better transcriptions in practice.
    """

    def __init__(self):
        self.voice_enabled = ENABLE_VOICE_FEATURES
        self.tts_provider = TTS_PROVIDER
        self.asr_provider = "groq"  # Force Groq ASR for better accuracy
        self.language = VOICE_LANGUAGE
        self.voice_speed = DEFAULT_VOICE_SPEED

        # Initialize Groq client (required for ASR; TTS works without it)
        if GROQ_API_KEY:
            self.groq_client = Groq(api_key=GROQ_API_KEY)
            logger.info("✅ Groq ASR client initialized")
        else:
            logger.error("❌ GROQ_API_KEY not found - ASR will not work")
            self.groq_client = None

        # Initialize services if voice is enabled
        if self.voice_enabled:
            self._init_tts_service()
            self._init_asr_service()
            logger.info(f"🎤 Enhanced Voice Service initialized - TTS: {self.tts_provider}, ASR: Groq")
        else:
            logger.info("🔇 Voice features disabled")

    def _init_tts_service(self):
        """Initialize Text-to-Speech service.

        Sets ``self.tts_available`` based on whether the configured provider's
        dependencies can be imported.
        """
        try:
            if self.tts_provider == "edge-tts":
                import edge_tts  # noqa: F401 — availability probe only
                self.tts_available = True
                logger.info("✅ Edge TTS initialized")
            elif self.tts_provider == "murf":
                # Murf is a plain HTTP API — no local dependency to probe.
                self.tts_available = True
                logger.info("✅ Murf AI TTS initialized")
            else:
                self.tts_available = False
                logger.warning(f"⚠️ Unknown TTS provider: {self.tts_provider}")
        except ImportError as e:
            self.tts_available = False
            logger.warning(f"⚠️ TTS dependencies not available: {e}")

    def _init_asr_service(self):
        """Initialize Groq ASR service (requires a configured Groq client)."""
        if self.groq_client:
            self.asr_available = True
            logger.info("✅ Groq ASR initialized - superior transcription quality")
        else:
            self.asr_available = False
            logger.error("❌ Groq ASR not available - API key missing")

    def _get_default_voice(self) -> str:
        """Return the default Edge TTS voice for the configured language.

        Falls back to 'en-US-AriaNeural' for unknown locales.
        """
        language_voices = {
            'hi-IN': 'hi-IN-SwaraNeural',    # Hindi (India) female voice
            'en-IN': 'en-IN-NeerjaNeural',   # English (India) female voice
            'en-US': 'en-US-AriaNeural',     # English (US) female voice
            'es-ES': 'es-ES-ElviraNeural',   # Spanish (Spain) female voice
            'fr-FR': 'fr-FR-DeniseNeural',   # French (France) female voice
            'de-DE': 'de-DE-KatjaNeural',    # German (Germany) female voice
            'ja-JP': 'ja-JP-NanamiNeural',   # Japanese female voice
            'ko-KR': 'ko-KR-SunHiNeural',    # Korean female voice
            'zh-CN': 'zh-CN-XiaoxiaoNeural'  # Chinese (Simplified) female voice
        }
        return language_voices.get(self.language, 'en-US-AriaNeural')

    def _voice_rate(self) -> str:
        """Format ``self.voice_speed`` as an Edge TTS rate string, e.g. '+10%'."""
        return f"{int((self.voice_speed - 1) * 100):+d}%"

    async def _edge_tts_stream(self, text: str, voice: str, rate: Optional[str] = None) -> bytes:
        """Stream synthesized audio from Edge TTS and return it as bytes.

        Args:
            text: Text to synthesize.
            voice: Edge TTS voice short name.
            rate: Optional rate string like '+10%'; omitted → provider default.

        Returns:
            Concatenated audio bytes (empty if the stream produced no audio).
        """
        import edge_tts
        if rate is None:
            communicate = edge_tts.Communicate(text, voice)
        else:
            communicate = edge_tts.Communicate(text, voice, rate=rate)
        # Collect chunks and join once — avoids quadratic bytes concatenation.
        chunks = []
        async for chunk in communicate.stream():
            if chunk["type"] == "audio":
                chunks.append(chunk["data"])
        return b"".join(chunks)

    async def text_to_speech(self, text: str, voice: Optional[str] = None) -> Optional[bytes]:
        """Convert text to speech audio.

        Args:
            text: Text to speak.
            voice: Explicit voice name; defaults to the configured language's voice.

        Returns:
            Audio bytes, or None if TTS is unavailable or every attempt failed.
        """
        if not self.voice_enabled or not self.tts_available:
            return None

        # Use default voice for the configured language if no voice specified
        if voice is None:
            voice = self._get_default_voice()

        logger.info(f"🔊 Generating TTS with voice: {voice}, language: {self.language}")

        try:
            if self.tts_provider == "edge-tts":
                audio_data = await self._edge_tts_stream(text, voice, self._voice_rate())

                if not audio_data:
                    # Some locale voices intermittently yield no audio; retry
                    # once with a known-good US English voice.
                    logger.warning(f"⚠️ No audio generated from TTS for voice: {voice}")
                    fallback_voice = "en-US-AriaNeural"
                    logger.info(f"🔄 Retrying with fallback voice: {fallback_voice}")
                    audio_data = await self._edge_tts_stream(text, fallback_voice, self._voice_rate())
                    if not audio_data:
                        logger.error("❌ Fallback TTS also failed")
                        return None

                return audio_data

            elif self.tts_provider == "murf":
                audio_data = await self._murf_tts(text, voice)
                return audio_data

        except Exception as e:
            logger.error(f"❌ TTS Error: {e}")
            # Last-resort fallback with the basic US English voice at default rate.
            try:
                logger.info("🔄 Attempting emergency fallback TTS")
                audio_data = await self._edge_tts_stream(text, "en-US-AriaNeural")
                return audio_data if audio_data else None
            except Exception:  # was a bare `except:` — keep best-effort, but don't trap SystemExit/KeyboardInterrupt
                logger.error("❌ All TTS attempts failed")
                return None

    async def _murf_tts(self, text: str, voice: Optional[str] = None) -> Optional[bytes]:
        """Call the Murf AI TTS API and download the generated audio.

        Args:
            text: Text to synthesize.
            voice: Murf voice id; defaults to 'en-US-1'.

        Returns:
            MP3 audio bytes, or None on any API/network failure.
        """
        # SECURITY: the API key must come from the environment. A previously
        # hard-coded fallback key was removed — never commit secrets to source.
        murf_api_key = os.environ.get("MURF_API_KEY")
        if not murf_api_key:
            logger.error("❌ Murf TTS: MURF_API_KEY not set")
            return None

        murf_url = "https://api.murf.ai/v1/speech/generate"
        payload = {
            "text": text,
            "voice": voice or "en-US-1",  # Default Murf voice
            "format": "mp3"
        }
        headers = {
            "Authorization": f"Bearer {murf_api_key}",
            "Content-Type": "application/json"
        }
        try:
            async with aiohttp.ClientSession() as session:
                async with session.post(murf_url, json=payload, headers=headers) as resp:
                    if resp.status == 200:
                        result = await resp.json()
                        # Murf returns a URL to the rendered audio, not the bytes.
                        audio_url = result.get("audio_url")
                        if audio_url:
                            async with session.get(audio_url) as audio_resp:
                                if audio_resp.status == 200:
                                    return await audio_resp.read()
                        logger.error(f"❌ Murf TTS: No audio_url in response: {result}")
                    else:
                        logger.error(f"❌ Murf TTS API error: {resp.status} {await resp.text()}")
        except Exception as e:
            logger.error(f"❌ Murf TTS Exception: {e}")
        return None

    async def groq_asr_bytes(self, audio_bytes: bytes, user_language: Optional[str] = None) -> Optional[str]:
        """Transcribe raw audio bytes with Groq's whisper-large-v3.

        Args:
            audio_bytes: Raw audio data (assumed WAV-compatible — the temp
                file is written with a .wav suffix; confirm callers' format).
            user_language: User's preferred language (name or locale code).

        Returns:
            Transcribed text, or None when ASR is unavailable or the call fails.
        """
        if not self.groq_client or not self.asr_available:
            logger.error("❌ Groq ASR not available")
            return None

        try:
            # The Groq SDK wants a file object, so stage the bytes on disk.
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
                temp_file.write(audio_bytes)
                temp_file_path = temp_file.name

            try:
                with open(temp_file_path, "rb") as audio_file:
                    transcription = self.groq_client.audio.transcriptions.create(
                        file=audio_file,
                        model="whisper-large-v3",  # Best available model
                        language=self._get_groq_language_code(user_language),
                        temperature=0.0,  # Deterministic output
                        response_format="json"
                    )

                transcribed_text = transcription.text.strip()
                logger.info(f"🎤 Groq ASR result: {transcribed_text}")

                # Log quality metrics when the SDK exposes them.
                if hasattr(transcription, 'confidence'):
                    logger.info(f"🎤 Groq confidence: {transcription.confidence:.2f}")

                return transcribed_text
            finally:
                # Clean up temporary file (best-effort; Windows can hold locks).
                try:
                    os.unlink(temp_file_path)
                except Exception as cleanup_error:
                    logger.warning(f"⚠️ Failed to cleanup temp file: {cleanup_error}")

        except Exception as e:
            logger.error(f"❌ Groq ASR Error: {e}")
            return None

    def _get_groq_language_code(self, user_language: Optional[str] = None) -> str:
        """Convert a user language preference to a Groq/Whisper language code.

        Args:
            user_language: Language name ('english', 'hindi', 'hinglish') or
                code/locale ('en', 'hi-IN', ...). Case-insensitive.

        Returns:
            Two-letter language code for Groq (e.g. 'en', 'hi'); 'en' when
            the input is unrecognized.
        """
        if not user_language:
            # Fallback to default config language ('hi-IN' -> 'hi')
            return self.language.split('-')[0] if self.language else 'en'

        user_lang_lower = user_language.lower()

        # Map common language names to codes
        language_mapping = {
            'english': 'en',
            'hindi': 'hi',
            'hinglish': 'hi',  # Treat Hinglish as Hindi for better results
            'en': 'en',
            'hi': 'hi',
            'en-in': 'en',
            'hi-in': 'hi',
            'en-us': 'en'
        }

        # Extract base language if it's a locale code (e.g., 'hi-IN' -> 'hi')
        if '-' in user_lang_lower:
            base_lang = user_lang_lower.split('-')[0]
            return language_mapping.get(base_lang, 'en')

        return language_mapping.get(user_lang_lower, 'en')

    async def speech_to_text(self, audio_file_path: str, user_language: Optional[str] = None) -> Optional[str]:
        """Transcribe an audio file via Groq ASR.

        Args:
            audio_file_path: Path to the audio file on disk.
            user_language: User's preferred language (see ``groq_asr_bytes``).

        Returns:
            Transcribed text, or None on failure.
        """
        if not self.voice_enabled or not self.asr_available:
            logger.warning("🔇 Voice features or Groq ASR not available")
            return None

        try:
            with open(audio_file_path, 'rb') as audio_file:
                audio_bytes = audio_file.read()
            return await self.groq_asr_bytes(audio_bytes, user_language)
        except Exception as e:
            logger.error(f"❌ Groq ASR Error: {e}")
            return None

    def get_available_voices(self) -> Dict[str, Any]:
        """Return a catalog of common Edge TTS voices, or {} when unavailable."""
        if not self.voice_enabled or self.tts_provider != "edge-tts":
            return {}

        voices = {
            "english": {
                "female": ["en-US-AriaNeural", "en-US-JennyNeural", "en-GB-SoniaNeural"],
                "male": ["en-US-GuyNeural", "en-US-DavisNeural", "en-GB-RyanNeural"]
            },
            "multilingual": {
                "spanish": ["es-ES-ElviraNeural", "es-MX-DaliaNeural"],
                "french": ["fr-FR-DeniseNeural", "fr-CA-SylvieNeural"],
                "german": ["de-DE-KatjaNeural", "de-AT-IngridNeural"],
                "italian": ["it-IT-ElsaNeural", "it-IT-IsabellaNeural"],
                "hindi": ["hi-IN-SwaraNeural", "hi-IN-MadhurNeural"]
            }
        }
        return voices

    def is_voice_enabled(self) -> bool:
        """Check if voice features are enabled."""
        return self.voice_enabled

    def get_voice_status(self) -> Dict[str, Any]:
        """Get current voice service status as a JSON-serializable dict."""
        return {
            "voice_enabled": self.voice_enabled,
            "tts_available": getattr(self, 'tts_available', False),
            "asr_available": getattr(self, 'asr_available', False),
            "tts_provider": self.tts_provider,
            "asr_provider": "groq",  # Always Groq for superior quality
            "language": self.language,
            "voice_speed": self.voice_speed,
            "groq_available": self.groq_client is not None
        }


# Global instance
groq_voice_service = GroqVoiceService()