from concurrent.futures import ThreadPoolExecutor

from groq import AsyncGroq
from murf import AsyncMurf

# numpy, soundfile, and hf_hub_download are used only by the
# commented-out Kokoro TTS path below.
import numpy as np
import soundfile as sf
from huggingface_hub import hf_hub_download

from config import GROQ_API_KEY, ASR_MODEL, MURF_API_KEY


groq = AsyncGroq(api_key=GROQ_API_KEY)
# Thread pool for offloading blocking work (e.g. the local Kokoro synthesis
# below); the active Groq/Murf paths are fully async and do not use it.
executor = ThreadPoolExecutor(max_workers=4)

# Local Kokoro TTS setup (disabled). Re-enabling it also requires:
#   import io
#   import torch
#   from kokoro import KModel, KPipeline
# kokoro_device = "cuda" if torch.cuda.is_available() else "cpu"
# kokoro_model = KModel().to(kokoro_device).eval()
# model_path = hf_hub_download(repo_id='hexgrad/Kokoro-82M', filename="kokoro-v1_0.pth")
# kokoro_model.load_state_dict(torch.load(model_path, map_location=kokoro_device), strict=False)
# kokoro_pipeline = KPipeline(lang_code='a', model=False)
# voice_path = hf_hub_download("hexgrad/Kokoro-82M", "voices/af_heart.pt")
# kokoro_voice = torch.load(voice_path, weights_only=True).to(kokoro_device)

async def groq_asr_bytes(audio_bytes: bytes, model: str = ASR_MODEL, language: str = "en") -> str:
    """Transcribe WAV audio bytes to text using Groq ASR."""
    # AsyncGroq is natively async, so no executor hand-off is needed;
    # with response_format="text" the transcript comes back as a plain string.
    resp = await groq.audio.transcriptions.create(
        model=model,
        file=("audio.wav", audio_bytes, "audio/wav"),
        response_format="text",
        language=language,
    )
    return resp
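
# Minimal usage sketch (illustrative only; "sample.wav" and the _demo helper
# are assumptions, not part of this module):
#
#   import asyncio
#
#   async def _demo():
#       with open("sample.wav", "rb") as f:
#           print(await groq_asr_bytes(f.read()))
#
#   asyncio.run(_demo())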

murf_client = AsyncMurf(api_key=MURF_API_KEY)

async def murf_tts(text: str, voice_id: str = "en-IN-isha", format: str = "MP3") -> bytes:
    """Synthesize `text` with Murf and return the complete audio as bytes.

    The response stream is drained eagerly; yield the chunks instead if
    time-to-first-audio matters.
    """
    resp = murf_client.text_to_speech.stream(
        text=text,
        voice_id=voice_id,
        format=format,
        sample_rate=44100.0,
    )
    chunks = [chunk async for chunk in resp]
    return b"".join(chunks)
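
# Minimal usage sketch (illustrative only; the output filename is arbitrary):
#
#   import asyncio
#
#   async def _demo():
#       audio = await murf_tts("Hello from Murf!")
#       with open("out.mp3", "wb") as f:
#           f.write(audio)
#
#   asyncio.run(_demo())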


# def groq_tts(text: str, speed: float = 1.0) -> bytes:
#     try:
#         audio_segments = []
#         for _, ps, _ in kokoro_pipeline(text, kokoro_voice, speed):
#             ref_s = kokoro_voice[len(ps) - 1]
#             audio = kokoro_model(ps, ref_s, speed)
#             audio_np = audio.cpu().numpy().astype(np.float32)
#             audio_segments.append(audio_np)

#         full_audio = np.concatenate(audio_segments)

#         # Write to WAV bytes
#         buf = io.BytesIO()
#         sf.write(buf, full_audio, samplerate=24000, format="WAV", subtype="PCM_16")
#         buf.seek(0)
#         return buf.read()

#     except Exception as e:
#         print("Kokoro TTS synthesis failed")
#         raise RuntimeError(f"Kokoro TTS failed: {e}")


# Earlier Groq-hosted TTS path (disabled). Re-enabling it would also require
# importing TTS_MODEL and TTS_VOICE from config, awaiting the call on the
# async client, and reading the response only once (the original called
# resp.read() twice; the second call returns empty bytes).
# async def groq_tts(text: str, model: str = TTS_MODEL, voice: str = TTS_VOICE) -> bytes:
#     text = text[:1000]
#     resp = await groq.audio.speech.create(
#         model=model,
#         voice=voice,
#         input=text,
#         response_format="wav"
#     )
#     audio = resp.read()
#     print(audio[:10])  # debug: first bytes of the response
#     return audio