Loren committed on
Commit f962716 · verified · 1 parent: 8559f42

Update app.py

Files changed (1): app.py (+625 −625)
app.py CHANGED
@@ -1,625 +1,625 @@
-                sel_audio1 = gr.Audio(sources=["upload", "microphone"], type="filepath",
+                sel_audio1 = gr.Audio(sources=["microphone", "upload"], type="filepath",

import gradio as gr
import torch
from transformers import AutoProcessor, VoxtralForConditionalGeneration
from pydub import AudioSegment
from pydub.silence import detect_silence
import yt_dlp
import requests
import validators
from urllib.parse import urlparse
import subprocess
import os
import re
import glob
import spaces

### Initializations

MAX_TOKENS = 32000

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"*** Device: {device}")
model_name = 'mistralai/Voxtral-Mini-3B-2507'

processor = AutoProcessor.from_pretrained(model_name)
model = VoxtralForConditionalGeneration.from_pretrained(model_name,
                                                        torch_dtype=torch.bfloat16,
                                                        device_map=device)
# Supported languages
dict_languages = {"English": "en",
                  "French": "fr",
                  "German": "de",
                  "Spanish": "es",
                  "Italian": "it",
                  "Portuguese": "pt",
                  "Dutch": "nl",
                  "Hindi": "hi"}

# Whitelist of allowed MIME types for audio and video
ALLOWED_MIME_TYPES = {
    # Audio
    'audio/mpeg', 'audio/wav', 'audio/wave', 'audio/x-wav', 'audio/x-pn-wav',
    'audio/ogg', 'audio/vorbis', 'audio/aac', 'audio/mp4', 'audio/flac',
    'audio/x-flac', 'audio/opus', 'audio/webm',
    # Video
    'video/mp4', 'video/mpeg', 'video/ogg', 'video/webm', 'video/quicktime',
    'video/x-msvideo', 'video/x-matroska'
}

# Maximum allowed file size (in bytes). Ex: 1 GB
MAX_FILE_SIZE = 1 * 1024 * 1024 * 1024  # 1 GB

# Directory where the files will be saved
DOWNLOAD_DIR = "downloaded_files"
if not os.path.exists(DOWNLOAD_DIR):
    os.makedirs(DOWNLOAD_DIR)

MAX_LEN = 1800000  # 30 min, in milliseconds
one_second_silence = AudioSegment.silent(duration=1000)

#### Functions
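
# chunks_creation: if the audio is longer than MAX_LEN (30 min), it is split at detected
# silences into chunks that fit within the limit; otherwise the original path is returned
# as a single-element list.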
@spaces.GPU
def chunks_creation(audio_path):
    list_audio_path = [audio_path]
    audio = AudioSegment.from_file(audio_path)
    status = gr.Markdown("👍 Audio duration less than max")
    # Input too large?
    if len(audio) > MAX_LEN:
        list_audio_path = []
        try:
            # Detect silences (silent if quieter than a -14 dBFS threshold relative to the track)
            list_silent = detect_silence(audio, min_silence_len=300,
                                         silence_thresh=audio.dBFS - 14, seek_step=100)
            list_interval = [(start, stop) for start, stop in list_silent]

            # Calculate speech intervals (the gaps between silences)
            list_speech = []
            current_start = 0
            for start, stop in list_interval:
                if current_start < start:
                    list_speech.append((current_start, start))
                current_start = stop
            # Add last interval if needed
            if current_start < len(audio):
                list_speech.append((current_start, len(audio)))

            # Determination of chunks, to fit within the maximum duration
            pad = len(one_second_silence)  # 1 second of padding, in milliseconds
            list_chunks = []
            deb_chunk, fin_chunk = 0, list_speech[0][1]

            for start, end in list_speech[1:]:
                if end - deb_chunk + pad <= MAX_LEN:
                    fin_chunk = end + pad
                else:
                    list_chunks.append([deb_chunk, fin_chunk])
                    deb_chunk, fin_chunk = start, end
            list_chunks.append([deb_chunk, fin_chunk + pad])

            # Save chunks
            for i, (start, stop) in enumerate(list_chunks):
                segment = audio[start:stop]
                segment.export(f"chunk_{i}.wav", format="wav")
                list_audio_path.append(f"chunk_{i}.wav")

            status = f"✅ **Success!** {len(list_audio_path)} chunks saved."
        except Exception as e:
            status = gr.Markdown(f"❌ **Unexpected error during chunks creation:** {e}")

    return list_audio_path, status
###
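
# The three task functions below follow the same pattern: build the model inputs with the
# processor, generate with Voxtral, and decode only the newly generated tokens
# (everything after inputs.input_ids.shape[1]).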
@spaces.GPU
def process_transcript(language: str, audio_path: str) -> str:
    """Process the audio file to return its transcription.

    Args:
        language: The language of the audio.
        audio_path: The path to the audio file.

    Returns:
        The transcribed text of the audio.
        The status of transcription: with or without chunking.
    """
    result = ""
    status = gr.Markdown()

    if audio_path is None:
        status = gr.Markdown("Please provide some input audio: either upload an audio file or use the microphone.")
    else:
        id_language = dict_languages[language]

        # Verification of the duration, for possible division into chunks
        list_audio_path, status = chunks_creation(audio_path)

        # Transcription process
        try:
            for path in list_audio_path:
                inputs = processor.apply_transcrition_request(language=id_language,
                                                              audio=path, model_id=model_name)
                inputs = inputs.to(device, dtype=torch.bfloat16)
                outputs = model.generate(**inputs, max_new_tokens=MAX_TOKENS)
                decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:],
                                                         skip_special_tokens=True)
                result += decoded_outputs[0]
            status = "✅ **Success!** Transcription done."
        except Exception as e:
            status = gr.Markdown(f"❌ **Unexpected error during transcription:** {e}")

    return result, status
###

@spaces.GPU
def process_translate(language: str, audio_path: str) -> str:
    result = ""
    status = gr.Markdown()

    if audio_path is None:
        status = gr.Markdown("Please provide some input audio: either upload an audio file or use the microphone.")
    else:
        try:
            conversation = [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "audio",
                            "path": audio_path,
                        },
                        {"type": "text", "text": "Translate this in "+language},
                    ],
                }
            ]

            inputs = processor.apply_chat_template(conversation)
            inputs = inputs.to(device, dtype=torch.bfloat16)

            outputs = model.generate(**inputs, max_new_tokens=MAX_TOKENS)
            decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
            result = decoded_outputs[0]
            status = "✅ **Success!** Translation done."
        except Exception as e:
            status = gr.Markdown(f"❌ **Unexpected error during translation:** {e}")

    return result, status
###

@spaces.GPU
def process_chat(question: str, audio_path: str) -> str:
    result = ""
    status = gr.Markdown()

    if audio_path is None:
        status = gr.Markdown("Please provide some input audio: either upload an audio file or use the microphone.")
    else:
        try:
            conversation = [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "audio",
                            "path": audio_path,
                        },
                        {"type": "text", "text": question},
                    ],
                }
            ]

            inputs = processor.apply_chat_template(conversation)
            inputs = inputs.to(device, dtype=torch.bfloat16)

            outputs = model.generate(**inputs, max_new_tokens=500)
            decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)

            result = decoded_outputs[0]
            status = "✅ **Success!** Answer generated."
        except Exception as e:
            status = gr.Markdown(f"❌ **Unexpected error during chat:** {e}")

    return result, status
###
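
# disable_buttons/enable_buttons toggle the three task buttons so that only one request
# runs at a time; clear_audio resets the audio widgets and the shared state.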
def disable_buttons():
    return gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False)

def enable_buttons():
    return gr.update(interactive=True), gr.update(interactive=True), gr.update(interactive=True)
###

def clear_audio():
    return None, None, None, None
###

@spaces.GPU
def voice_extract_demucs():
    """
    Returns the path of the voice extracted file.
    """
    try:
        cmd = [
            "demucs",
            "--two-stems=vocals",
            "--out", "demucs",
            "audio_file.wav"
        ]
        subprocess.run(cmd, check=True)
        voice_path = os.path.join("demucs", "htdemucs", "audio_file", "vocals.wav")
        success_message = "✅ **Success!** Voice extracted."
        return voice_path, voice_path, gr.Markdown(success_message)
    except Exception as e:
        return None, None, gr.Markdown(f"❌ **Error:** An unexpected ERROR occurred: {e}")
###
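
# Both download helpers validate the input before fetching (URL format, MIME type or
# availability, size against MAX_FILE_SIZE) and hand the result over as a local WAV
# file named audio_file.wav.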
def secure_download_from_url(url: str):
    """
    Validates a URL and downloads the file if it is an authorized media.
    Returns the path of the downloaded file or an error message.
    """
    # Step 1: Validate the URL format
    if not validators.url(url):
        return None, None, gr.Markdown("❌ **Error:** The provided URL is invalid.")

    try:
        # Step 2: Send a HEAD request to check the headers without downloading the content.
        # allow_redirects=True to follow redirects to the final file location.
        # timeout to avoid blocking requests.
        response = requests.head(url, allow_redirects=True, timeout=10)

        # Check if the request was successful (status code 2xx)
        response.raise_for_status()

        # Step 3: Validate the content type (MIME type)
        content_type = response.headers.get('Content-Type', '').split(';')[0].strip()
        if content_type not in ALLOWED_MIME_TYPES:
            error_message = (
                f"❌ **Error:** The file type is not allowed.\n"
                f" - **Type detected:** `{content_type}`\n"
                f" - **Allowed types:** Audio and Video only."
            )
            return None, None, gr.Markdown(error_message)

        # Step 4: Validate the file size
        content_length = response.headers.get('Content-Length')
        if content_length and int(content_length) > MAX_FILE_SIZE:
            error_message = (
                f"❌ **Error:** The file is too large.\n"
                f" - **File size:** {int(content_length) / 1024 / 1024:.2f} MB\n"
                f" - **Maximum allowed size:** {MAX_FILE_SIZE / 1024 / 1024:.2f} MB"
            )
            return None, None, gr.Markdown(error_message)

        # Step 5: Secure streaming download
        with requests.get(url, stream=True, timeout=20) as r:
            r.raise_for_status()

            # Extract the file name from the URL
            parsed_url = urlparse(url)
            filename = os.path.basename(parsed_url.path)
            if not filename:  # If the URL ends with a '/'
                filename = "downloaded_media_file"

            filepath = os.path.join(DOWNLOAD_DIR, filename)

            # --- Step 6: Download the audio ---
            # Write the file in chunks to avoid overloading memory
            with open(filepath, 'wb') as f:
                downloaded_size = 0
                for chunk in r.iter_content(chunk_size=8192):
                    downloaded_size += len(chunk)
                    if downloaded_size > MAX_FILE_SIZE:
                        os.remove(filepath)  # Remove the partial file
                        return None, None, gr.Markdown("❌ **Error:** The file exceeds the maximum allowed size during download.")
                    f.write(chunk)

        # --- Step 7: Convert to WAV using Pydub ---
        audio_file = AudioSegment.from_file(filepath)
        file_handle = audio_file.export("audio_file.wav", format="wav")

        # --- Step 8: Clean up the download directory ---
        try:
            files = glob.glob(os.path.join(DOWNLOAD_DIR, "*"))
            for f in files:
                os.remove(f)
        except OSError:
            pass

        success_message = "✅ **Success!** File downloaded and saved."

        # Returns the file path and a success message.
        return "audio_file.wav", "audio_file.wav", gr.Markdown(success_message)

    except requests.exceptions.RequestException as e:
        # Handle network errors (timeout, DNS, connection refused, etc.)
        return None, None, gr.Markdown(f"❌ **Network error:** Unable to reach URL. Details: {e}")
    except Exception as e:
        # Handle other potential errors
        return None, None, gr.Markdown(f"❌ **Unexpected error:** {e}")
###

def secure_download_youtube_audio(url: str):
    """
    Returns the path of the downloaded file or an error message.
    """
    # --- Step 1: Validate URL format with Regex ---
    youtube_regex = re.compile(
        r'^(https?://)?(www\.)?(youtube|youtu|youtube-nocookie)\.(com|be)/'
        r'(watch\?v=|embed/|v/|.+\?v=)?([^&=%\?]{11})')
    if not youtube_regex.match(url):
        return None, None, gr.Markdown(f"❌ **Error:** The URL '{url}' does not appear to be a valid YouTube URL.")

    try:
        # --- Step 2: Check video availability ---
        ydl_info_opts = {'quiet': True, 'skip_download': True}
        try:
            with yt_dlp.YoutubeDL(ydl_info_opts) as ydl:
                info = ydl.extract_info(url, download=False)
        except yt_dlp.utils.DownloadError as e:
            return None, None, gr.Markdown(f"❌ **Error:** The video at URL '{url}' is unavailable ({str(e)})")

        # --- Step 3: Select best audio format ---
        formats = [f for f in info['formats'] if f.get('acodec') != 'none']
        if not formats:
            return None, None, gr.Markdown("❌ **Error:** No audio-only stream was found for this video.")

        formats.sort(key=lambda f: f.get('abr') or 0, reverse=True)
        best_audio_format = formats[0]

        # --- Step 4: Check file size BEFORE downloading ---
        filesize = best_audio_format.get('filesize') or best_audio_format.get('filesize_approx')
        if filesize is None:
            print("Could not determine file size before downloading.")
            filesize = 1

        if filesize > MAX_FILE_SIZE:
            return None, None, gr.Markdown(
                f"❌ **Error:** The file is too large.\n"
                f" - **File size:** {filesize / 1024 / 1024:.2f} MB\n"
                f" - **Maximum allowed size:** {MAX_FILE_SIZE / 1024 / 1024:.2f} MB"
            )

        # --- Step 5: Download & convert directly to WAV ---
        ydl_opts = {
            'quiet': True,
            'format': f"{best_audio_format['format_id']}",
            'outtmpl': "audio_file",  # will be replaced by ffmpeg output
            'postprocessors': [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'wav',
                'preferredquality': '192',
            }],
        }
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])

        success_message = "✅ **Success!** Audio extracted and saved."
        return "audio_file.wav", "audio_file.wav", gr.Markdown(success_message)

    except FileNotFoundError:
        return None, None, gr.Markdown("❌ **Error:** FFmpeg not found. Please ensure it is installed and in your system's PATH.")
    except Exception as e:
        return None, None, gr.Markdown(f"❌ **Error:** An unexpected ERROR occurred: {e}")
###

#### Gradio interface
with gr.Blocks(title="Voxtral") as voxtral:
    with gr.Row():
        gr.Markdown("# **Voxtral Mini Evaluation**")

    with gr.Accordion("🔎 More on Voxtral", open=False):
        gr.Markdown("""## **Key Features:**

#### Voxtral builds upon Ministral-3B with powerful audio understanding capabilities.
##### - **Dedicated transcription mode**: Voxtral can operate in a pure speech transcription mode to maximize performance. By default, Voxtral automatically predicts the source audio language and transcribes the text accordingly
##### - **Long-form context**: With a 32k token context length, Voxtral handles audios up to 30 minutes for transcription, or 40 minutes for understanding
##### - **Built-in Q&A and summarization**: Supports asking questions directly through audio. Analyze audio and generate structured summaries without the need for separate ASR and language models
##### - **Natively multilingual**: Automatic language detection and state-of-the-art performance in the world’s most widely used languages (English, Spanish, French, Portuguese, Hindi, German, Dutch, Italian)
##### - **Function-calling straight from voice**: Enables direct triggering of backend functions, workflows, or API calls based on spoken user intents
##### - **Highly capable at text**: Retains the text understanding capabilities of its language model backbone, Ministral-3B""")

    gr.Markdown("""#### Voxtral Mini is an enhancement of **Ministral 3B**, incorporating state-of-the-art audio input \
capabilities while retaining best-in-class text performance. It excels at speech transcription, translation and \
audio understanding. Available languages: English, Spanish, French, Portuguese, Hindi, German, Dutch, Italian.""")

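    # UI layout: three input tabs (record/upload, direct file URL, YouTube) each feed the
    # shared gr.State "sel_audio"; the three task panels below (transcription, translation,
    # audio Q&A) all read from that state.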
    gr.Markdown("### **1. Choose the audio:**")
    sel_audio = gr.State()
    with gr.Row():
        with gr.Tabs():
            with gr.Tab("From record or file upload"):
                gr.Markdown("### **Upload an audio file, record via microphone, or select a demo file:**")
                gr.Markdown("### *(Voxtral handles audios up to 30 minutes for transcription; if longer, it will be cut into chunks)*")
                sel_audio1 = gr.Audio(sources=["microphone", "upload"], type="filepath",
                                      label="Set an audio file to process it:")
                example1 = [["mapo_tofu.mp3"]]
                gr.Examples(
                    examples=example1,
                    inputs=sel_audio1,
                    outputs=None,
                    fn=None,
                    cache_examples=False,
                    run_on_click=False
                )
                status_output1 = gr.Markdown()
                with gr.Row():
                    voice_button1 = gr.Button("Extract voice (if noisy environment)")
                    voice_button1.click(
                        fn=voice_extract_demucs,
                        outputs=[sel_audio, sel_audio1, status_output1])
                    clear_audio1 = gr.Button("Clear audio")
                    clear_audio1.click(
                        fn=clear_audio,
                        outputs=[sel_audio, sel_audio, sel_audio1, status_output1])

            with gr.Tab("From file url (audio or video file)"):
                gr.Markdown("### **Enter the url of the file (mp3, wav, mp4, ...):**")
                url_input2 = gr.Textbox(label="URL (MP3 or MP4 file)",
                                        placeholder="https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/mapo_tofu.mp4")
                example2 = [["https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/mapo_tofu.mp4"]]
                gr.Examples(
                    examples=example2,
                    inputs=url_input2,
                    outputs=None,
                    fn=None,
                    cache_examples=False,
                    run_on_click=False
                )
                download_button2 = gr.Button("Check and upload", variant="primary")
                input_audio2 = gr.Audio()
                status_output2 = gr.Markdown()
                download_button2.click(
                    fn=secure_download_from_url,
                    inputs=url_input2,
                    outputs=[input_audio2, sel_audio, status_output2]
                )
                with gr.Row():
                    voice_button2 = gr.Button("Extract voice (if noisy environment)")
                    voice_button2.click(
                        fn=voice_extract_demucs,
                        outputs=[input_audio2, sel_audio, status_output2])
                    clear_audio1 = gr.Button("Clear audio")
                    clear_audio1.click(
                        fn=clear_audio,
                        outputs=[sel_audio, url_input2, input_audio2, status_output2])

            with gr.Tab("From Youtube url:"):
                gr.Markdown("### **Enter the url of the Youtube video:**")
                url_input3 = gr.Textbox(label="Youtube url",
                                        placeholder="https://www.youtube.com/...")
                download_button3 = gr.Button("Check and upload", variant="primary")
                input_audio3 = gr.Audio()
                status_output3 = gr.Markdown()
                download_button3.click(
                    fn=secure_download_youtube_audio,
                    inputs=url_input3,
                    outputs=[input_audio3, sel_audio, status_output3]
                )
                with gr.Row():
                    voice_button3 = gr.Button("Extract voice (if noisy environment)")
                    voice_button3.click(
                        fn=voice_extract_demucs,
                        outputs=[input_audio3, sel_audio, status_output3])
                    clear_audio1 = gr.Button("Clear audio")
                    clear_audio1.click(
                        fn=clear_audio,
                        outputs=[sel_audio, url_input3, input_audio3, status_output3])

    with gr.Row():
        gr.Markdown("### **2. Choose one of these tasks:**")

    with gr.Row():
        with gr.Column():
            with gr.Accordion("📝 Transcription", open=True):
                sel_language = gr.Dropdown(
                    choices=list(dict_languages.keys()),
                    value="English",
                    label="Select the language of the audio file:"
                )
                submit_transcript = gr.Button("Extract transcription", variant="primary")
                text_transcript = gr.Textbox(label="💬 Generated transcription", lines=10)
                status_transcript = gr.Markdown()

        with gr.Column():
            with gr.Accordion("🔁 Translation", open=True):
                list_language = list(dict_languages.keys())
                list_language.pop(list_language.index(sel_language.value))  # Fix: Access the value of the dropdown
                sel_translate_language = gr.Dropdown(
                    choices=list(dict_languages.keys()),
                    value="English",
                    label="Select the language for translation:"
                )
                submit_translate = gr.Button("Translate audio file", variant="primary")
                text_translate = gr.Textbox(label="💬 Generated translation", lines=10)
                status_translate = gr.Markdown()

        with gr.Column():
            with gr.Accordion("🤖 Ask audio file", open=True):
                question_chat = gr.Textbox(label="Enter your question about audio file:", placeholder="Enter your question about audio file")
                submit_chat = gr.Button("Ask audio file", variant="primary")
                example_chat = [["What is the subject of this audio file?"], ["Quels sont les ingrédients ?"]]
                gr.Examples(
                    examples=example_chat,
                    inputs=question_chat,
                    outputs=None,
                    fn=None,
                    cache_examples=False,
                    run_on_click=False
                )
                text_chat = gr.Textbox(label="💬 Model answer", lines=10)
                status_chat = gr.Markdown()

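    # Wiring: each task button first disables all three buttons, then runs its task on the
    # selected audio, then re-enables the buttons.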
    ### Processing

    # Transcription
    submit_transcript.click(
        disable_buttons,
        outputs=[submit_transcript, submit_translate, submit_chat],
        trigger_mode="once",
    ).then(
        fn=process_transcript,
        inputs=[sel_language, sel_audio],
        outputs=[text_transcript, status_transcript]
    ).then(
        enable_buttons,
        outputs=[submit_transcript, submit_translate, submit_chat],
    )

    # Translation
    submit_translate.click(
        disable_buttons,
        outputs=[submit_transcript, submit_translate, submit_chat],
        trigger_mode="once",
    ).then(
        fn=process_translate,
        inputs=[sel_translate_language, sel_audio],
        outputs=[text_translate, status_translate]
    ).then(
        enable_buttons,
        outputs=[submit_transcript, submit_translate, submit_chat],
    )

    # Chat
    submit_chat.click(
        disable_buttons,
        outputs=[submit_transcript, submit_translate, submit_chat],
        trigger_mode="once",
    ).then(
        fn=process_chat,
        inputs=[question_chat, sel_audio],
        outputs=[text_chat, status_chat]
    ).then(
        enable_buttons,
        outputs=[submit_transcript, submit_translate, submit_chat],
    )

### Launch the app

if __name__ == "__main__":
    voxtral.queue().launch(debug=True)