viditk committed on
Commit
0da2c4e
·
verified ·
1 Parent(s): 46ba302

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +9 -19
app.py CHANGED
@@ -2,7 +2,6 @@ import gradio as gr
2
  import torch
3
  import speech_recognition as sr
4
  from pydub import AudioSegment
5
- import jiwer
6
  import os
7
 
8
  # Constants
@@ -19,14 +18,14 @@ def convert_audio_to_wav(file_path):
19
  def transcribe_audio_in_chunks(audio_path, chunk_duration=30):
20
  recognizer = sr.Recognizer()
21
  audio = AudioSegment.from_wav(audio_path)
22
-
23
  if len(audio) > MAX_AUDIO_DURATION * 1000:
24
  audio = audio[:MAX_AUDIO_DURATION * 1000]
25
 
26
  full_text = []
27
  for i in range(0, len(audio), chunk_duration * 1000):
28
- chunk = audio[i : i + chunk_duration * 1000]
29
- chunk_path = f"temp_chunk.wav"
30
  chunk.export(chunk_path, format="wav")
31
 
32
  with sr.AudioFile(chunk_path) as source:
@@ -42,33 +41,24 @@ def transcribe_audio_in_chunks(audio_path, chunk_duration=30):
42
  return " ".join(full_text)
43
 
44
  # --- Main Function ---
45
- def transcribe_and_compute_wer(audio):
46
  if not audio.endswith(".wav"):
47
  audio = convert_audio_to_wav(audio)
48
 
49
  transcription = transcribe_audio_in_chunks(audio)
50
-
51
- # Provide reference text here manually (you can replace this with real ground truth data)
52
- reference_text = "This is the ground truth text that you expect from the audio."
53
-
54
- # Compute WER (Word Error Rate)
55
- wer = jiwer.wer(reference_text.lower(), transcription.lower())
56
- wer_accuracy = round((1 - wer) * 100, 2)
57
-
58
- return transcription, f"{wer_accuracy} %"
59
 
60
  # --- Gradio UI ---
61
  iface = gr.Interface(
62
- fn=transcribe_and_compute_wer,
63
  inputs=[
64
  gr.Audio(sources=["microphone", "upload"], type="filepath", label="Input English Audio")
65
  ],
66
  outputs=[
67
- gr.Textbox(label="Transcribed Text"),
68
- gr.Textbox(label="WER Accuracy (%)")
69
  ],
70
- title="English Speech Recognition + WER Accuracy",
71
- description="Upload or record English audio → Transcribe Compute WER Accuracy against fixed reference text.",
72
  allow_flagging="never"
73
  )
74
 
 
2
  import torch
3
  import speech_recognition as sr
4
  from pydub import AudioSegment
 
5
  import os
6
 
7
  # Constants
 
18
  def transcribe_audio_in_chunks(audio_path, chunk_duration=30):
19
  recognizer = sr.Recognizer()
20
  audio = AudioSegment.from_wav(audio_path)
21
+
22
  if len(audio) > MAX_AUDIO_DURATION * 1000:
23
  audio = audio[:MAX_AUDIO_DURATION * 1000]
24
 
25
  full_text = []
26
  for i in range(0, len(audio), chunk_duration * 1000):
27
+ chunk = audio[i: i + chunk_duration * 1000]
28
+ chunk_path = "temp_chunk.wav"
29
  chunk.export(chunk_path, format="wav")
30
 
31
  with sr.AudioFile(chunk_path) as source:
 
41
  return " ".join(full_text)
42
 
43
  # --- Main Function ---
44
+ def transcribe_audio(audio):
45
  if not audio.endswith(".wav"):
46
  audio = convert_audio_to_wav(audio)
47
 
48
  transcription = transcribe_audio_in_chunks(audio)
49
+ return transcription
 
 
 
 
 
 
 
 
50
 
51
  # --- Gradio UI ---
52
  iface = gr.Interface(
53
+ fn=transcribe_audio,
54
  inputs=[
55
  gr.Audio(sources=["microphone", "upload"], type="filepath", label="Input English Audio")
56
  ],
57
  outputs=[
58
+ gr.Textbox(label="Transcribed Text")
 
59
  ],
60
+ title="English Speech Recognition",
61
+ description="Upload or record English audio → Transcribe to text.",
62
  allow_flagging="never"
63
  )
64