Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -2,7 +2,6 @@ import gradio as gr
|
|
| 2 |
import torch
|
| 3 |
import speech_recognition as sr
|
| 4 |
from pydub import AudioSegment
|
| 5 |
-
import jiwer
|
| 6 |
import os
|
| 7 |
|
| 8 |
# Constants
|
|
@@ -19,14 +18,14 @@ def convert_audio_to_wav(file_path):
|
|
| 19 |
def transcribe_audio_in_chunks(audio_path, chunk_duration=30):
|
| 20 |
recognizer = sr.Recognizer()
|
| 21 |
audio = AudioSegment.from_wav(audio_path)
|
| 22 |
-
|
| 23 |
if len(audio) > MAX_AUDIO_DURATION * 1000:
|
| 24 |
audio = audio[:MAX_AUDIO_DURATION * 1000]
|
| 25 |
|
| 26 |
full_text = []
|
| 27 |
for i in range(0, len(audio), chunk_duration * 1000):
|
| 28 |
-
chunk = audio[i
|
| 29 |
-
chunk_path =
|
| 30 |
chunk.export(chunk_path, format="wav")
|
| 31 |
|
| 32 |
with sr.AudioFile(chunk_path) as source:
|
|
@@ -42,33 +41,24 @@ def transcribe_audio_in_chunks(audio_path, chunk_duration=30):
|
|
| 42 |
return " ".join(full_text)
|
| 43 |
|
| 44 |
# --- Main Function ---
|
| 45 |
-
def
|
| 46 |
if not audio.endswith(".wav"):
|
| 47 |
audio = convert_audio_to_wav(audio)
|
| 48 |
|
| 49 |
transcription = transcribe_audio_in_chunks(audio)
|
| 50 |
-
|
| 51 |
-
# Provide reference text here manually (you can replace this with real ground truth data)
|
| 52 |
-
reference_text = "This is the ground truth text that you expect from the audio."
|
| 53 |
-
|
| 54 |
-
# Compute WER (Word Error Rate)
|
| 55 |
-
wer = jiwer.wer(reference_text.lower(), transcription.lower())
|
| 56 |
-
wer_accuracy = round((1 - wer) * 100, 2)
|
| 57 |
-
|
| 58 |
-
return transcription, f"{wer_accuracy} %"
|
| 59 |
|
| 60 |
# --- Gradio UI ---
|
| 61 |
iface = gr.Interface(
|
| 62 |
-
fn=
|
| 63 |
inputs=[
|
| 64 |
gr.Audio(sources=["microphone", "upload"], type="filepath", label="Input English Audio")
|
| 65 |
],
|
| 66 |
outputs=[
|
| 67 |
-
gr.Textbox(label="Transcribed Text")
|
| 68 |
-
gr.Textbox(label="WER Accuracy (%)")
|
| 69 |
],
|
| 70 |
-
title="English Speech Recognition
|
| 71 |
-
description="Upload or record English audio → Transcribe
|
| 72 |
allow_flagging="never"
|
| 73 |
)
|
| 74 |
|
|
|
|
| 2 |
import torch
|
| 3 |
import speech_recognition as sr
|
| 4 |
from pydub import AudioSegment
|
|
|
|
| 5 |
import os
|
| 6 |
|
| 7 |
# Constants
|
|
|
|
| 18 |
def transcribe_audio_in_chunks(audio_path, chunk_duration=30):
|
| 19 |
recognizer = sr.Recognizer()
|
| 20 |
audio = AudioSegment.from_wav(audio_path)
|
| 21 |
+
|
| 22 |
if len(audio) > MAX_AUDIO_DURATION * 1000:
|
| 23 |
audio = audio[:MAX_AUDIO_DURATION * 1000]
|
| 24 |
|
| 25 |
full_text = []
|
| 26 |
for i in range(0, len(audio), chunk_duration * 1000):
|
| 27 |
+
chunk = audio[i: i + chunk_duration * 1000]
|
| 28 |
+
chunk_path = "temp_chunk.wav"
|
| 29 |
chunk.export(chunk_path, format="wav")
|
| 30 |
|
| 31 |
with sr.AudioFile(chunk_path) as source:
|
|
|
|
| 41 |
return " ".join(full_text)
|
| 42 |
|
| 43 |
# --- Main Function ---
|
| 44 |
+
def transcribe_audio(audio):
    """Transcribe an uploaded or recorded audio file to text.

    Args:
        audio: Filesystem path of the input audio, as handed over by the
            Gradio audio input (configured with ``type="filepath"``).
            May be ``None`` or empty when the user submits without
            recording or uploading anything.

    Returns:
        The transcription as a single string, or an empty string when no
        audio was provided.
    """
    # Guard against a missing input: calling .endswith on None would raise
    # AttributeError and surface as an opaque error in the UI.
    if not audio:
        return ""

    # The chunked transcriber loads its input with AudioSegment.from_wav,
    # so anything that is not already a WAV file is converted first. The
    # extension is compared case-insensitively so "clip.WAV" is not
    # converted a second time.
    if not audio.lower().endswith(".wav"):
        audio = convert_audio_to_wav(audio)

    return transcribe_audio_in_chunks(audio)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
|
| 51 |
# --- Gradio UI ---
|
| 52 |
# Gradio front end: one audio input (microphone recording or file upload,
# delivered to the callback as a file path) and one text box that shows
# the transcription returned by transcribe_audio.
iface = gr.Interface(
    fn=transcribe_audio,
    inputs=[
        gr.Audio(
            sources=["microphone", "upload"],
            type="filepath",
            label="Input English Audio",
        ),
    ],
    outputs=[
        gr.Textbox(label="Transcribed Text"),
    ],
    title="English Speech Recognition",
    description="Upload or record English audio → Transcribe to text.",
    # Flagging is switched off for this demo.
    allow_flagging="never",
)
|
| 64 |
|