Spaces:

throgletworld
/

MultiModalSpeechDisfluencyDetectionSystem

Sleeping

throgletworld committed 24 days ago

Commit

3e58827

verified ·

1 Parent(s): 39dbbe7

Upload app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -141,7 +141,27 @@ def analyze_audio(audio_path, threshold, progress=gr.Progress()):
         timeline_rows.append({"time": time_str, "detected": detected or ["Fluent"], "probs": probs})
     progress(0.75, desc="Transcribing ...")
-    transcription = whisper_model.transcribe(audio_path).get("text", "").strip()
     progress(0.90, desc="Building report ...")
     total_stutters = sum(counts.values())
@@ -180,9 +200,9 @@ def analyze_audio(audio_path, threshold, progress=gr.Progress()):
     summary_md = "\n".join(summary_lines)
-    tl_lines = ["| Time | Detected |", "|------|----------|"]
     for row in timeline_rows:
-        tl_lines.append(f"| {row['time']} | {', '.join(row['detected'])} |")
     timeline_md = "\n".join(tl_lines)
     recs = ["## Recommendations\n"]

         timeline_rows.append({"time": time_str, "detected": detected or ["Fluent"], "probs": probs})
     progress(0.75, desc="Transcribing ...")
+    whisper_result = whisper_model.transcribe(audio_path, word_timestamps=True)
+    transcription = whisper_result.get("text", "").strip()
+    # Extract word-level timestamps from Whisper
+    word_timestamps_list = []
+    for seg in whisper_result.get("segments", []):
+        for w in seg.get("words", []):
+            word_timestamps_list.append({
+                "word": w["word"].strip(),
+                "start": w["start"],
+                "end": w["end"],
+            })
+    # Map words to each chunk's time range
+    for row in timeline_rows:
+        t_start, t_end = [float(x) for x in row["time"].replace("s", "").split("-")]
+        chunk_words = [
+            w["word"] for w in word_timestamps_list
+            if w["start"] >= t_start - 0.15 and w["end"] <= t_end + 0.15
+        ]
+        row["words"] = " ".join(chunk_words) if chunk_words else "—"
     progress(0.90, desc="Building report ...")
     total_stutters = sum(counts.values())
     summary_md = "\n".join(summary_lines)
+    tl_lines = ["| Time | Detected | Words Spoken |", "|------|----------|--------------|"]
     for row in timeline_rows:
+        tl_lines.append(f"| {row['time']} | {', '.join(row['detected'])} | {row.get('words', '—')} |")
     timeline_md = "\n".join(tl_lines)
     recs = ["## Recommendations\n"]