NeuralFalcon commited on
Commit
528103b
·
verified ·
1 Parent(s): bd876a3

Upload 7 files

Browse files
Files changed (7) hide show
  1. Qwen3_TTS_Colab.ipynb +97 -0
  2. README.md +11 -14
  3. app.py +534 -0
  4. hf_downloader.py +92 -0
  5. process_text.py +253 -0
  6. requirements.txt +5 -0
  7. subtitle.py +574 -0
Qwen3_TTS_Colab.ipynb ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 0,
4
+ "metadata": {
5
+ "colab": {
6
+ "provenance": [],
7
+ "gpuType": "T4"
8
+ },
9
+ "kernelspec": {
10
+ "name": "python3",
11
+ "display_name": "Python 3"
12
+ },
13
+ "language_info": {
14
+ "name": "python"
15
+ },
16
+ "accelerator": "GPU"
17
+ },
18
+ "cells": [
19
+ {
20
+ "cell_type": "markdown",
21
+ "source": [
22
+ "\n",
23
+ "\n",
24
+ "### 🏷️ **Credits & License**\n",
25
+ "\n",
26
+ "* 🔗 [Qwen3-TTS GitHub Repository](https://github.com/QwenLM/Qwen3-TTS)\n",
27
+ "* 🤗 [Qwen3-TTS on Hugging Face](https://huggingface.co/collections/Qwen/qwen3-tts)\n",
28
+ "* 📄 **License**: Provided under the [Apache License 2.0](https://github.com/QwenLM/Qwen3-TTS?tab=Apache-2.0-1-ov-file)\n",
29
+ "* 🤗 [Try Qwen3-TTS on HuggingFace Space](https://huggingface.co/spaces/Qwen/Qwen3-TTS)\n",
30
+ "\n",
31
+ "\n",
32
+ "\n",
33
+ "### ⚠️ **Usage Disclaimer**\n",
34
+ "\n",
35
+ "Use of this voice cloning model is subject to strict ethical and legal standards. By using this tool, you agree **not to** engage in any of the following prohibited activities:\n",
36
+ "\n",
37
+ "* **Fraud or Deception**: Using cloned voices to create misleading or fraudulent content.\n",
38
+ "* **Impersonation**: Replicating someone’s voice without their explicit permission, especially for malicious, harmful, or deceptive purposes.\n",
39
+ "* **Illegal Activities**: Employing the model in any manner that violates local, national, or international laws and regulations.\n",
40
+ "* **Harmful Content Generation**: Creating offensive, defamatory, or unethical material, including content that spreads misinformation or causes harm.\n",
41
+ "\n",
42
+ "> ⚖️ **Legal Responsibility**\n",
43
+ "> The developers of this tool disclaim all liability for misuse. **Users bear full responsibility** for ensuring that their usage complies with all applicable laws, regulations, and ethical guidelines.\n",
44
+ "\n",
45
+ "\n"
46
+ ],
47
+ "metadata": {
48
+ "id": "O5hhJS2moOhU"
49
+ }
50
+ },
51
+ {
52
+ "cell_type": "code",
53
+ "execution_count": null,
54
+ "metadata": {
55
+ "cellView": "form",
56
+ "id": "57sW-0cHjthT"
57
+ },
58
+ "outputs": [],
59
+ "source": [
60
+ "#@title Install Qwen3-TTS\n",
61
+ "%cd /content/\n",
62
+ "# !rm -rf /content/Qwen3-TTS-Colab\n",
63
+ "!git clone https://github.com/NeuralFalconYT/Qwen3-TTS-Colab.git\n",
64
+ "!git clone https://github.com/QwenLM/Qwen3-TTS.git\n",
65
+ "%cd Qwen3-TTS\n",
66
+ "!pip install -e .\n",
67
+ "!pip install faster-whisper==1.1.1\n",
68
+ "!pip install ctranslate2==4.5.0\n",
69
+ "!pip install pysrt\n",
70
+ "!pip install sentencex\n",
71
+ "from IPython.display import Audio,display\n",
72
+ "from IPython.display import clear_output\n",
73
+ "import time\n",
74
+ "clear_output()\n",
75
+ "\n",
76
+ "display(Audio(\"https://raw.githubusercontent.com/NeuralFalconYT/Useful-Function/refs/heads/main/audio/warning.mp3\", autoplay=True))\n",
77
+ "time.sleep(6)\n",
78
+ "clear_output()\n",
79
+ "# time.sleep(5)\n",
80
+ "# import os\n",
81
+ "# os.kill(os.getpid(), 9)"
82
+ ]
83
+ },
84
+ {
85
+ "cell_type": "code",
86
+ "source": [
87
+ "%cd /content/Qwen3-TTS-Colab\n",
88
+ "!python app.py --share --debug"
89
+ ],
90
+ "metadata": {
91
+ "id": "v7Y8L5EDpYNU"
92
+ },
93
+ "execution_count": null,
94
+ "outputs": []
95
+ }
96
+ ]
97
+ }
README.md CHANGED
@@ -1,14 +1,11 @@
1
- ---
2
- title: Qwen3 TTS Colab
3
- emoji: 🦀
4
- colorFrom: blue
5
- colorTo: blue
6
- sdk: gradio
7
- sdk_version: 6.4.0
8
- app_file: app.py
9
- pinned: false
10
- license: apache-2.0
11
- short_description: Run Qwen3-TTS On Google Colab
12
- ---
13
-
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ ## Run Qwen3 TTS on Google Colab
2
+ [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NeuralFalconYT/Qwen3-TTS-Colab/blob/main/Qwen3_TTS_Colab.ipynb) <br>
3
+
4
+
5
+ ![1](https://github.com/user-attachments/assets/e2602945-1a69-4c59-ad89-e95d96ba7858)
6
+
7
+ ## Credit:
8
+ [Qwen3-TTS](https://github.com/QwenLM/Qwen3-TTS)
9
+
10
+ ## Disclaimer
11
+ Don't use this model to do bad things.
 
 
 
app.py ADDED
@@ -0,0 +1,534 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # %cd /content/Qwen3-TTS-Colab
2
+ from subtitle import subtitle_maker
3
+ from process_text import text_chunk
4
+ from qwen_tts import Qwen3TTSModel
5
+ import subprocess
6
+ import os
7
+ import gradio as gr
8
+ import numpy as np
9
+ import torch
10
+ import soundfile as sf
11
+ from pydub import AudioSegment
12
+ from pydub.silence import split_on_silence
13
+ from huggingface_hub import snapshot_download
14
+ from hf_downloader import download_model
15
+ import gc
16
+ from huggingface_hub import login
17
+
18
# Authenticate with the Hugging Face Hub when a token is supplied via the
# HF_TOKEN environment variable (needed for gated/private checkpoints).
# os.getenv already yields None when unset, so no else-branch is required.
HF_TOKEN = os.getenv("HF_TOKEN")
if HF_TOKEN:
    login(token=HF_TOKEN)
23
+
24
# Global model holders
# Cache of loaded TTS models keyed by (model_type, model_size); managed by
# get_model()/clear_other_models() so at most one checkpoint stays resident.
loaded_models = {}
MODEL_SIZES = ["0.6B", "1.7B"]

# Speaker and language choices
# Preset speaker names shown in the UI; they are lower-cased and
# underscore-joined before being passed to the CustomVoice model.
SPEAKERS = [
    "Aiden", "Dylan", "Eric", "Ono_anna", "Ryan", "Serena", "Sohee", "Uncle_fu", "Vivian"
]
# Languages offered in the dropdowns; "Auto" defers language detection to the model.
LANGUAGES = ["Auto", "Chinese", "English", "Japanese", "Korean", "French", "German", "Spanish", "Portuguese", "Russian"]
33
+
34
+ # --- Helper Functions ---
35
+
36
def get_model_path(model_type: str, model_size: str) -> str:
    """Resolve a local filesystem path for the requested Qwen3-TTS checkpoint.

    Tries the standard huggingface_hub cache (snapshot_download) first and
    falls back to the manual downloader when that fails (e.g. rate limiting
    or restricted network access).

    Args:
        model_type: Checkpoint variant, e.g. "Base", "CustomVoice", "VoiceDesign".
        model_size: "0.6B" or "1.7B".

    Returns:
        Path to the local model directory.
    """
    repo_id = f"Qwen/Qwen3-TTS-12Hz-{model_size}-{model_type}"
    try:
        return snapshot_download(repo_id)
    except Exception as e:
        # Don't swallow the reason silently — log it so failures are diagnosable,
        # then fall back to a plain-HTTP download into a local folder.
        print(f"snapshot_download failed for {repo_id} ({e}); using fallback downloader.")
        return download_model(repo_id, download_folder="./qwen_tts_model", redownload=False)
43
def clear_other_models(keep_key=None):
    """Delete all loaded models except `keep_key` and release their memory.

    Needed on Colab-class GPUs, where holding two TTS checkpoints at once
    can run out of memory. The original implementation deleted the keys and
    then looped again popping the same (already removed) keys — the second
    loop was dead code and is removed here.

    Args:
        keep_key: (model_type, model_size) tuple to keep, or None to drop all.
    """
    global loaded_models
    # pop(k, None) is tolerant of missing keys, so no try/except is needed.
    for k in [k for k in loaded_models if k != keep_key]:
        loaded_models.pop(k, None)
    # Run Python GC first so the dropped model objects are actually freed,
    # then return cached CUDA blocks to the driver.
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
58
def get_model(model_type: str, model_size: str):
    """Load model and clear others to avoid OOM in Colab.

    Args:
        model_type: Checkpoint variant ("Base", "CustomVoice", "VoiceDesign").
        model_size: "0.6B" or "1.7B".

    Returns:
        A cached (or freshly loaded) Qwen3TTSModel instance.
    """
    global loaded_models
    key = (model_type, model_size)
    # Cache hit: reuse the already-loaded checkpoint.
    if key in loaded_models:
        return loaded_models[key]

    # Free every other checkpoint first — the GPU can't hold two at once.
    clear_other_models(keep_key=key)
    model_path = get_model_path(model_type, model_size)
    model = Qwen3TTSModel.from_pretrained(
        model_path,
        device_map="cuda",  # NOTE(review): assumes a CUDA device is present — no CPU fallback here.
        dtype=torch.bfloat16,
    )
    loaded_models[key] = model
    return model
75
+ def _normalize_audio(wav, eps=1e-12, clip=True):
76
+ """Normalize audio to float32 in [-1, 1] range."""
77
+ x = np.asarray(wav)
78
+ if np.issubdtype(x.dtype, np.integer):
79
+ info = np.iinfo(x.dtype)
80
+ if info.min < 0:
81
+ y = x.astype(np.float32) / max(abs(info.min), info.max)
82
+ else:
83
+ mid = (info.max + 1) / 2.0
84
+ y = (x.astype(np.float32) - mid) / mid
85
+ elif np.issubdtype(x.dtype, np.floating):
86
+ y = x.astype(np.float32)
87
+ m = np.max(np.abs(y)) if y.size else 0.0
88
+ if m > 1.0 + 1e-6:
89
+ y = y / (m + eps)
90
+ else:
91
+ raise TypeError(f"Unsupported dtype: {x.dtype}")
92
+ if clip:
93
+ y = np.clip(y, -1.0, 1.0)
94
+ if y.ndim > 1:
95
+ y = np.mean(y, axis=-1).astype(np.float32)
96
+ return y
97
+
98
def _audio_to_tuple(audio):
    """Coerce any Gradio audio payload into a (wav, sample_rate) tuple.

    Accepts a filepath string, a Gradio (sr, samples) tuple, or a dict with
    "sampling_rate"/"data" keys. Returns None when the input is missing or
    unrecognized, or when a file cannot be read.
    """
    if audio is None:
        return None

    # Filepath: load from disk via soundfile.
    if isinstance(audio, str):
        try:
            wav, sr = sf.read(audio)
            wav = _normalize_audio(wav)
            return wav, int(sr)
        except Exception as e:
            print(f"Error reading audio file: {e}")
            return None

    # Gradio numpy payload: (sample_rate, samples).
    if isinstance(audio, tuple) and len(audio) == 2 and isinstance(audio[0], int):
        sr, wav = audio
        return _normalize_audio(wav), int(sr)

    # Dict payload: {"sampling_rate": ..., "data": ...}.
    if isinstance(audio, dict) and "sampling_rate" in audio and "data" in audio:
        return _normalize_audio(audio["data"]), int(audio["sampling_rate"])

    return None
119
def transcribe_reference(audio_path, mode_input, language="English"):
    """Transcribe a reference clip with subtitle_maker for voice cloning.

    `mode_input` may be a bool or the mode-dropdown string; transcription
    only runs in High-Quality mode. Returns gr.update() (a UI no-op) when
    skipped, otherwise the transcript text or an error message.
    """
    # Decide whether transcription should run at all.
    if isinstance(mode_input, bool):
        should_run = mode_input
    else:
        should_run = isinstance(mode_input, str) and "High-Quality" in mode_input

    if not audio_path or not should_run:
        return gr.update()

    print(f"Starting transcription for: {audio_path}")
    # The transcriber needs a concrete language; fall back to English for "Auto".
    src_lang = "English" if language == "Auto" else language
    try:
        # subtitle_maker returns a tuple; index 7 is the plain transcript text.
        transcript = subtitle_maker(audio_path, src_lang)[7]
        return transcript if transcript else "Could not detect speech."
    except Exception as e:
        print(f"Transcription Error: {e}")
        return f"Error during transcription: {str(e)}"
137
+ # --- Audio Processing Utils (Disk Based) ---
138
+
139
def remove_silence_function(file_path, minimum_silence=100):
    """Strip silent gaps from a WAV file.

    Returns the path of the trimmed copy ("*_no_silence.wav"), or the
    original path unchanged if anything goes wrong (best-effort behavior).
    """
    try:
        output_path = file_path.replace(".wav", "_no_silence.wav")
        audio = AudioSegment.from_wav(file_path)
        # Split on gaps quieter than -45 dBFS lasting >= minimum_silence ms,
        # keeping 50 ms of padding around each voiced chunk.
        voiced_chunks = split_on_silence(
            audio,
            min_silence_len=minimum_silence,
            silence_thresh=-45,
            keep_silence=50,
        )
        trimmed = sum(voiced_chunks, AudioSegment.empty())
        trimmed.export(output_path, format="wav")
        return output_path
    except Exception as e:
        print(f"Error removing silence: {e}")
        return file_path
157
def process_audio_output(audio_path, make_subtitle, remove_silence, language="Auto"):
    """Post-process generated audio: optional silence removal + subtitles.

    Returns (audio_path, default_srt, custom_srt, word_srt, shorts_srt).
    The four SRT slots stay None when subtitles are disabled or generation
    fails (failure is logged, never raised).
    """
    # Step 1: silence removal (optional).
    final_audio_path = remove_silence_function(audio_path) if remove_silence else audio_path

    # Step 2: subtitle generation (optional, best-effort).
    srt_files = [None, None, None, None]
    if make_subtitle:
        try:
            # subtitle_maker's first four results are the SRT variants we expose.
            srt_files = list(subtitle_maker(final_audio_path, language)[:4])
        except Exception as e:
            print(f"Subtitle generation error: {e}")

    return (final_audio_path, *srt_files)
178
def stitch_chunk_files(chunk_files):
    """Concatenate per-chunk WAV files into one output WAV.

    The temporary chunk files are deleted afterwards (best-effort).
    Returns the stitched file path, or None when no chunks were given.
    """
    if not chunk_files:
        return None

    print(f"Stitching {len(chunk_files)} audio files...")
    merged = AudioSegment.empty()
    for path in chunk_files:
        try:
            merged += AudioSegment.from_wav(path)
        except Exception as e:
            # Skip unreadable chunks rather than failing the whole stitch.
            print(f"Error appending chunk {path}: {e}")

    # PID in the name keeps concurrent processes from clobbering each other.
    output_filename = f"final_output_{os.getpid()}.wav"
    merged.export(output_filename, format="wav")

    # Best-effort cleanup of the per-chunk temp files.
    for path in chunk_files:
        try:
            if os.path.exists(path):
                os.remove(path)
        except Exception as e:
            print(f"Warning: Could not delete temp file {path}: {e}")

    return output_filename
210
+ # --- Generators (Memory Optimized) ---
211
+
212
def generate_voice_design(text, language, voice_description, remove_silence, make_subs):
    """Voice Design tab handler: synthesize `text` with a natural-language voice description.

    Returns (audio_path, status_message, srt_default, srt_readable, srt_word,
    srt_shorts); the SRT slots are None unless `make_subs` is True.
    """
    if not text or not text.strip(): return None, "Error: Text is required.", None, None, None, None

    try:
        # 1. Chunk Text — long inputs are split (~280 chars) so each TTS call stays small.
        # (tts_filename is unused here; text_chunk returns it for other callers.)
        text_chunks, tts_filename = text_chunk(text, language, char_limit=280)
        print(f"Processing {len(text_chunks)} chunks...")

        chunk_files = []
        # VoiceDesign is only requested in the 1.7B size here.
        tts = get_model("VoiceDesign", "1.7B")

        # 2. Generate & Save Loop — each chunk is flushed to disk immediately
        # so only one chunk's waveform is ever held in memory.
        for i, chunk in enumerate(text_chunks):
            wavs, sr = tts.generate_voice_design(
                text=chunk.strip(),
                language=language,
                instruct=voice_description.strip(),
                non_streaming_mode=True,
                max_new_tokens=2048,
            )

            # Save immediately to disk (PID + index keeps names unique per process).
            temp_filename = f"temp_chunk_{i}_{os.getpid()}.wav"
            sf.write(temp_filename, wavs[0], sr)
            chunk_files.append(temp_filename)

            # Clear memory before generating the next chunk.
            del wavs
            torch.cuda.empty_cache()
            gc.collect()

        # 3. Stitch from disk into a single WAV.
        stitched_file = stitch_chunk_files(chunk_files)

        # 4. Post-Process: optional silence removal and subtitle generation.
        final_audio, srt1, srt2, srt3, srt4 = process_audio_output(stitched_file, make_subs, remove_silence, language)

        return final_audio, "Generation Success!", srt1, srt2, srt3, srt4

    except Exception as e:
        # Surface any failure as a status message instead of crashing the UI.
        return None, f"Error: {e}", None, None, None, None
254
def generate_custom_voice(text, language, speaker, instruct, model_size, remove_silence, make_subs):
    """TTS (CustomVoice) tab handler: synthesize `text` with a preset speaker.

    Returns (audio_path, status_message, srt_default, srt_readable, srt_word,
    srt_shorts); the SRT slots are None unless `make_subs` is True.
    """
    if not text or not text.strip(): return None, "Error: Text is required.", None, None, None, None

    try:
        # Split long input into ~280-char chunks so each generation call stays small.
        text_chunks, tts_filename = text_chunk(text, language, char_limit=280)
        chunk_files = []
        tts = get_model("CustomVoice", model_size)
        # Normalize UI display names (e.g. "Uncle fu") to the model's speaker ids.
        formatted_speaker = speaker.lower().replace(" ", "_")

        for i, chunk in enumerate(text_chunks):
            wavs, sr = tts.generate_custom_voice(
                text=chunk.strip(),
                language=language,
                speaker=formatted_speaker,
                instruct=instruct.strip() if instruct else None,  # optional style instruction
                non_streaming_mode=True,
                max_new_tokens=2048,
            )
            # Save immediately so only one chunk's waveform is in memory at a time.
            temp_filename = f"temp_custom_{i}_{os.getpid()}.wav"
            sf.write(temp_filename, wavs[0], sr)
            chunk_files.append(temp_filename)

            # Clear memory before the next chunk.
            del wavs
            torch.cuda.empty_cache()
            gc.collect()

        # Stitch chunk files, then apply optional silence removal / subtitles.
        stitched_file = stitch_chunk_files(chunk_files)
        final_audio, srt1, srt2, srt3, srt4 = process_audio_output(stitched_file, make_subs, remove_silence, language)
        return final_audio, "Generation Success!", srt1, srt2, srt3, srt4

    except Exception as e:
        # Surface any failure as a status message instead of crashing the UI.
        return None, f"Error: {e}", None, None, None, None
289
def smart_generate_clone(ref_audio, ref_text, target_text, language, mode, model_size, remove_silence, make_subs):
    """Voice Clone tab handler: speak `target_text` in the voice of `ref_audio`.

    Two modes: "High-Quality" conditions on reference audio + transcript
    (auto-transcribed when the user left the transcript blank); "Fast" uses
    only the speaker embedding (x-vector) and skips the transcript entirely.
    Returns (audio_path, status_message, srt_default, srt_readable, srt_word,
    srt_shorts); the SRT slots are None unless `make_subs` is True.
    """
    if not target_text or not target_text.strip(): return None, "Error: Target text is required.", None, None, None, None
    if not ref_audio: return None, "Error: Ref audio required.", None, None, None, None

    # 1. Mode & Transcript Logic
    use_xvector_only = ("Fast" in mode)
    final_ref_text = ref_text
    audio_tuple = _audio_to_tuple(ref_audio)

    if not use_xvector_only:
        if not final_ref_text or not final_ref_text.strip():
            print("Auto-transcribing reference...")
            try:
                final_ref_text = transcribe_reference(ref_audio, True, language)
                # NOTE(review): substring check — a legitimate transcript that
                # happens to contain the word "Error" would be treated as a failure.
                if not final_ref_text or "Error" in final_ref_text:
                    return None, f"Transcription failed: {final_ref_text}", None, None, None, None
            except Exception as e:
                return None, f"Transcribe Error: {e}", None, None, None, None
    else:
        # Fast mode never conditions on a transcript.
        final_ref_text = None

    try:
        # 2. Chunk Target Text (~280 chars per TTS call).
        text_chunks, tts_filename = text_chunk(target_text, language, char_limit=280)
        chunk_files = []
        tts = get_model("Base", model_size)

        # 3. Generate Loop — one call per chunk, flushed to disk to cap memory use.
        for i, chunk in enumerate(text_chunks):
            wavs, sr = tts.generate_voice_clone(
                text=chunk.strip(),
                language=language,
                ref_audio=audio_tuple,
                ref_text=final_ref_text.strip() if final_ref_text else None,
                x_vector_only_mode=use_xvector_only,
                max_new_tokens=2048,
            )
            # Save immediately (PID + index keeps names unique per process).
            temp_filename = f"temp_clone_{i}_{os.getpid()}.wav"
            sf.write(temp_filename, wavs[0], sr)
            chunk_files.append(temp_filename)

            # Clear memory before the next chunk.
            del wavs
            torch.cuda.empty_cache()
            gc.collect()

        # 4. Stitch & Process: merge chunks, then optional silence removal / subtitles.
        stitched_file = stitch_chunk_files(chunk_files)
        final_audio, srt1, srt2, srt3, srt4 = process_audio_output(stitched_file, make_subs, remove_silence, language)
        return final_audio, f"Success! Mode: {mode}", srt1, srt2, srt3, srt4

    except Exception as e:
        # Surface any failure as a status message instead of crashing the UI.
        return None, f"Error: {e}", None, None, None, None
344
+
345
+ # --- UI Construction ---
346
+
347
def on_mode_change(mode):
    """Toggle the reference-text box: visible only in High-Quality clone mode."""
    show_ref_text = "High-Quality" in mode
    return gr.update(visible=show_ref_text)
350
def build_ui():
    """Assemble the four-tab Gradio Blocks app (Voice Design / Voice Clone /
    CustomVoice TTS / About) and wire each tab's button to its generator.

    Returns:
        The constructed gr.Blocks instance (not yet launched).
    """
    theme = gr.themes.Soft(font=[gr.themes.GoogleFont("Source Sans Pro"), "Arial", "sans-serif"])
    css = ".gradio-container {max-width: none !important;} .tab-content {padding: 20px;}"

    with gr.Blocks(theme=theme, css=css, title="Qwen3-TTS Demo") as demo:
        # Page header with a link to the companion Colab notebook.
        gr.HTML("""
        <div style="text-align: center; margin: 20px auto; max-width: 800px;">
        <h1 style="font-size: 2.5em; margin-bottom: 5px;">🎙️ Qwen3-TTS </h1>
        <a href="https://colab.research.google.com/github/NeuralFalconYT/Qwen3-TTS-Colab/blob/main/Qwen3_TTS_Colab.ipynb" target="_blank" style="display: inline-block; padding: 10px 20px; background-color: #4285F4; color: white; border-radius: 6px; text-decoration: none; font-size: 1em;">🥳 Run on Google Colab</a>
        </div>""")

        with gr.Tabs():
            # --- Tab 1: Voice Design (describe a voice in natural language) ---
            with gr.Tab("Voice Design"):
                with gr.Row():
                    with gr.Column(scale=2):
                        design_text = gr.Textbox(label="Text to Synthesize", lines=4, value="It's in the top drawer... wait, it's empty? No way, that's impossible! I'm sure I put it there!",
                                                 placeholder="Enter the text you want to convert to speech...")
                        design_language = gr.Dropdown(label="Language", choices=LANGUAGES, value="Auto")
                        design_instruct = gr.Textbox(label="Voice Description", lines=3, placeholder="Describe the voice characteristics you want...",
                                                     value="Speak in an incredulous tone, but with a hint of panic beginning to creep into your voice.")
                        design_btn = gr.Button("Generate with Custom Voice", variant="primary")
                        with gr.Accordion("More options", open=False):
                            with gr.Row():
                                design_rem_silence = gr.Checkbox(label="Remove Silence", value=False)
                                design_make_subs = gr.Checkbox(label="Generate Subtitles", value=False)

                    with gr.Column(scale=2):
                        design_audio_out = gr.Audio(label="Generated Audio", type="filepath")
                        design_status = gr.Textbox(label="Status", interactive=False)

                        # Four SRT variants produced by subtitle_maker.
                        with gr.Accordion("📝 Subtitles", open=False):
                            with gr.Row():
                                d_srt1 = gr.File(label="Original (Whisper)")
                                d_srt2 = gr.File(label="Readable")
                            with gr.Row():
                                d_srt3 = gr.File(label="Word-level")
                                d_srt4 = gr.File(label="Shorts/Reels")

                design_btn.click(
                    generate_voice_design,
                    inputs=[design_text, design_language, design_instruct, design_rem_silence, design_make_subs],
                    outputs=[design_audio_out, design_status, d_srt1, d_srt2, d_srt3, d_srt4]
                )

            # --- Tab 2: Voice Clone (clone a voice from a reference sample) ---
            with gr.Tab("Voice Clone (Base)"):
                with gr.Row():
                    with gr.Column(scale=2):
                        clone_target_text = gr.Textbox(label="Target Text", lines=3, placeholder="Enter the text you want the cloned voice to speak...")
                        clone_ref_audio = gr.Audio(label="Reference Audio (Upload a voice sample to clone)", type="filepath")

                        with gr.Row():
                            clone_language = gr.Dropdown(label="Language", choices=LANGUAGES, value="Auto",scale=1)
                            clone_model_size = gr.Dropdown(label="Model Size", choices=MODEL_SIZES, value="1.7B",scale=1)
                            clone_mode = gr.Dropdown(
                                label="Mode",
                                choices=["High-Quality (Audio + Transcript)", "Fast (Audio Only)"],
                                value="High-Quality (Audio + Transcript)",
                                interactive=True,
                                scale=2
                            )

                        clone_ref_text = gr.Textbox(label="Reference Text", lines=2, visible=True)
                        clone_btn = gr.Button("Clone & Generate", variant="primary")
                        with gr.Accordion("More options", open=False):
                            with gr.Row():
                                clone_rem_silence = gr.Checkbox(label="Remove Silence", value=False)
                                clone_make_subs = gr.Checkbox(label="Generate Subtitles", value=False)

                    with gr.Column(scale=2):
                        clone_audio_out = gr.Audio(label="Generated Audio", type="filepath")
                        clone_status = gr.Textbox(label="Status", interactive=False)

                        with gr.Accordion("📝 Subtitles", open=False):
                            with gr.Row():
                                c_srt1 = gr.File(label="Original")
                                c_srt2 = gr.File(label="Readable")
                            with gr.Row():
                                c_srt3 = gr.File(label="Word-level")
                                c_srt4 = gr.File(label="Shorts/Reels")

                # Hide the transcript box in Fast mode; auto-transcribe on upload.
                clone_mode.change(on_mode_change, inputs=[clone_mode], outputs=[clone_ref_text])
                clone_ref_audio.change(transcribe_reference, inputs=[clone_ref_audio, clone_mode, clone_language], outputs=[clone_ref_text])

                clone_btn.click(
                    smart_generate_clone,
                    inputs=[clone_ref_audio, clone_ref_text, clone_target_text, clone_language, clone_mode, clone_model_size, clone_rem_silence, clone_make_subs],
                    outputs=[clone_audio_out, clone_status, c_srt1, c_srt2, c_srt3, c_srt4]
                )

            # --- Tab 3: TTS (CustomVoice) — preset speakers with optional style ---
            with gr.Tab("TTS (CustomVoice)"):
                with gr.Row():
                    with gr.Column(scale=2):
                        tts_text = gr.Textbox(label="Text", lines=4, placeholder="Enter the text you want to convert to speech...",
                                              value="Hello! Welcome to Text-to-Speech system. This is a demo of our TTS capabilities.")
                        with gr.Row():
                            tts_language = gr.Dropdown(label="Language", choices=LANGUAGES, value="English")
                            tts_speaker = gr.Dropdown(label="Speaker", choices=SPEAKERS, value="Ryan")
                        with gr.Row():
                            tts_instruct = gr.Textbox(label="Style Instruction (Optional)", lines=2,placeholder="e.g., Speak in a cheerful and energetic tone")
                            tts_model_size = gr.Dropdown(label="Size", choices=MODEL_SIZES, value="1.7B")
                        tts_btn = gr.Button("Generate Speech", variant="primary")
                        with gr.Accordion("More options", open=False):
                            with gr.Row():
                                tts_rem_silence = gr.Checkbox(label="Remove Silence", value=False)
                                tts_make_subs = gr.Checkbox(label="Generate Subtitles", value=False)

                    with gr.Column(scale=2):
                        tts_audio_out = gr.Audio(label="Generated Audio", type="filepath")
                        tts_status = gr.Textbox(label="Status", interactive=False)

                        with gr.Accordion("📝 Subtitles", open=False):
                            with gr.Row():
                                t_srt1 = gr.File(label="Original")
                                t_srt2 = gr.File(label="Readable")
                            with gr.Row():
                                t_srt3 = gr.File(label="Word-level")
                                t_srt4 = gr.File(label="Shorts/Reels")

                tts_btn.click(
                    generate_custom_voice,
                    inputs=[tts_text, tts_language, tts_speaker, tts_instruct, tts_model_size, tts_rem_silence, tts_make_subs],
                    outputs=[tts_audio_out, tts_status, t_srt1, t_srt2, t_srt3, t_srt4]
                )
            # --- Tab 4: About — credits and attribution ---
            with gr.Tab("About"):
                gr.Markdown("""
                # Qwen3-TTS
                A unified Text-to-Speech demo featuring three powerful modes:
                - **Voice Design**: Create custom voices using natural language descriptions
                - **Voice Clone (Base)**: Clone any voice from a reference audio
                - **TTS (CustomVoice)**: Generate speech with predefined speakers and optional style instructions

                Built with [Qwen3-TTS](https://github.com/QwenLM/Qwen3-TTS) by Alibaba Qwen Team.
                """)

                gr.HTML("""
                <hr>
                <p style="color: red; font-weight: bold; font-size: 16px;">
                NOTE
                </p>
                <p>
                This Gradio UI is not affiliated with the official Qwen3-TTS project and is based on the
                official Qwen3-TTS demo UI:<br>
                <a href="https://huggingface.co/spaces/Qwen/Qwen3-TTS" target="_blank">
                https://huggingface.co/spaces/Qwen/Qwen3-TTS
                </a>
                </p>

                <p><b>Additional features:</b></p>
                <ul>
                <li>Automatic transcription support using faster-whisper-large-v3-turbo-ct2</li>
                <li>Long text input support</li>
                <li>Because we are using Whisper, subtitles are also added</li>
                </ul>
                """)

    return demo
518
+ # if __name__ == "__main__":
519
+ # demo = build_ui()
520
+ # demo.launch(share=True, debug=True)
521
+
522
+
523
+
524
import click

@click.command()
@click.option("--debug", is_flag=True, default=False, help="Enable debug mode.")
@click.option("--share", is_flag=True, default=False, help="Enable sharing of the interface.")
def main(share, debug):
    """CLI entry point: build the Gradio UI and launch it once.

    Fixes the original double-launch bug: it first called
    demo.launch(share=True, debug=True) — which blocks and ignores the CLI
    flags — and only afterwards the intended queued launch. Launch exactly
    once, honoring --share/--debug.
    """
    demo = build_ui()
    demo.queue().launch(debug=debug, share=share)

if __name__ == "__main__":
    main()
hf_downloader.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import requests
3
+ import urllib.request
4
+ import urllib.error
5
+ from tqdm.auto import tqdm
6
+
7
+
8
def download_file(url: str, download_file_path: str, redownload: bool = False) -> bool:
    """Download a single file with urllib + a tqdm progress bar.

    Args:
        url: Direct download URL.
        download_file_path: Destination path (parent dirs are created).
        redownload: If True, delete any existing file and fetch again.

    Returns:
        True on success (or when a valid cached copy exists), False on
        network error.
    """
    base_path = os.path.dirname(download_file_path)
    # Guard: os.makedirs("") raises when the destination is in the cwd.
    if base_path:
        os.makedirs(base_path, exist_ok=True)

    # Reuse a non-empty cached file unless a redownload was requested.
    if os.path.exists(download_file_path):
        if redownload:
            os.remove(download_file_path)
            tqdm.write(f"♻️ Redownloading: {os.path.basename(download_file_path)}")
        elif os.path.getsize(download_file_path) > 0:
            tqdm.write(f"✔️ Skipped (already exists): {os.path.basename(download_file_path)}")
            return True

    # Probe Content-Length for the progress bar; close the connection promptly
    # (the original leaked the urlopen response).
    try:
        with urllib.request.urlopen(url) as probe:
            total = int(probe.headers.get("Content-Length", 0))
    except urllib.error.URLError as e:
        print(f"❌ Error: Unable to open URL: {url}")
        print(f"Reason: {e.reason}")
        return False

    # Download with progress bar.
    with tqdm(
        total=total,
        desc=os.path.basename(download_file_path),
        unit="B",
        unit_scale=True,
        unit_divisor=1024,
    ) as progress:

        def _hook(count, block_size, total_size):
            # Track absolute bytes and clamp so the bar never overshoots on
            # the final (partial) block — progress.update(block_size) did.
            done = count * block_size
            if total_size > 0:
                done = min(done, total_size)
            progress.update(done - progress.n)

        try:
            urllib.request.urlretrieve(url, download_file_path, reporthook=_hook)
        except urllib.error.URLError as e:
            print(f"❌ Error: Failed to download {url}")
            print(f"Reason: {e.reason}")
            return False

    tqdm.write(f"⬇️ Downloaded: {os.path.basename(download_file_path)}")
    return True
54
def download_model(repo_id: str, download_folder: str = "./", redownload: bool = False) -> str | None:
    """
    Download all files from a Hugging Face repo into a local folder.

    Args:
        repo_id (str): Hugging Face repo ID, e.g. "IndexTeam/IndexTTS-2"
        download_folder (str): Path where the model should be stored
        redownload (bool): If True, re-download files even if they exist

    Returns:
        str | None: Path to the downloaded model folder, or None on error
    """
    # Normalize empty string as current dir
    if not download_folder.strip():
        download_folder = "."

    url = f"https://huggingface.co/api/models/{repo_id}"
    download_dir = os.path.abspath(f"{download_folder.rstrip('/')}/{repo_id.split('/')[-1]}")
    os.makedirs(download_dir, exist_ok=True)

    print(f"📂 Download directory: {download_dir}")

    # Bounded timeout so a stalled API call can't hang the app forever
    # (the original requests.get had no timeout).
    try:
        response = requests.get(url, timeout=30)
    except requests.RequestException as e:
        print("❌ Error: request to Hugging Face API failed:", e)
        return None
    if response.status_code != 200:
        print("❌ Error:", response.status_code, response.text)
        return None

    # The API's "siblings" list enumerates every file in the repo.
    data = response.json()
    siblings = data.get("siblings", [])
    files = [f["rfilename"] for f in siblings]

    print(f"📦 Found {len(files)} files in repo '{repo_id}'. Checking cache ...")

    for file in tqdm(files, desc="Processing files", unit="file"):
        file_url = f"https://huggingface.co/{repo_id}/resolve/main/{file}"
        file_path = os.path.join(download_dir, file)
        download_file(file_url, file_path, redownload=redownload)

    return download_dir
process_text.py ADDED
@@ -0,0 +1,253 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # pip install sentencex
2
+ from sentencex import segment
3
+ import re
4
+ import uuid
5
+ import os
6
# Human-readable language name -> ISO code accepted by the sentencex
# segmenter (mostly ISO 639-1; a few entries such as Akan 'aka' and
# Kurdish 'ckb' use ISO 639-3 codes).
LANGUAGE_CODE = {
    'Akan': 'aka', 'Albanian': 'sq', 'Amharic': 'am', 'Arabic': 'ar', 'Armenian': 'hy',
    'Assamese': 'as', 'Azerbaijani': 'az', 'Basque': 'eu', 'Bashkir': 'ba', 'Bengali': 'bn',
    'Bosnian': 'bs', 'Bulgarian': 'bg', 'Burmese': 'my', 'Catalan': 'ca', 'Chinese': 'zh',
    'Croatian': 'hr', 'Czech': 'cs', 'Danish': 'da', 'Dutch': 'nl', 'English': 'en',
    'Estonian': 'et', 'Faroese': 'fo', 'Finnish': 'fi', 'French': 'fr', 'Galician': 'gl',
    'Georgian': 'ka', 'German': 'de', 'Greek': 'el', 'Gujarati': 'gu', 'Haitian Creole': 'ht',
    'Hausa': 'ha', 'Hebrew': 'he', 'Hindi': 'hi', 'Hungarian': 'hu', 'Icelandic': 'is',
    'Indonesian': 'id', 'Italian': 'it', 'Japanese': 'ja', 'Kannada': 'kn', 'Kazakh': 'kk',
    'Korean': 'ko', 'Kurdish': 'ckb', 'Kyrgyz': 'ky', 'Lao': 'lo', 'Lithuanian': 'lt',
    'Luxembourgish': 'lb', 'Macedonian': 'mk', 'Malay': 'ms', 'Malayalam': 'ml', 'Maltese': 'mt',
    'Maori': 'mi', 'Marathi': 'mr', 'Mongolian': 'mn', 'Nepali': 'ne', 'Norwegian': 'no',
    'Norwegian Nynorsk': 'nn', 'Pashto': 'ps', 'Persian': 'fa', 'Polish': 'pl', 'Portuguese': 'pt',
    'Punjabi': 'pa', 'Romanian': 'ro', 'Russian': 'ru', 'Serbian': 'sr', 'Sinhala': 'si',
    'Slovak': 'sk', 'Slovenian': 'sl', 'Somali': 'so', 'Spanish': 'es', 'Sundanese': 'su',
    'Swahili': 'sw', 'Swedish': 'sv', 'Tamil': 'ta', 'Telugu': 'te', 'Thai': 'th',
    'Turkish': 'tr', 'Ukrainian': 'uk', 'Urdu': 'ur', 'Uzbek': 'uz', 'Vietnamese': 'vi',
    'Welsh': 'cy', 'Yiddish': 'yi', 'Yoruba': 'yo', 'Zulu': 'zu'
}

# ==================================================
# CONSTANTS
# ==================================================

# Non-character code point substituted for the spaces inside a protected
# quote so the chunker treats the whole quoted span as one unbreakable token.
QUOTE_SPACE = "\uFFFF"  # invisible placeholder for protected quotes
# Sentence-internal punctuation used to pick natural split points.
PUNCT_RE = re.compile(r'[.,;:!?]')
32
+
33
+
34
+ # ==================================================
35
+ # CLEAN TEXT (KEEP PUNCTUATION)
36
+ # ==================================================
37
+
38
def clean_text(text):
    """Normalize markdown markers and curly quotes while keeping punctuation.

    Removes ``**``, ``*``, ``#`` and em-dashes, maps curly quotes to their
    ASCII equivalents, and collapses whitespace runs to single spaces.
    """
    substitutions = (
        ("**", ""),
        ("*", ""),
        ("#", ""),
        ("—", ""),
        ("“", '"'),
        ("”", '"'),
        ("‘", "'"),
        ("’", "'"),
    )
    for needle, replacement in substitutions:
        text = text.replace(needle, replacement)

    # Collapse every whitespace run and trim both ends.
    return " ".join(text.split())
54
+
55
+
56
+ # ==================================================
57
+ # PROTECT SHORT QUOTES (ATOMIC QUOTE RULE)
58
+ # ==================================================
59
+
60
def protect_short_quotes(text, max_chars):
    """
    Make short quoted spans atomic: any "..." span whose total length fits
    within max_chars gets its internal spaces swapped for QUOTE_SPACE so
    later splitting logic cannot break inside it.
    """
    def _shield(match):
        span = match.group(0)
        return span if len(span) > max_chars else span.replace(" ", QUOTE_SPACE)

    return re.sub(r'"[^"]+"', _shield, text)
72
+
73
+
74
def restore_quotes(text):
    """Undo protect_short_quotes: map QUOTE_SPACE placeholders back to spaces."""
    return text.replace(QUOTE_SPACE, " ")
76
+
77
+
78
+ # ==================================================
79
+ # SMART SPLIT FOR LONG SENTENCES (QUOTE AWARE)
80
+ # ==================================================
81
+
82
def smart_split_long_sentence(sentence, max_chars=300, lookback=60):
    """Split an over-long sentence into chunks of at most ~max_chars.

    Rules:
      * never split inside an open double-quoted span — the whole quote is
        pushed to the next chunk instead,
      * on overflow outside a quote, prefer to cut just after the last
        punctuation mark found within the final ``lookback`` characters,
      * otherwise cut at the current word boundary.

    Args:
        sentence: Text known (or suspected) to exceed the chunk limit.
        max_chars: Soft maximum chunk length in characters.
        lookback: How far back from the overflow point to search for
            punctuation to split on.

    Returns:
        list[str]: Non-empty, stripped chunks in original order.
    """
    words = re.findall(r'\S+\s*', sentence)
    chunks = []
    buffer = ""
    in_quote = False

    for w in words:
        tentative = buffer + w
        quote_count = w.count('"')

        # 1) Word still fits: accumulate and keep the quote state current.
        if len(tentative) <= max_chars:
            buffer = tentative
            if quote_count % 2 != 0:
                in_quote = not in_quote
            continue

        # 2) Overflow while inside a quote: flush the buffer and start the
        #    next chunk with this word so the quoted span stays intact.
        if in_quote:
            if buffer.strip():
                chunks.append(buffer.strip())
            buffer = w
            if quote_count % 2 != 0:
                in_quote = not in_quote
            continue

        # 3) Overflow outside a quote: look for punctuation near the end of
        #    the buffer and split just after the last mark found.
        split_at = None
        search_region = buffer[-lookback:]

        # Pattern mirrors the module-level PUNCT_RE.
        matches = list(re.finditer(r'[.,;:!?]', search_region))
        if matches:
            # BUG FIX: when the buffer is shorter than `lookback`,
            # len(buffer) - lookback is negative, so the old computation
            # produced a negative split point and `buffer[:split_at]`
            # emitted empty or truncated chunks. Clamp the window start
            # at 0 so indices are always valid.
            window_start = max(len(buffer) - lookback, 0)
            split_at = window_start + matches[-1].end()

        # BUG FIX: explicit None check (`if split_at:` would also reject a
        # legitimate split position of 0).
        if split_at is not None:
            chunks.append(buffer[:split_at].strip())
            buffer = buffer[split_at:].lstrip() + w
        else:
            chunks.append(buffer.strip())
            buffer = w

        if quote_count % 2 != 0:
            in_quote = not in_quote

    if buffer.strip():
        chunks.append(buffer.strip())

    return chunks
131
+
132
+
133
+ # ==================================================
134
+ # SENTENCE-FIRST CHUNKER
135
+ # ==================================================
136
+
137
def split_into_chunks(text, lang_code="en", max_chars=300):
    """Segment text into sentences and greedily pack them into chunks of at
    most max_chars; sentences that are themselves too long are hard-split."""
    if len(text) <= max_chars:
        return [text]

    pieces = []
    pending = ""

    for raw_sentence in segment(lang_code, text):
        sentence = raw_sentence.strip()

        # Oversized sentence: flush what we have, then hard-split it.
        if len(sentence) > max_chars:
            if pending:
                pieces.append(pending.strip())
                pending = ""
            pieces.extend(smart_split_long_sentence(sentence, max_chars))
            continue

        candidate = f"{pending} {sentence}".strip() if pending else sentence

        if len(candidate) <= max_chars:
            pending = candidate
        else:
            pieces.append(pending.strip())
            pending = sentence

    if pending.strip():
        pieces.append(pending.strip())

    return pieces
167
+
168
+
169
+ # ==================================================
170
+ # FIX DANGLING QUOTES BETWEEN CHUNKS
171
+ # ==================================================
172
+
173
def repair_dangling_quotes(chunks):
    """Drop a leading double quote on a chunk when the previous chunk already
    ends with one — an artifact of splitting straight through a quote."""
    repaired = []

    for index, piece in enumerate(chunks):
        piece = piece.strip()

        if index > 0 and repaired[-1].endswith('"') and piece.startswith('"'):
            piece = piece[1:].lstrip()

        repaired.append(piece)

    return repaired
187
+
188
+
189
+ # ==================================================
190
+ # TTS FILE NAME
191
+ # ==================================================
192
+
193
def get_tts_file_name(text, language="en"):
    """Build a unique .wav path under ./ai_tts_voice/ whose name combines a
    slug of the text's first words, the language code, and a random token."""
    out_dir = "./ai_tts_voice/"
    os.makedirs(out_dir, exist_ok=True)

    # Keep only letters/spaces, then slugify the first ~20 characters.
    slug = re.sub(r'[^a-zA-Z\s]', '', text or "").lower().strip().replace(" ", "_")[:20]
    if not slug:
        slug = "audio"

    token = uuid.uuid4().hex[:8].upper()
    lang = language.lower().strip()

    return os.path.join(out_dir, f"{slug}_{lang}_{token}.wav")
207
+
208
+
209
+ # ==================================================
210
+ # main function
211
+ # ==================================================
212
+
213
def text_chunk(text, language="English", char_limit=280):
    """Prepare text for TTS: protect short quotes, split into chunks that fit
    char_limit, and build an output .wav path.

    Args:
        text: Raw input text.
        language: Human-readable language name (a LANGUAGE_CODE key);
            unknown names fall back to "en".
        char_limit: Maximum characters per chunk.

    Returns:
        tuple[list[str], str]: (text chunks, generated wav file path)
    """
    # BUG FIX: previously looked up the hard-coded key 'English', silently
    # ignoring the `language` argument.
    lang_code = LANGUAGE_CODE.get(language, "en")

    # text = clean_text(text)  # skipped: Qwen3-TTS can handle raw punctuation

    # 🔒 Atomic quote protection — short quoted spans must not be split.
    text = protect_short_quotes(text, char_limit)

    if len(text) > char_limit:
        print("⚠️ The text is too long. Breaking it into smaller pieces for TTS.")

    chunks = split_into_chunks(text, lang_code, char_limit)
    chunks = repair_dangling_quotes(chunks)

    # 🔓 Restore the spaces that quote protection replaced.
    chunks = [restore_quotes(c) for c in chunks]

    tts_file_name = get_tts_file_name(text, lang_code)
    return chunks, tts_file_name
232
+
233
+
234
+ # ==================================================
235
+ # TEST
236
+ # ==================================================
237
+
238
+ # from process_text import text_chunk
239
+ # text="Hi, this is a test"
240
+ # chunks, tts_filename =text_chunk(text, language="English", char_limit=280)
241
+
242
if __name__ == "__main__":
    # Quick manual smoke test (Colab form-style "@param" parameters).
    text = "He said \"You are a looser\""  # @param {type: "string"}

    language = "English"  # @param {type: "string"}
    char_limit = 20  # @param {type: "number"}

    chunks, filename = text_chunk(text, language, char_limit)

    # Show the generated output path, then each chunk with its length so the
    # char_limit behavior can be inspected by eye.
    print(filename)
    print(len(chunks))
    for c in chunks:
        print(len(c), c)
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ faster-whisper==1.1.1
2
+ ctranslate2==4.5.0
3
+ pysrt
4
+ sentencex
5
+ qwen-tts
subtitle.py ADDED
@@ -0,0 +1,574 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+ # ==============================================================================
4
+ # --- 1. IMPORTS
5
+ # ==============================================================================
6
+
7
+ import os
8
+ import re
9
+ import gc
10
+ import uuid
11
+ import math
12
+ import shutil
13
+ import string
14
+ import requests
15
+ import urllib.request
16
+ import urllib.error
17
+
18
+ import torch
19
+ import pysrt
20
+ from tqdm.auto import tqdm
21
+ from faster_whisper import WhisperModel
22
+
23
+
24
+ # ==============================================================================
25
+ # --- 2. CONSTANTS & CONFIGURATION
26
+ # ==============================================================================
27
+
28
# Folder paths for storing generated files and temporary audio
SUBTITLE_FOLDER = "./generated_subtitle"  # final .srt/.txt/.json outputs
TEMP_FOLDER = "./subtitle_audio"          # scratch space for media copies

# Mapping of language names to their ISO 639-1 codes
# (a few entries, e.g. Akan 'aka' and Kurdish 'ckb', are ISO 639-3)
LANGUAGE_CODE = {
    'Akan': 'aka', 'Albanian': 'sq', 'Amharic': 'am', 'Arabic': 'ar', 'Armenian': 'hy',
    'Assamese': 'as', 'Azerbaijani': 'az', 'Basque': 'eu', 'Bashkir': 'ba', 'Bengali': 'bn',
    'Bosnian': 'bs', 'Bulgarian': 'bg', 'Burmese': 'my', 'Catalan': 'ca', 'Chinese': 'zh',
    'Croatian': 'hr', 'Czech': 'cs', 'Danish': 'da', 'Dutch': 'nl', 'English': 'en',
    'Estonian': 'et', 'Faroese': 'fo', 'Finnish': 'fi', 'French': 'fr', 'Galician': 'gl',
    'Georgian': 'ka', 'German': 'de', 'Greek': 'el', 'Gujarati': 'gu', 'Haitian Creole': 'ht',
    'Hausa': 'ha', 'Hebrew': 'he', 'Hindi': 'hi', 'Hungarian': 'hu', 'Icelandic': 'is',
    'Indonesian': 'id', 'Italian': 'it', 'Japanese': 'ja', 'Kannada': 'kn', 'Kazakh': 'kk',
    'Korean': 'ko', 'Kurdish': 'ckb', 'Kyrgyz': 'ky', 'Lao': 'lo', 'Lithuanian': 'lt',
    'Luxembourgish': 'lb', 'Macedonian': 'mk', 'Malay': 'ms', 'Malayalam': 'ml', 'Maltese': 'mt',
    'Maori': 'mi', 'Marathi': 'mr', 'Mongolian': 'mn', 'Nepali': 'ne', 'Norwegian': 'no',
    'Norwegian Nynorsk': 'nn', 'Pashto': 'ps', 'Persian': 'fa', 'Polish': 'pl', 'Portuguese': 'pt',
    'Punjabi': 'pa', 'Romanian': 'ro', 'Russian': 'ru', 'Serbian': 'sr', 'Sinhala': 'si',
    'Slovak': 'sk', 'Slovenian': 'sl', 'Somali': 'so', 'Spanish': 'es', 'Sundanese': 'su',
    'Swahili': 'sw', 'Swedish': 'sv', 'Tamil': 'ta', 'Telugu': 'te', 'Thai': 'th',
    'Turkish': 'tr', 'Ukrainian': 'uk', 'Urdu': 'ur', 'Uzbek': 'uz', 'Vietnamese': 'vi',
    'Welsh': 'cy', 'Yiddish': 'yi', 'Yoruba': 'yo', 'Zulu': 'zu'
}
52
+
53
+
54
+ # ==============================================================================
55
+ # --- 3. FILE & MODEL DOWNLOADING UTILITIES
56
+ # ==============================================================================
57
+
58
def download_file(url, download_file_path, redownload=False):
    """Download a single file with urllib and a tqdm progress bar.

    Args:
        url: Direct download URL.
        download_file_path: Destination path (parent dirs are created).
        redownload: When True, delete any cached copy and fetch again.

    Returns:
        bool: True on success or cache hit, False on network failure.
    """
    base_path = os.path.dirname(download_file_path)
    os.makedirs(base_path, exist_ok=True)

    if os.path.exists(download_file_path):
        if redownload:
            os.remove(download_file_path)
            tqdm.write(f"♻️ Redownloading: {os.path.basename(download_file_path)}")
        elif os.path.getsize(download_file_path) > 0:
            tqdm.write(f"✔️ Skipped (already exists): {os.path.basename(download_file_path)}")
            return True

    # FIX: close the size-probe response — the old code leaked the urlopen
    # handle because it was never closed.
    try:
        with urllib.request.urlopen(url) as response:
            total = int(response.headers.get('Content-Length', 0))
    except urllib.error.URLError as e:
        print(f"❌ Error: Unable to open URL: {url}")
        print(f"Reason: {e.reason}")
        return False

    with tqdm(total=total, desc=os.path.basename(download_file_path), unit='B', unit_scale=True, unit_divisor=1024) as progress:
        try:
            urllib.request.urlretrieve(
                url,
                download_file_path,
                reporthook=lambda count, block_size, total_size: progress.update(block_size)
            )
        except urllib.error.URLError as e:
            print(f"❌ Error: Failed to download {url}")
            print(f"Reason: {e.reason}")
            return False

    tqdm.write(f"⬇️ Downloaded: {os.path.basename(download_file_path)}")
    return True
93
+
94
+
95
def download_model(repo_id, download_folder="./", redownload=False):
    """
    Downloads all files from a Hugging Face repository using the public API,
    avoiding the need for a Hugging Face token for public models.

    Args:
        repo_id: e.g. "deepdml/faster-whisper-large-v3-turbo-ct2".
        download_folder: Parent directory for the model folder.
        redownload: Re-fetch files even when a cached copy exists.

    Returns:
        str | None: Absolute model directory path, or None on failure.
    """
    if not download_folder.strip():
        download_folder = "."

    api_url = f"https://huggingface.co/api/models/{repo_id}"
    model_name = repo_id.split('/')[-1]
    download_dir = os.path.abspath(f"{download_folder.rstrip('/')}/{model_name}")
    os.makedirs(download_dir, exist_ok=True)

    print(f"📂 Download directory: {download_dir}")

    try:
        # FIX: bound the request with a timeout — without one a stalled
        # connection blocks forever.
        response = requests.get(api_url, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"❌ Error fetching repo info: {e}")
        return None

    data = response.json()
    files_to_download = [f["rfilename"] for f in data.get("siblings", [])]

    if not files_to_download:
        print(f"⚠️ No files found in repo '{repo_id}'.")
        return None

    print(f"📦 Found {len(files_to_download)} files in repo '{repo_id}'. Checking cache...")

    for file in tqdm(files_to_download, desc="Processing files", unit="file"):
        file_url = f"https://huggingface.co/{repo_id}/resolve/main/{file}"
        file_path = os.path.join(download_dir, file)
        download_file(file_url, file_path, redownload=redownload)

    return download_dir
132
+
133
+
134
+ # ==============================================================================
135
+ # --- 4. CORE TRANSCRIPTION & PROCESSING LOGIC
136
+ # ==============================================================================
137
+
138
def get_language_name(code):
    """Reverse lookup: map an ISO code back to its language name, or None."""
    return next(
        (name for name, iso in LANGUAGE_CODE.items() if iso == code),
        None,
    )
144
+
145
def clean_file_name(file_path):
    """Return file_path with the base name slugified and a short random
    suffix appended; the directory and extension are kept intact."""
    folder = os.path.dirname(file_path)
    stem, extension = os.path.splitext(os.path.basename(file_path))

    # Collapse every run of non-alphanumeric characters into a single "_".
    slug = re.sub(r'[^a-zA-Z\d]+', '_', stem)
    slug = re.sub(r'_+', '_', slug).strip('_')

    suffix = uuid.uuid4().hex[:6]
    return os.path.join(folder, f"{slug}_{suffix}{extension}")
155
+
156
def format_segments(segments):
    """Convert raw Whisper segments into structured timestamp lists.

    Returns:
        tuple: (sentence_timestamps, word_timestamps, transcript_text) where
        each sentence dict carries id/text/start/end plus its word dicts, and
        the flat word list shares those same dicts.
    """
    sentences = []
    words = []
    transcript_parts = []

    for seg in segments:
        seg_text = seg.text.strip()
        entry = {
            "id": len(sentences),
            "text": seg_text,
            "start": seg.start,
            "end": seg.end,
            "words": [],
        }
        sentences.append(entry)
        transcript_parts.append(seg_text)

        for w in seg.words:
            info = {
                "word": w.word.strip(),
                "start": w.start,
                "end": w.end,
            }
            entry["words"].append(info)
            words.append(info)

    return sentences, words, " ".join(transcript_parts).strip()
184
+
185
+ # def get_audio_file(uploaded_file):
186
+ # """Copies the uploaded media file to a temporary location for processing."""
187
+ # temp_path = os.path.join(TEMP_FOLDER, os.path.basename(uploaded_file))
188
+ # cleaned_path = clean_file_name(temp_path)
189
+ # shutil.copy(uploaded_file, cleaned_path)
190
+ # return cleaned_path
191
+
192
# Lazily-initialized module-level model cache.
whisper_model = None

def load_whisper_model(model_name="deepdml/faster-whisper-large-v3-turbo-ct2"):
    """Create (once) and return the global faster-whisper model.

    Tries to load `model_name` directly (hub ID or local directory); if that
    fails, downloads the repo via the REST helper and loads from disk.
    """
    global whisper_model
    if whisper_model is None:
        use_cuda = torch.cuda.is_available()
        device = "cuda" if use_cuda else "cpu"
        compute_type = "float16" if use_cuda else "int8"
        try:
            whisper_model = WhisperModel(
                model_name,
                device=device,
                compute_type=compute_type,
            )
        except Exception:
            # FIX: fall back by downloading the *requested* repo rather than
            # a hard-coded one, so custom model names are honored.
            model_dir = download_model(
                model_name,
                download_folder="./",
                redownload=False)
            whisper_model = WhisperModel(
                model_dir,
                device=device,
                compute_type=compute_type)
    return whisper_model
215
+
216
+
217
def whisper_subtitle(uploaded_file, source_language):
    """
    Main transcription function. Loads the model, transcribes the audio,
    and generates subtitle files.

    Args:
        uploaded_file: Path to the media file to transcribe.
        source_language: A LANGUAGE_CODE key, or "Auto" to let Whisper
            detect the language.

    Returns:
        tuple: (clean_srt_path, custom_srt_path, word_srt_path,
        shorts_srt_path, txt_path, transcript_text, sentence_json,
        shorts_json, detected_language)
    """

    model = load_whisper_model()


    # 2. Process audio file
    # audio_file_path = get_audio_file(uploaded_file)
    audio_file_path=uploaded_file

    # 3. Transcribe (with word-level timestamps in both branches)
    detected_language = source_language
    if source_language == "Auto":
        segments, info = model.transcribe(audio_file_path, word_timestamps=True)
        detected_lang_code = info.language
        detected_language = get_language_name(detected_lang_code)
    else:
        lang_code = LANGUAGE_CODE[source_language]
        segments, _ = model.transcribe(audio_file_path, word_timestamps=True, language=lang_code)

    sentence_timestamps, word_timestamps, transcript_text = format_segments(segments)

    # 4. Cleanup
    # if os.path.exists(audio_file_path):
    #     os.remove(audio_file_path)
    # NOTE(review): `del model` only removes this local reference; the
    # module-level `whisper_model` global still holds the model, so memory is
    # not actually released here — confirm whether the global should be reset.
    del model
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # 5. Prepare output file paths (base name truncated to 30 chars; all
    # variants derive from one slugified/uniquified .srt path)
    base_filename = os.path.splitext(os.path.basename(uploaded_file))[0][:30]
    srt_base = f"{SUBTITLE_FOLDER}/{base_filename}_{detected_language}.srt"
    clean_srt_path = clean_file_name(srt_base)
    txt_path = clean_srt_path.replace(".srt", ".txt")
    word_srt_path = clean_srt_path.replace(".srt", "_word_level.srt")
    custom_srt_path = clean_srt_path.replace(".srt", "_Multiline.srt")
    shorts_srt_path = clean_srt_path.replace(".srt", "_shorts.srt")

    # 6. Generate all subtitle files
    generate_srt_from_sentences(sentence_timestamps, srt_path=clean_srt_path)
    word_level_srt(word_timestamps, srt_path=word_srt_path)
    # Vertical/shorts style: one line, ~2 s, max 17 chars per line.
    shorts_json=write_sentence_srt(
        word_timestamps, output_file=shorts_srt_path, max_lines=1,
        max_duration_s=2.0, max_chars_per_line=17
    )
    # Standard multi-line style: two lines, up to 7 s, 38 chars per line.
    sentence_json=write_sentence_srt(
        word_timestamps, output_file=custom_srt_path, max_lines=2,
        max_duration_s=7.0, max_chars_per_line=38
    )

    with open(txt_path, 'w', encoding='utf-8') as f:
        f.write(transcript_text)

    return (
        clean_srt_path, custom_srt_path, word_srt_path, shorts_srt_path,
        txt_path, transcript_text, sentence_json,shorts_json,detected_language
    )
278
+
279
+
280
+ # ==============================================================================
281
+ # --- 5. SUBTITLE GENERATION & FORMATTING
282
+ # ==============================================================================
283
+
284
def convert_time_to_srt_format(seconds):
    """Convert a float seconds value to SRT's HH:MM:SS,mmm representation."""
    hrs = int(seconds // 3600)
    mins = int((seconds % 3600) // 60)
    whole_secs = int(seconds % 60)
    millis = round((seconds - int(seconds)) * 1000)

    # Millisecond rounding can overflow to 1000; cascade the carry upward.
    if millis == 1000:
        millis = 0
        whole_secs += 1
    if whole_secs == 60:
        whole_secs = 0
        mins += 1
    if mins == 60:
        mins = 0
        hrs += 1

    return f"{hrs:02}:{mins:02}:{whole_secs:02},{millis:03}"
300
+
301
def split_line_by_char_limit(text, max_chars_per_line=38):
    """Greedy word-wrap: pack words into lines of <= max_chars_per_line.

    A single word longer than the limit stays alone on its (over-long) line.
    """
    lines = []
    line = ""
    for token in text.split():
        if not line:
            line = token
        elif len(line) + 1 + len(token) <= max_chars_per_line:
            line = f"{line} {token}"
        else:
            lines.append(line)
            line = token
    if line:
        lines.append(line)
    return lines
317
+
318
def merge_punctuation_glitches(subtitles):
    """Cleans up punctuation artifacts at the boundaries of subtitle entries.

    Mutates the entry dicts in place: leading punctuation on an entry is
    moved to the end of the previous one, quote/colon characters are removed,
    and entries left empty (or punctuation-only) are absorbed into their
    predecessor by extending its end time.
    """
    if not subtitles:
        return []

    cleaned = [subtitles[0]]
    for i in range(1, len(subtitles)):
        prev = cleaned[-1]
        curr = subtitles[i]

        prev_text = prev["text"].rstrip()
        curr_text = curr["text"].lstrip()

        # Leading punctuation on the current entry belongs to the previous one.
        match = re.match(r'^([,.:;!?]+)(\s*)(.+)', curr_text)
        if match:
            punct, _, rest = match.groups()
            # Only move the marks if prev doesn't already end with one of them.
            if not prev_text.endswith(tuple(punct)):
                prev["text"] = prev_text + punct
            curr_text = rest.strip()

        # Characters that render badly when burned into video frames.
        unwanted_chars = ['"', '“', '”', ';', ':']
        for ch in unwanted_chars:
            curr_text = curr_text.replace(ch, '')
        curr_text = curr_text.strip()

        # Nothing meaningful left: fold this entry's time span into the previous.
        if not curr_text or re.fullmatch(r'[.,!?]+', curr_text):
            prev["end"] = curr["end"]
            continue

        curr["text"] = curr_text
        prev["text"] = prev["text"].replace('"', '').replace('“', '').replace('”', '')
        cleaned.append(curr)

    return cleaned
352
+
353
import json
def write_sentence_srt(
    word_level_timestamps, output_file="subtitles_professional.srt", max_lines=2,
    max_duration_s=7.0, max_chars_per_line=38, hard_pause_threshold=0.5,
    merge_pause_threshold=0.4
):
    """Creates professional-grade SRT files and a corresponding timestamp.json file.

    Args:
        word_level_timestamps: List of {"word", "start", "end"} dicts.
        output_file: Destination .srt path; a sibling .json is written too.
        max_lines: Maximum wrapped lines per subtitle entry.
        max_duration_s: Maximum on-screen duration per entry, in seconds.
        max_chars_per_line: Character budget per wrapped line.
        hard_pause_threshold: A silence (s) at or above this forces a new entry.
        merge_pause_threshold: Single-word entries closer than this (s) to the
            previous entry are merged back into it.

    Returns:
        str | None: Path of the written .json file, or None for empty input.
    """
    if not word_level_timestamps:
        return

    # Phase 1: Generate draft subtitles based on timing and length rules
    draft_subtitles = []
    i = 0
    while i < len(word_level_timestamps):
        start_time = word_level_timestamps[i]["start"]

        # We'll now store the full word objects, not just the text
        current_word_objects = []

        j = i
        while j < len(word_level_timestamps):
            entry = word_level_timestamps[j]

            # Create potential text from the word objects
            potential_words = [w["word"] for w in current_word_objects] + [entry["word"]]
            potential_text = " ".join(potential_words)

            # Stop growing this entry when it would exceed the line budget
            # or the on-screen duration budget.
            if len(split_line_by_char_limit(potential_text, max_chars_per_line)) > max_lines: break
            if (entry["end"] - start_time) > max_duration_s and current_word_objects: break

            if j > i:
                prev_entry = word_level_timestamps[j-1]
                pause = entry["start"] - prev_entry["end"]
                # A long silence or end-of-sentence punctuation ends the entry.
                if pause >= hard_pause_threshold: break
                if prev_entry["word"].endswith(('.','!','?')): break

            # Append the full word object
            current_word_objects.append(entry)
            j += 1

        # Guarantee forward progress: a word that fits nowhere still gets
        # its own entry.
        if not current_word_objects:
            current_word_objects.append(word_level_timestamps[i])
            j = i + 1

        text = " ".join([w["word"] for w in current_word_objects])
        end_time = word_level_timestamps[j - 1]["end"]

        # Include the list of word objects in our draft subtitle
        draft_subtitles.append({
            "start": start_time,
            "end": end_time,
            "text": text,
            "words": current_word_objects
        })
        i = j

    # Phase 2: Post-process to merge single-word "orphan" subtitles
    if not draft_subtitles: return
    final_subtitles = [draft_subtitles[0]]
    for k in range(1, len(draft_subtitles)):
        prev_sub = final_subtitles[-1]
        current_sub = draft_subtitles[k]
        is_orphan = len(current_sub["text"].split()) == 1
        pause_from_prev = current_sub["start"] - prev_sub["end"]

        if is_orphan and pause_from_prev < merge_pause_threshold:
            merged_text = prev_sub["text"] + " " + current_sub["text"]
            # Only merge when the combined text still fits the line budget.
            if len(split_line_by_char_limit(merged_text, max_chars_per_line)) <= max_lines:
                prev_sub["text"] = merged_text
                prev_sub["end"] = current_sub["end"]

                # Merge the word-level data as well
                prev_sub["words"].extend(current_sub["words"])
                continue

        final_subtitles.append(current_sub)

    final_subtitles = merge_punctuation_glitches(final_subtitles)
    # print(final_subtitles)
    # ==============================================================================
    # NEW CODE BLOCK: Generate JSON data and write files
    # ==============================================================================

    # This dictionary will hold the data for our JSON file
    timestamps_data = {}

    # Phase 3: Write the final SRT file (and prepare JSON data)
    with open(output_file, "w", encoding="utf-8") as f:
        for idx, sub in enumerate(final_subtitles, start=1):
            # --- SRT Writing (Unchanged) ---
            text = sub["text"].replace(" ,", ",").replace(" .", ".")
            formatted_lines = split_line_by_char_limit(text, max_chars_per_line)
            start_time_str = convert_time_to_srt_format(sub['start'])
            end_time_str = convert_time_to_srt_format(sub['end'])

            f.write(f"{idx}\n")
            f.write(f"{start_time_str} --> {end_time_str}\n")
            f.write("\n".join(formatted_lines) + "\n\n")

            # --- JSON Data Population (New) ---
            # Create the list of word dictionaries for the current subtitle
            word_data = []
            for word_obj in sub["words"]:
                word_data.append({
                    "word": word_obj["word"],
                    "start": convert_time_to_srt_format(word_obj["start"]),
                    "end": convert_time_to_srt_format(word_obj["end"])
                })

            # Add the complete entry to our main dictionary
            timestamps_data[str(idx)] = {
                "text": "\n".join(formatted_lines),
                "start": start_time_str,
                "end": end_time_str,
                "words": word_data
            }

    # Write the collected data to the JSON file
    json_output_file = output_file.replace(".srt",".json")
    with open(json_output_file, "w", encoding="utf-8") as f_json:
        json.dump(timestamps_data, f_json, indent=4, ensure_ascii=False)

    # print(f"Successfully generated SRT file: {output_file}")
    # print(f"Successfully generated JSON file: {json_output_file}")
    return json_output_file
478
+
479
def write_subtitles_to_file(subtitles, filename="subtitles.srt"):
    """Write a {id: {start, end, text}} mapping to a standard SRT file.

    Entries missing a start or end timestamp are skipped with a log line.
    """
    with open(filename, 'w', encoding='utf-8') as out:
        for sub_id, sub in subtitles.items():
            if sub['start'] is None or sub['end'] is None:
                print(f"Skipping subtitle ID {sub_id} due to missing timestamps.")
                continue
            begin = convert_time_to_srt_format(sub['start'])
            finish = convert_time_to_srt_format(sub['end'])
            out.write(f"{sub_id}\n")
            out.write(f"{begin} --> {finish}\n")
            out.write(f"{sub['text']}\n\n")
491
+
492
def word_level_srt(words_timestamp, srt_path="word_level_subtitle.srt", shorts=False):
    """Write one subtitle entry per word, stripping decorative punctuation.

    A bare "i" is capitalized; hyphens are removed unless `shorts` is True.
    """
    strip_punct = re.compile(r'[.,!?;:"\–—_~^+*|]')
    with open(srt_path, 'w', encoding='utf-8') as out:
        for index, info in enumerate(words_timestamp, start=1):
            begin = convert_time_to_srt_format(info['start'])
            finish = convert_time_to_srt_format(info['end'])
            token = strip_punct.sub('', info['word'])
            if token.strip().lower() == 'i':
                token = "I"
            if not shorts:
                token = token.replace("-", "")
            out.write(f"{index}\n{begin} --> {finish}\n{token}\n\n")
503
+
504
def generate_srt_from_sentences(sentence_timestamp, srt_path="default_subtitle.srt"):
    """Write sentence-level timestamps as a standard SRT file."""
    with open(srt_path, 'w', encoding='utf-8') as out:
        for number, item in enumerate(sentence_timestamp, start=1):
            begin = convert_time_to_srt_format(item['start'])
            finish = convert_time_to_srt_format(item['end'])
            out.write(f"{number}\n{begin} --> {finish}\n{item['text']}\n\n")
511
+
512
+
513
+
514
+
515
+ # ==============================================================================
516
+ # --- 7. MAIN ORCHESTRATOR FUNCTION
517
+ # ==============================================================================
518
+
519
def subtitle_maker(media_file, source_lang):
    """
    The main entry point to generate subtitles for a media file.

    Args:
        media_file (str): Path to the input media file.
        source_lang (str): The source language name, or 'Auto' for detection.

    Returns:
        tuple: (default_srt, custom_srt, word_srt, shorts_srt, txt_path,
        sentence_json, word_json, transcript, detected_lang). On failure the
        path slots are None and the final element is the error message.
    """
    try:
        (
            default_srt, custom_srt, word_srt, shorts_srt,
            txt_path, transcript, sentence_json, word_json, detected_lang,
        ) = whisper_subtitle(media_file, source_lang)
    except Exception as e:
        print(f"❌ An error occurred during transcription: {e}")
        return (None, None, None, None, None, None, None, None, f"Error: {e}")

    return (
        default_srt, custom_srt, word_srt,
        shorts_srt, txt_path, sentence_json, word_json, transcript, detected_lang,
    )
546
+
547
+
548
+ # ==============================================================================
549
+ # --- 8. INITIALIZATION
550
+ # ==============================================================================
551
+ os.makedirs(SUBTITLE_FOLDER, exist_ok=True)
552
+ os.makedirs(TEMP_FOLDER, exist_ok=True)
553
+
554
+
555
+ # from subtitle import subtitle_maker
556
+ # media_file = "/content/output.mp3"
557
+ # source_lang = "Auto" #"English"
558
+
559
+ # default_srt, custom_srt, word_srt,shorts_srt, txt_path,sentence_json,word_json, transcript,detected_lang= subtitle_maker(
560
+ # media_file, source_lang
561
+ # )
562
+
563
+
564
+ # default_srt -> Original subtitles generated directly by Whisper-Large-V3-Turbo-CT2
565
+ # custom_srt -> Modified version of default subtitles with shorter segments
566
+ # (better readability for horizontal videos, Maximum 38 characters per segment. )
567
+ # word_srt -> Word-level timestamps (useful for creating YouTube Shorts/Reels)
568
+ # shorts_srt -> Optimized subtitles for vertical videos (displays 3–4 words at a time , Maximum 17 characters per segment.)
569
+ # txt_path -> Full transcript as plain text (useful for video summarization or for asking questions about the video or audio data with other LLM tools)
570
+ # sentence_json,word_json --> To Generate .ass file later
571
+ # transcript -> Transcript text directly returned by the function, if you just need the transcript
572
+ # detected_lang -> Detected Lang
573
+ # All functionality is contained in a single file, making it portable
574
+ # and reusable across multiple projects for different purposes.