# Qwen3-TTS-Colab / app.py
# Source: NeuralFalcon's Hugging Face Space (commit 7d4f32b, verified)
# %cd /content/Qwen3-TTS-Colab
#huggingface display
# Hugging Face model hub display list (reference only; the app builds repo
# ids dynamically in get_model_path).
model_list = [
    "Qwen/Qwen3-TTS-12Hz-1.7B-Base",
    # FIX: a missing trailing comma here previously triggered implicit string
    # concatenation, merging this entry with the next into one bogus id.
    "Qwen/Qwen3-TTS-12Hz-0.6B-Base",
    "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice",
    "Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice",
    "Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign",
]
from subtitle import subtitle_maker
from process_text import text_chunk
from qwen_tts import Qwen3TTSModel
import subprocess
import os
import gradio as gr
import numpy as np
import torch
import soundfile as sf
from pydub import AudioSegment
from pydub.silence import split_on_silence
from huggingface_hub import snapshot_download
from hf_downloader import download_model
import gc
from huggingface_hub import login
# Authenticate with the Hugging Face Hub when a token is supplied via the
# HF_TOKEN environment variable; otherwise proceed anonymously.
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    HF_TOKEN = None  # normalize "" / unset to None
else:
    login(token=HF_TOKEN)
# Global model holders
# Cache of loaded TTS models keyed by (model_type, model_size); at most one
# entry is kept resident at a time (see clear_other_models) to avoid OOM.
loaded_models = {}
MODEL_SIZES = ["0.6B", "1.7B"]
# Speaker and language choices
# Preset speaker names exposed by the CustomVoice checkpoints.
SPEAKERS = [
    "Aiden", "Dylan", "Eric", "Ono_anna", "Ryan", "Serena", "Sohee", "Uncle_fu", "Vivian"
]
LANGUAGES = ["Auto", "Chinese", "English", "Japanese", "Korean", "French", "German", "Spanish", "Portuguese", "Russian"]
# --- Helper Functions ---
def get_model_path(model_type: str, model_size: str) -> str:
    """Resolve a local checkpoint path for the requested Qwen3-TTS variant.

    Tries the Hugging Face cache via ``snapshot_download`` first and falls
    back to the project downloader when that fails (e.g. offline or auth
    errors).
    """
    repo_id = f"Qwen/Qwen3-TTS-12Hz-{model_size}-{model_type}"
    try:
        return snapshot_download(repo_id)
    except Exception:
        return download_model(repo_id, download_folder="./qwen_tts_model", redownload=False)
def clear_other_models(keep_key=None):
    """Evict every cached model except *keep_key* and reclaim memory.

    Keeps at most one model resident so Colab GPUs do not run out of memory.

    FIX: the previous version deleted each key twice (a ``del`` loop followed
    by a ``pop`` loop over the same key list); one ``pop`` pass suffices and
    is safe even for keys that are already gone.
    """
    global loaded_models
    # Snapshot the keys first: mutating a dict while iterating it raises.
    for k in [k for k in loaded_models if k != keep_key]:
        loaded_models.pop(k, None)
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
def get_model(model_type: str, model_size: str):
    """Return a cached Qwen3TTSModel for (*model_type*, *model_size*), loading it on first use.

    Before loading, all other resident models are evicted to keep peak GPU
    memory low on Colab.
    """
    global loaded_models
    key = (model_type, model_size)
    cached = loaded_models.get(key)
    if cached is not None:
        return cached
    clear_other_models(keep_key=key)
    model = Qwen3TTSModel.from_pretrained(
        get_model_path(model_type, model_size),
        device_map="cuda",
        dtype=torch.bfloat16,
    )
    loaded_models[key] = model
    return model
def _normalize_audio(wav, eps=1e-12, clip=True):
"""Normalize audio to float32 in [-1, 1] range."""
x = np.asarray(wav)
if np.issubdtype(x.dtype, np.integer):
info = np.iinfo(x.dtype)
if info.min < 0:
y = x.astype(np.float32) / max(abs(info.min), info.max)
else:
mid = (info.max + 1) / 2.0
y = (x.astype(np.float32) - mid) / mid
elif np.issubdtype(x.dtype, np.floating):
y = x.astype(np.float32)
m = np.max(np.abs(y)) if y.size else 0.0
if m > 1.0 + 1e-6:
y = y / (m + eps)
else:
raise TypeError(f"Unsupported dtype: {x.dtype}")
if clip:
y = np.clip(y, -1.0, 1.0)
if y.ndim > 1:
y = np.mean(y, axis=-1).astype(np.float32)
return y
def _audio_to_tuple(audio):
"""Convert Gradio audio input to (wav, sr) tuple."""
if audio is None: return None
if isinstance(audio, str):
try:
wav, sr = sf.read(audio)
wav = _normalize_audio(wav)
return wav, int(sr)
except Exception as e:
print(f"Error reading audio file: {e}")
return None
if isinstance(audio, tuple) and len(audio) == 2 and isinstance(audio[0], int):
sr, wav = audio
wav = _normalize_audio(wav)
return wav, int(sr)
if isinstance(audio, dict) and "sampling_rate" in audio and "data" in audio:
sr = int(audio["sampling_rate"])
wav = _normalize_audio(audio["data"])
return wav, sr
return None
def transcribe_reference(audio_path, mode_input, language="English"):
    """Transcribe a reference clip with ``subtitle_maker``.

    Runs only when *mode_input* requests it (a truthy bool, or a mode string
    containing "High-Quality"); otherwise returns ``gr.update()`` so the
    bound textbox is left untouched.
    """
    run = mode_input if isinstance(mode_input, bool) else (
        isinstance(mode_input, str) and "High-Quality" in mode_input
    )
    if not audio_path or not run:
        return gr.update()
    print(f"Starting transcription for: {audio_path}")
    src_lang = "English" if language == "Auto" else language
    try:
        results = subtitle_maker(audio_path, src_lang)
        # Index 7 carries the plain-text transcript.
        transcript = results[7]
        return transcript if transcript else "Could not detect speech."
    except Exception as e:
        print(f"Transcription Error: {e}")
        return f"Error during transcription: {str(e)}"
# --- Audio Processing Utils (Disk Based) ---
def remove_silence_function(file_path, minimum_silence=100):
    """Strip silent gaps from a WAV file and write a ``*_no_silence.wav`` copy.

    Returns the path of the cleaned file, or the original *file_path*
    unchanged when anything goes wrong (best-effort).
    """
    try:
        output_path = file_path.replace(".wav", "_no_silence.wav")
        audio = AudioSegment.from_wav(file_path)
        chunks = split_on_silence(
            audio,
            min_silence_len=minimum_silence,
            silence_thresh=-45,  # dBFS level below which audio counts as silence
            keep_silence=50,     # keep 50 ms of padding around each chunk
        )
        result = AudioSegment.empty()
        for piece in chunks:
            result = result + piece
        result.export(output_path, format="wav")
        return output_path
    except Exception as e:
        print(f"Error removing silence: {e}")
        return file_path
def process_audio_output(audio_path, make_subtitle, remove_silence, language="Auto"):
    """Apply optional post-processing: silence removal and subtitle export.

    Returns ``(audio_path, default_srt, custom_srt, word_srt, shorts_srt)``;
    the SRT entries are None when subtitles are disabled or generation fails.
    """
    # 1. Optionally strip silence (falls back to the input on failure).
    final_audio_path = remove_silence_function(audio_path) if remove_silence else audio_path
    # 2. Optionally generate the four subtitle variants.
    default_srt = custom_srt = word_srt = shorts_srt = None
    if make_subtitle:
        try:
            results = subtitle_maker(final_audio_path, language)
            default_srt = results[0]
            custom_srt = results[1]
            word_srt = results[2]
            shorts_srt = results[3]
        except Exception as e:
            print(f"Subtitle generation error: {e}")
    return final_audio_path, default_srt, custom_srt, word_srt, shorts_srt
def stitch_chunk_files(chunk_files, output_filename):
    """Concatenate a list of WAV chunk files into *output_filename*.

    Chunks that fail to load are skipped with a warning. The temporary
    chunk files are deleted afterwards (best-effort). Returns the output
    path, or None when no chunks were supplied.
    """
    if not chunk_files:
        return None
    print(f"Stitching {len(chunk_files)} audio files...")
    combined = AudioSegment.empty()
    for path in chunk_files:
        try:
            combined += AudioSegment.from_wav(path)
        except Exception as e:
            print(f"Error appending chunk {path}: {e}")
    combined.export(output_filename, format="wav")
    # Remove the per-chunk temp files now that they are merged.
    for path in chunk_files:
        try:
            if os.path.exists(path):
                os.remove(path)
        except Exception as e:
            print(f"Warning: Could not delete temp file {path}: {e}")
    return output_filename
# --- Generators (Memory Optimized) ---
def generate_voice_design(text, language, voice_description, remove_silence, make_subs):
    """Voice-Design tab handler: synthesize *text* with a described voice.

    Returns ``(audio_path, status, srt1, srt2, srt3, srt4)``.
    """
    if not text or not text.strip():
        return None, "Error: Text is required.", None, None, None, None
    try:
        # 1. Split long input into <=280-char pieces.
        pieces, tts_filename = text_chunk(text, language, char_limit=280)
        print(f"Processing {len(pieces)} chunks...")
        tts = get_model("VoiceDesign", "1.7B")
        part_files = []
        # 2. Generate each piece, flushing to disk immediately so the
        #    waveform can be freed and GPU memory stays low.
        for idx, piece in enumerate(pieces):
            wavs, sr = tts.generate_voice_design(
                text=piece.strip(),
                language=language,
                instruct=voice_description.strip(),
                non_streaming_mode=True,
                max_new_tokens=2048,
            )
            part_path = f"temp_chunk_{idx}_{os.getpid()}.wav"
            sf.write(part_path, wavs[0], sr)
            part_files.append(part_path)
            del wavs
            torch.cuda.empty_cache()
            gc.collect()
        # 3. Merge the chunk files from disk.
        stitched = stitch_chunk_files(part_files, tts_filename)
        # 4. Post-process (silence removal / subtitles).
        final_audio, srt1, srt2, srt3, srt4 = process_audio_output(
            stitched, make_subs, remove_silence, language
        )
        return final_audio, "Generation Success!", srt1, srt2, srt3, srt4
    except Exception as e:
        return None, f"Error: {e}", None, None, None, None
def generate_custom_voice(text, language, speaker, instruct, model_size, remove_silence, make_subs):
    """CustomVoice tab handler: synthesize *text* with a preset speaker.

    Returns ``(audio_path, status, srt1, srt2, srt3, srt4)``.
    """
    if not text or not text.strip():
        return None, "Error: Text is required.", None, None, None, None
    try:
        pieces, tts_filename = text_chunk(text, language, char_limit=280)
        tts = get_model("CustomVoice", model_size)
        # Speaker names are stored lowercase with underscores in the model.
        speaker_id = speaker.lower().replace(" ", "_")
        style = instruct.strip() if instruct else None
        part_files = []
        for idx, piece in enumerate(pieces):
            wavs, sr = tts.generate_custom_voice(
                text=piece.strip(),
                language=language,
                speaker=speaker_id,
                instruct=style,
                non_streaming_mode=True,
                max_new_tokens=2048,
            )
            # Flush each piece to disk right away so GPU memory stays low.
            part_path = f"temp_custom_{idx}_{os.getpid()}.wav"
            sf.write(part_path, wavs[0], sr)
            part_files.append(part_path)
            del wavs
            torch.cuda.empty_cache()
            gc.collect()
        stitched = stitch_chunk_files(part_files, tts_filename)
        final_audio, srt1, srt2, srt3, srt4 = process_audio_output(
            stitched, make_subs, remove_silence, language
        )
        return final_audio, "Generation Success!", srt1, srt2, srt3, srt4
    except Exception as e:
        return None, f"Error: {e}", None, None, None, None
def smart_generate_clone(ref_audio, ref_text, target_text, language, mode, model_size, remove_silence, make_subs):
    """Voice-Clone tab handler: speak *target_text* in the reference voice.

    Returns ``(audio_path, status, srt1, srt2, srt3, srt4)``.
    """
    if not target_text or not target_text.strip(): return None, "Error: Target text is required.", None, None, None, None
    if not ref_audio: return None, "Error: Ref audio required.", None, None, None, None
    # 1. Mode & Transcript Logic
    # "Fast" mode clones from the speaker embedding (x-vector) alone and needs
    # no transcript; "High-Quality" mode also conditions on the reference text
    # and auto-transcribes it when the user left the textbox empty.
    use_xvector_only = ("Fast" in mode)
    final_ref_text = ref_text
    audio_tuple = _audio_to_tuple(ref_audio)
    if not use_xvector_only:
        if not final_ref_text or not final_ref_text.strip():
            print("Auto-transcribing reference...")
            try:
                final_ref_text = transcribe_reference(ref_audio, True, language)
                # transcribe_reference reports failures as strings containing "Error".
                if not final_ref_text or "Error" in final_ref_text:
                    return None, f"Transcription failed: {final_ref_text}", None, None, None, None
            except Exception as e:
                return None, f"Transcribe Error: {e}", None, None, None, None
    else:
        final_ref_text = None
    try:
        # 2. Chunk Target Text (<=280 chars per generation call)
        text_chunks, tts_filename = text_chunk(target_text, language, char_limit=280)
        chunk_files = []
        tts = get_model("Base", model_size)
        # 3. Generate Loop
        for i, chunk in enumerate(text_chunks):
            wavs, sr = tts.generate_voice_clone(
                text=chunk.strip(),
                language=language,
                ref_audio=audio_tuple,
                ref_text=final_ref_text.strip() if final_ref_text else None,
                x_vector_only_mode=use_xvector_only,
                max_new_tokens=2048,
            )
            # Save immediately so the waveform can be released before the next chunk
            temp_filename = f"temp_clone_{i}_{os.getpid()}.wav"
            sf.write(temp_filename, wavs[0], sr)
            chunk_files.append(temp_filename)
            # Clear memory
            del wavs
            torch.cuda.empty_cache()
            gc.collect()
        # 4. Stitch & Process
        stitched_file = stitch_chunk_files(chunk_files, tts_filename)
        final_audio, srt1, srt2, srt3, srt4 = process_audio_output(stitched_file, make_subs, remove_silence, language)
        return final_audio, f"Success! Mode: {mode}", srt1, srt2, srt3, srt4
    except Exception as e:
        return None, f"Error: {e}", None, None, None, None
# --- UI Construction ---
def on_mode_change(mode):
    """Show the reference-text box only for the High-Quality clone mode."""
    show_ref_text = "High-Quality" in mode
    return gr.update(visible=show_ref_text)
def build_ui():
    """Construct and return the full Gradio Blocks interface.

    Four tabs: Voice Design (described voices), Voice Clone (reference-audio
    cloning via the Base model), TTS CustomVoice (preset speakers), and an
    About page. Each generation tab wires its button to the matching
    generator function defined above.
    """
    theme = gr.themes.Soft(font=[gr.themes.GoogleFont("Source Sans Pro"), "Arial", "sans-serif"])
    css = ".gradio-container {max-width: none !important;} .tab-content {padding: 20px;}"
    with gr.Blocks(theme=theme, css=css, title="Qwen3-TTS Demo") as demo:
        gr.HTML("""
        <div style="text-align: center; margin: 20px auto; max-width: 800px;">
        <h1 style="font-size: 2.5em; margin-bottom: 5px;">๐ŸŽ™๏ธ Qwen3-TTS </h1>
        <a href="https://colab.research.google.com/github/NeuralFalconYT/Qwen3-TTS-Colab/blob/main/Qwen3_TTS_Colab.ipynb" target="_blank" style="display: inline-block; padding: 10px 20px; background-color: #4285F4; color: white; border-radius: 6px; text-decoration: none; font-size: 1em;">๐Ÿฅณ Run on Google Colab</a>
        </div>""")
        with gr.Tabs():
            # --- Tab 1: Voice Design ---
            with gr.Tab("Voice Design"):
                with gr.Row():
                    # Left column: inputs
                    with gr.Column(scale=2):
                        design_text = gr.Textbox(label="Text to Synthesize", lines=4, value="It's in the top drawer... wait, it's empty? No way, that's impossible! I'm sure I put it there!",
                                                 placeholder="Enter the text you want to convert to speech...")
                        design_language = gr.Dropdown(label="Language", choices=LANGUAGES, value="Auto")
                        design_instruct = gr.Textbox(label="Voice Description", lines=3, placeholder="Describe the voice characteristics you want...",
                                                     value="Speak in an incredulous tone, but with a hint of panic beginning to creep into your voice.")
                        design_btn = gr.Button("Generate with Custom Voice", variant="primary")
                        with gr.Accordion("More options", open=False):
                            with gr.Row():
                                design_rem_silence = gr.Checkbox(label="Remove Silence", value=False)
                                design_make_subs = gr.Checkbox(label="Generate Subtitles", value=False)
                    # Right column: outputs
                    with gr.Column(scale=2):
                        design_audio_out = gr.Audio(label="Generated Audio", type="filepath")
                        design_status = gr.Textbox(label="Status", interactive=False)
                        with gr.Accordion("๐Ÿ“ Subtitles", open=False):
                            with gr.Row():
                                d_srt1 = gr.File(label="Original (Whisper)")
                                d_srt2 = gr.File(label="Readable")
                            with gr.Row():
                                d_srt3 = gr.File(label="Word-level")
                                d_srt4 = gr.File(label="Shorts/Reels")
                design_btn.click(
                    generate_voice_design,
                    inputs=[design_text, design_language, design_instruct, design_rem_silence, design_make_subs],
                    outputs=[design_audio_out, design_status, d_srt1, d_srt2, d_srt3, d_srt4]
                )
            # --- Tab 2: Voice Clone ---
            with gr.Tab("Voice Clone (Base)"):
                with gr.Row():
                    with gr.Column(scale=2):
                        clone_target_text = gr.Textbox(label="Target Text", lines=3, placeholder="Enter the text you want the cloned voice to speak...")
                        clone_ref_audio = gr.Audio(label="Reference Audio (Upload a voice sample to clone)", type="filepath")
                        with gr.Row():
                            clone_language = gr.Dropdown(label="Language", choices=LANGUAGES, value="Auto",scale=1)
                            clone_model_size = gr.Dropdown(label="Model Size", choices=MODEL_SIZES, value="1.7B",scale=1)
                            clone_mode = gr.Dropdown(
                                label="Mode",
                                choices=["High-Quality (Audio + Transcript)", "Fast (Audio Only)"],
                                value="High-Quality (Audio + Transcript)",
                                interactive=True,
                                scale=2
                            )
                        # Visible only in High-Quality mode (see on_mode_change).
                        clone_ref_text = gr.Textbox(label="Reference Text", lines=2, visible=True)
                        clone_btn = gr.Button("Clone & Generate", variant="primary")
                        with gr.Accordion("More options", open=False):
                            with gr.Row():
                                clone_rem_silence = gr.Checkbox(label="Remove Silence", value=False)
                                clone_make_subs = gr.Checkbox(label="Generate Subtitles", value=False)
                    with gr.Column(scale=2):
                        clone_audio_out = gr.Audio(label="Generated Audio", type="filepath")
                        clone_status = gr.Textbox(label="Status", interactive=False)
                        with gr.Accordion("๐Ÿ“ Subtitles", open=False):
                            with gr.Row():
                                c_srt1 = gr.File(label="Original")
                                c_srt2 = gr.File(label="Readable")
                            with gr.Row():
                                c_srt3 = gr.File(label="Word-level")
                                c_srt4 = gr.File(label="Shorts/Reels")
                # Toggle the reference-text box with the mode, and auto-transcribe
                # whenever a new reference clip is uploaded in High-Quality mode.
                clone_mode.change(on_mode_change, inputs=[clone_mode], outputs=[clone_ref_text])
                clone_ref_audio.change(transcribe_reference, inputs=[clone_ref_audio, clone_mode, clone_language], outputs=[clone_ref_text])
                clone_btn.click(
                    smart_generate_clone,
                    inputs=[clone_ref_audio, clone_ref_text, clone_target_text, clone_language, clone_mode, clone_model_size, clone_rem_silence, clone_make_subs],
                    outputs=[clone_audio_out, clone_status, c_srt1, c_srt2, c_srt3, c_srt4]
                )
            # --- Tab 3: TTS (CustomVoice) ---
            with gr.Tab("TTS (CustomVoice)"):
                with gr.Row():
                    with gr.Column(scale=2):
                        tts_text = gr.Textbox(label="Text", lines=4, placeholder="Enter the text you want to convert to speech...",
                                              value="Hello! Welcome to Text-to-Speech system. This is a demo of our TTS capabilities.")
                        with gr.Row():
                            tts_language = gr.Dropdown(label="Language", choices=LANGUAGES, value="English")
                            tts_speaker = gr.Dropdown(label="Speaker", choices=SPEAKERS, value="Ryan")
                        with gr.Row():
                            tts_instruct = gr.Textbox(label="Style Instruction (Optional)", lines=2,placeholder="e.g., Speak in a cheerful and energetic tone")
                            tts_model_size = gr.Dropdown(label="Size", choices=MODEL_SIZES, value="1.7B")
                        tts_btn = gr.Button("Generate Speech", variant="primary")
                        with gr.Accordion("More options", open=False):
                            with gr.Row():
                                tts_rem_silence = gr.Checkbox(label="Remove Silence", value=False)
                                tts_make_subs = gr.Checkbox(label="Generate Subtitles", value=False)
                    with gr.Column(scale=2):
                        tts_audio_out = gr.Audio(label="Generated Audio", type="filepath")
                        tts_status = gr.Textbox(label="Status", interactive=False)
                        with gr.Accordion("๐Ÿ“ Subtitles", open=False):
                            with gr.Row():
                                t_srt1 = gr.File(label="Original")
                                t_srt2 = gr.File(label="Readable")
                            with gr.Row():
                                t_srt3 = gr.File(label="Word-level")
                                t_srt4 = gr.File(label="Shorts/Reels")
                tts_btn.click(
                    generate_custom_voice,
                    inputs=[tts_text, tts_language, tts_speaker, tts_instruct, tts_model_size, tts_rem_silence, tts_make_subs],
                    outputs=[tts_audio_out, tts_status, t_srt1, t_srt2, t_srt3, t_srt4]
                )
            # --- Tab 4: About ---
            with gr.Tab("About"):
                gr.Markdown("""
                # Qwen3-TTS
                A unified Text-to-Speech demo featuring three powerful modes:
                - **Voice Design**: Create custom voices using natural language descriptions
                - **Voice Clone (Base)**: Clone any voice from a reference audio
                - **TTS (CustomVoice)**: Generate speech with predefined speakers and optional style instructions
                Built with [Qwen3-TTS](https://github.com/QwenLM/Qwen3-TTS) by Alibaba Qwen Team.
                """)
                gr.HTML("""
                <hr>
                <p style="color: red; font-weight: bold; font-size: 16px;">
                โš ๏ธ NOTE
                </p>
                <p>
                This Gradio UI is not affiliated with the official Qwen3-TTS project and is based on the
                official Qwen3-TTS demo UI:<br>
                <a href="https://huggingface.co/spaces/Qwen/Qwen3-TTS" target="_blank">
                https://huggingface.co/spaces/Qwen/Qwen3-TTS
                </a>
                </p>
                <p><b>Additional features:</b></p>
                <ul>
                <li>Automatic transcription support using faster-whisper-large-v3-turbo-ct2</li>
                <li>Long text input support</li>
                <li>Because we are using Whisper, subtitles are also added</li>
                </ul>
                """)
    return demo
import click


@click.command()
@click.option("--debug", is_flag=True, default=False, help="Enable debug mode.")
@click.option("--share", is_flag=True, default=False, help="Enable sharing of the interface.")
def main(share, debug):
    """CLI entry point: build the UI and launch it with a request queue.

    FIX: the previous version unconditionally called
    ``demo.launch(share=True, debug=True)`` before the queued launch, which
    ignored the --share/--debug CLI flags and attempted to launch the same
    Blocks app twice.
    """
    demo = build_ui()
    demo.queue().launch(debug=debug, share=share)


if __name__ == "__main__":
    main()