Spaces:

atalink
/

TTS-Talker

Runtime error

App Files Files Community

longtq committed on Sep 16

Commit

2bf87e7

1 Parent(s): c393370

Fix build error

Browse files

Files changed (14) hide show

.python-version +1 -0
.vscode/shortcuts.json +1 -0
Dockerfile +2 -2
README.md +12 -1
app.py +60 -194
app_tts.py +126 -22
examples/vocab.txt +2566 -0
requirements.txt +7 -5
src/face3d/models/arcface_torch/inference.py +8 -1
src/face3d/models/networks.py +10 -1
src/face3d/util/util.py +0 -1
src/gradio_demo.py +9 -1
src/test_audio2coeff.py +5 -3
src/utils/model2safetensor.py +6 -3

.python-version ADDED Viewed

	@@ -0,0 +1 @@


1	+ 3.10

.vscode/shortcuts.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ []

Dockerfile CHANGED Viewed

@@ -43,7 +43,7 @@ RUN pyenv install ${PYTHON_VERSION} && \
     pyenv rehash && \
     pip install --no-cache-dir -U pip setuptools wheel
-RUN pip install --no-cache-dir -U torch==1.12.1 torchvision==0.13.1
 COPY --chown=1000 requirements.txt /tmp/requirements.txt
 RUN pip install --no-cache-dir -U -r /tmp/requirements.txt
@@ -56,4 +56,4 @@ ENV PYTHONPATH=${HOME}/app \
     GRADIO_SERVER_NAME=0.0.0.0 \
     GRADIO_THEME=huggingface \
     SYSTEM=spaces
-CMD ["python", "app.py"]

     pyenv rehash && \
     pip install --no-cache-dir -U pip setuptools wheel
+RUN pip install --no-cache-dir -U torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0
 COPY --chown=1000 requirements.txt /tmp/requirements.txt
 RUN pip install --no-cache-dir -U -r /tmp/requirements.txt
     GRADIO_SERVER_NAME=0.0.0.0 \
     GRADIO_THEME=huggingface \
     SYSTEM=spaces
+CMD ["python", "app.py"]

README.md CHANGED Viewed

@@ -10,5 +10,16 @@ pinned: false
 license: mit
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 license: mit
 ---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+Local usage:
+Requires Python 3.10.
+```
+pip install torch==2.4.0+cu124 torchaudio==2.4.0+cu124 --extra-index-url https://download.pytorch.org/whl/cu124
+```
+```
+sudo apt-get update
+sudo apt-get install sox ffmpeg
+```

app.py CHANGED Viewed

@@ -32,215 +32,81 @@ def download_model():
     REPO_ID = 'vinthony/SadTalker-V002rc'
     snapshot_download(repo_id=REPO_ID, local_dir='./checkpoints', local_dir_use_symlinks=True)
-def sadtalker_demo():
-    download_model()
     sad_talker = SadTalker(lazy_load=True)
-    # tts_talker = TTSTalker()
     with gr.Blocks(analytics_enabled=False) as sadtalker_interface:
         gr.Markdown("""
-# 🎤 F5-TTS: Vietnamese Text-to-Speech Synthesis.
-# The model was trained with approximately 1000 hours of data on a RTX 3090 GPU.
-Enter text and upload a sample voice to generate natural speech.
 """)
         with gr.Row():
             ref_audio = gr.Audio(label="🔊 Sample Voice", type="filepath")
             gen_text = gr.Textbox(label="📝 Text", placeholder="Enter the text to generate voice...", lines=3)
         speed = gr.Slider(0.3, 2.0, value=1.0, step=0.1, label="⚡ Speed")
-        btn_synthesize = gr.Button("🔥 Generate Voice")
         with gr.Row():
-            driven_audio = gr.Audio(label="🎧 Generated Audio", type="numpy")
-            output_spectrogram = gr.Image(label="📊 Spectrogram")
-        btn_synthesize.click(infer_tts, inputs=[ref_audio, gen_text, speed], outputs=[driven_audio, output_spectrogram])
-        gr.Markdown("<div align='center'> <h2> 😭 SadTalker: Learning Realistic 3D Motion Coefficients for Stylized Audio-Driven Single Image Talking Face Animation (CVPR 2023) </span> </h2> \
-                    <a style='font-size:18px;color: #efefef' href='https://arxiv.org/abs/2211.12194'>Arxiv</a> &nbsp;&nbsp;&nbsp;&nbsp;&nbsp; \
-                    <a style='font-size:18px;color: #efefef' href='https://sadtalker.github.io'>Homepage</a>  &nbsp;&nbsp;&nbsp;&nbsp;&nbsp; \
-                     <a style='font-size:18px;color: #efefef' href='https://github.com/Winfredy/SadTalker'> Github </div>")
-        gr.Markdown("""
-        <b>You may duplicate the space and upgrade to GPU in settings for better performance and faster inference without waiting in the queue. <a style='display:inline-block' href="https://huggingface.co/spaces/vinthony/SadTalker?duplicate=true"><img src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a></b> \
-        <br/><b>Alternatively, try our GitHub <a href=https://github.com/Winfredy/SadTalker> code </a> on your own GPU. </b> <a style='display:inline-block' href="https://github.com/Winfredy/SadTalker"><img src="https://img.shields.io/github/stars/Winfredy/SadTalker?style=social"/></a> \
-        """)
-        with gr.Row(): #.style(equal_height=False):
-            with gr.Column(variant='panel'):
-                with gr.Tabs(elem_id="sadtalker_source_image"):
-                    with gr.TabItem('Source image'):
-                        with gr.Row():
-                            source_image = gr.Image(label="Source image", source="upload", type="filepath", elem_id="img2img_image") # .style(width=512)
-                with gr.Tabs(elem_id="sadtalker_driven_audio"):
-                    with gr.TabItem('Driving Methods'):
-                        gr.Markdown("Possible driving combinations: <br> 1. Audio only 2. Audio/IDLE Mode + Ref Video(pose, blink, pose+blink) 3. IDLE Mode only 4. Ref Video only (all) ")
-                        with gr.Row():
-                            # driven_audio = gr.Audio(label="Input audio", source="upload", type="filepath", max_length=180) # 180s
-                            driven_audio_no = gr.Audio(label="Use IDLE mode, no audio is required", source="upload", type="filepath", visible=False)
-                            with gr.Column():
-                                use_idle_mode = gr.Checkbox(label="Use Idle Animation")
-                                length_of_audio = gr.Number(value=5, label="The length(seconds) of the generated video.")
-                                use_idle_mode.change(toggle_audio_file, inputs=use_idle_mode, outputs=[driven_audio, driven_audio_no]) # todo
-                        with gr.Row():
-                            ref_video = gr.Video(label="Reference Video", source="upload", type="filepath", elem_id="vidref") # .style(width=512)
-                            with gr.Column():
-                                use_ref_video = gr.Checkbox(label="Use Reference Video")
-                                ref_info = gr.Radio(['pose', 'blink','pose+blink', 'all'], value='pose', label='Reference Video',info="How to borrow from reference Video?((fully transfer, aka, video driving mode))")
-                            ref_video.change(ref_video_fn, inputs=ref_video, outputs=[use_ref_video]) # todo
-            with gr.Column(variant='panel'):
-                with gr.Tabs(elem_id="sadtalker_checkbox"):
-                    with gr.TabItem('Settings'):
-                        gr.Markdown("need help? please visit our [[best practice page](https://github.com/OpenTalker/SadTalker/blob/main/docs/best_practice.md)] for more detials")
-                        with gr.Column(variant='panel'):
-                            # width = gr.Slider(minimum=64, elem_id="img2img_width", maximum=2048, step=8, label="Manually Crop Width", value=512) # img2img_width
-                            # height = gr.Slider(minimum=64, elem_id="img2img_height", maximum=2048, step=8, label="Manually Crop Height", value=512) # img2img_width
-                            with gr.Row():
-                                pose_style = gr.Slider(minimum=0, maximum=45, step=1, label="Pose style", value=0) #
-                                exp_weight = gr.Slider(minimum=0, maximum=3, step=0.1, label="expression scale", value=1) #
-                                blink_every = gr.Checkbox(label="use eye blink", value=True)
-                            with gr.Row():
-                                size_of_image = gr.Radio([256, 512], value=256, label='face model resolution', info="use 256/512 model?") #
-                                preprocess_type = gr.Radio(['crop', 'resize','full', 'extcrop', 'extfull'], value='crop', label='preprocess', info="How to handle input image?")
-                            with gr.Row():
-                                is_still_mode = gr.Checkbox(label="Still Mode (fewer head motion, works with preprocess `full`)")
-                                facerender = gr.Radio(['facevid2vid','pirender'], value='facevid2vid', label='facerender', info="which face render?")
-                            with gr.Row():
-                                batch_size = gr.Slider(label="batch size in generation", step=1, maximum=10, value=1)
-                                enhancer = gr.Checkbox(label="GFPGAN as Face enhancer")
-                            submit = gr.Button('Generate', elem_id="sadtalker_generate", variant='primary')
-                with gr.Tabs(elem_id="sadtalker_genearted"):
-                        gen_video = gr.Video(label="Generated video", format="mp4", scale=1) # .style(width=256)
-        submit.click(
-                fn=sad_talker.test,
-                inputs=[source_image,
-                        driven_audio,
-                        preprocess_type,
-                        is_still_mode,
-                        enhancer,
-                        batch_size,
-                        size_of_image,
-                        pose_style,
-                        facerender,
-                        exp_weight,
-                        use_ref_video,
-                        ref_video,
-                        ref_info,
-                        use_idle_mode,
-                        length_of_audio,
-                        blink_every
-                        ],
-                outputs=[gen_video],
-                )
         with gr.Row():
-            examples = [
-                [
-                    'examples/source_image/full_body_1.png',
-                    'examples/driven_audio/bus_chinese.wav',
-                    'crop',
-                    True,
-                    False
-                ],
-                [
-                    'examples/source_image/full_body_2.png',
-                    'examples/driven_audio/japanese.wav',
-                    'crop',
-                    False,
-                    False
-                ],
-                [
-                    'examples/source_image/full3.png',
-                    'examples/driven_audio/deyu.wav',
-                    'crop',
-                    False,
-                    True
-                ],
-                [
-                    'examples/source_image/full4.jpeg',
-                    'examples/driven_audio/eluosi.wav',
-                    'full',
-                    False,
-                    True
-                ],
-                [
-                    'examples/source_image/full4.jpeg',
-                    'examples/driven_audio/imagine.wav',
-                    'full',
-                    True,
-                    True
-                ],
-                [
-                    'examples/source_image/full_body_1.png',
-                    'examples/driven_audio/bus_chinese.wav',
-                    'full',
-                    True,
-                    False
-                ],
-                [
-                    'examples/source_image/art_13.png',
-                    'examples/driven_audio/fayu.wav',
-                    'resize',
-                    True,
-                    False
-                ],
-                [
-                    'examples/source_image/art_5.png',
-                    'examples/driven_audio/chinese_news.wav',
-                    'resize',
-                    False,
-                    False
-                ],
-                [
-                    'examples/source_image/art_5.png',
-                    'examples/driven_audio/RD_Radio31_000.wav',
-                    'resize',
-                    True,
-                    True
-                ],
-            ]
-            gr.Examples(examples=examples,
-                        inputs=[
-                            source_image,
-                            driven_audio,
-                            preprocess_type,
-                            is_still_mode,
-                            enhancer],
-                        outputs=[gen_video],
-                        fn=sad_talker.test,
-                        cache_examples=os.getenv('SYSTEM') == 'spaces') #
     return sadtalker_interface
 if __name__ == "__main__":
     demo = sadtalker_demo()
     demo.queue(max_size=10, api_open=True)
-    demo.launch(debug=True)

     REPO_ID = 'vinthony/SadTalker-V002rc'
     snapshot_download(repo_id=REPO_ID, local_dir='./checkpoints', local_dir_use_symlinks=True)
+# New: merge the two buttons into one; the generated audio is used as the input for the video
+import soundfile as sf
+def generate_voice_and_video(ref_audio, ref_text, gen_text, speed, source_image, preprocess_type, is_still_mode, enhancer, batch_size, size_of_image, pose_style, facerender, exp_weight, use_ref_video, ref_video, ref_info, use_idle_mode, length_of_audio, blink_every):
+    # 1. Sinh audio từ TTS
+    (final_sample_rate, final_wave), _ = infer_tts(ref_audio, ref_text, gen_text, speed)
+    # Lưu ra file tạm
+    tmp_audio = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
+    sf.write(tmp_audio.name, final_wave, final_sample_rate)
+    # 2. Gọi SadTalker với audio vừa sinh ra
     sad_talker = SadTalker(lazy_load=True)
+    video_path = sad_talker.test(
+        source_image,
+        tmp_audio.name,
+        preprocess_type,
+        is_still_mode,
+        enhancer,
+        batch_size,
+        size_of_image,
+        pose_style,
+        facerender,
+        exp_weight,
+        use_ref_video,
+        ref_video,
+        ref_info,
+        use_idle_mode,
+        length_of_audio,
+        blink_every
+    )
+    return tmp_audio.name, video_path
+def sadtalker_demo():
+    download_model()
     with gr.Blocks(analytics_enabled=False) as sadtalker_interface:
         gr.Markdown("""
+# 🎤 F5-TTS: Vietnamese Text-to-Speech Synthesis & SadTalker Video
+# Nhập text, upload sample voice và ảnh để tạo video nói chuyện.
 """)
         with gr.Row():
             ref_audio = gr.Audio(label="🔊 Sample Voice", type="filepath")
+            ref_text = gr.Textbox(label="📝 Reference Transcript (optional)", placeholder="Nhập transcript tiếng Việt cho sample voice nếu có...", lines=2)
             gen_text = gr.Textbox(label="📝 Text", placeholder="Enter the text to generate voice...", lines=3)
         speed = gr.Slider(0.3, 2.0, value=1.0, step=0.1, label="⚡ Speed")
         with gr.Row():
+            source_image = gr.Image(label="Source image", type="filepath", elem_id="img2img_image")
         with gr.Row():
+            # Các setting cho SadTalker
+            with gr.Column():
+                preprocess_type = gr.Radio(['crop', 'resize','full', 'extcrop', 'extfull'], value='crop', label='preprocess', info="How to handle input image?")
+                is_still_mode = gr.Checkbox(label="Still Mode (fewer head motion, works with preprocess `full`)")
+                enhancer = gr.Checkbox(label="GFPGAN as Face enhancer")
+                batch_size = gr.Slider(label="batch size in generation", step=1, maximum=10, value=1)
+                size_of_image = gr.Radio([256, 512], value=256, label='face model resolution', info="use 256/512 model?")
+                pose_style = gr.Slider(minimum=0, maximum=45, step=1, label="Pose style", value=0)
+                facerender = gr.Radio(['facevid2vid','pirender'], value='facevid2vid', label='facerender', info="which face render?")
+                exp_weight = gr.Slider(minimum=0, maximum=3, step=0.1, label="expression scale", value=1)
+                use_ref_video = gr.Checkbox(label="Use Reference Video")
+                ref_video = gr.Video(label="Reference Video", elem_id="vidref")
+                ref_info = gr.Radio(['pose', 'blink','pose+blink', 'all'], value='pose', label='Reference Video',info="How to borrow from reference Video?((fully transfer, aka, video driving mode))")
+                use_idle_mode = gr.Checkbox(label="Use Idle Animation")
+                length_of_audio = gr.Number(value=5, label="The length(seconds) of the generated video.")
+                blink_every = gr.Checkbox(label="use eye blink", value=True)
+        btn_generate = gr.Button("🔥 Generate Voice & Video")
+        with gr.Row():
+            output_audio = gr.Audio(label="🎧 Generated Audio", type="filepath")
+            gen_video = gr.Video(label="Generated video", format="mp4", scale=1)
+        btn_generate.click(
+            generate_voice_and_video,
+            inputs=[ref_audio, ref_text, gen_text, speed, source_image, preprocess_type, is_still_mode, enhancer, batch_size, size_of_image, pose_style, facerender, exp_weight, use_ref_video, ref_video, ref_info, use_idle_mode, length_of_audio, blink_every],
+            outputs=[output_audio, gen_video]
+        )
     return sadtalker_interface
 if __name__ == "__main__":
     demo = sadtalker_demo()
     demo.queue(max_size=10, api_open=True)
+    demo.launch(debug=True, server_name="0.0.0.0")

app_tts.py CHANGED Viewed

@@ -1,11 +1,12 @@
 import spaces
 import os
 from huggingface_hub import login
 import gradio as gr
 from cached_path import cached_path
 import tempfile
 from vinorm import TTSnorm
 from f5_tts.model import DiT
 from f5_tts.infer.utils_infer import (
     preprocess_ref_audio_text,
@@ -13,8 +14,24 @@ from f5_tts.infer.utils_infer import (
     load_model,
     infer_process,
     save_spectrogram,
 )
 # Retrieve token from secrets
 hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
@@ -35,18 +52,108 @@ def post_process(text):
     text = " " + text + " "
     text = text.replace('"', "")
     return " ".join(text.split())
 # Load models
-vocoder = load_vocoder()
-model = load_model(
-    DiT,
-    dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4),
-    ckpt_path=str(cached_path("hf://hynt/F5-TTS-Vietnamese-ViVoice/model_last.pt")),
-    vocab_file=str(cached_path("hf://hynt/F5-TTS-Vietnamese-ViVoice/config.json")),
-)
 @spaces.GPU
-def infer_tts(ref_audio_orig: str, gen_text: str, speed: float = 1.0, request: gr.Request = None):
     if not ref_audio_orig:
         raise gr.Error("Please upload a sample audio file.")
@@ -54,39 +161,37 @@ def infer_tts(ref_audio_orig: str, gen_text: str, speed: float = 1.0, request: g
         raise gr.Error("Please enter the text content to generate voice.")
     if len(gen_text.split()) > 1000:
         raise gr.Error("Please enter text content with less than 1000 words.")
     try:
-        ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, "")
         final_wave, final_sample_rate, spectrogram = infer_process(
-            ref_audio, ref_text.lower(), post_process(TTSnorm(gen_text)).lower(), model, vocoder, speed=speed
         )
         with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_spectrogram:
             spectrogram_path = tmp_spectrogram.name
             save_spectrogram(spectrogram, spectrogram_path)
         return (final_sample_rate, final_wave), spectrogram_path
     except Exception as e:
         raise gr.Error(f"Error generating voice: {e}")
 # Gradio UI
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown("""
     # 🎤 F5-TTS: Vietnamese Text-to-Speech Synthesis.
     # The model was trained with approximately 1000 hours of data on a RTX 3090 GPU.
     Enter text and upload a sample voice to generate natural speech.
     """)
     with gr.Row():
         ref_audio = gr.Audio(label="🔊 Sample Voice", type="filepath")
         gen_text = gr.Textbox(label="📝 Text", placeholder="Enter the text to generate voice...", lines=3)
     speed = gr.Slider(0.3, 2.0, value=1.0, step=0.1, label="⚡ Speed")
     btn_synthesize = gr.Button("🔥 Generate Voice")
     with gr.Row():
         output_audio = gr.Audio(label="🎧 Generated Audio", type="numpy")
         output_spectrogram = gr.Image(label="📊 Spectrogram")
     model_limitations = gr.Textbox(
         value="""1. This model may not perform well with numerical characters, dates, special characters, etc. => A text normalization module is needed.
 2. The rhythm of some generated audios may be inconsistent or choppy => It is recommended to select clearly pronounced sample audios with minimal pauses for better synthesis quality.
@@ -96,8 +201,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
         lines=4,
         interactive=False
     )
-    btn_synthesize.click(infer_tts, inputs=[ref_audio, gen_text, speed], outputs=[output_audio, output_spectrogram])
 # Run Gradio with share=True to get a gradio.live link
 # demo.queue().launch()

 import spaces
 import os
+import codecs
 from huggingface_hub import login
 import gradio as gr
 from cached_path import cached_path
 import tempfile
 from vinorm import TTSnorm
+from importlib.resources import files
 from f5_tts.model import DiT
 from f5_tts.infer.utils_infer import (
     preprocess_ref_audio_text,
     load_model,
     infer_process,
     save_spectrogram,
+    target_sample_rate as default_target_sample_rate,
+    n_mel_channels as default_n_mel_channels,
+    hop_length as default_hop_length,
+    win_length as default_win_length,
+    n_fft as default_n_fft,
+    mel_spec_type as default_mel_spec_type,
+    target_rms as default_target_rms,
+    cross_fade_duration as default_cross_fade_duration,
+    ode_method as default_ode_method,
+    nfe_step as default_nfe_step,  # 16, 32
+    cfg_strength as default_cfg_strength,
+    sway_sampling_coef as default_sway_sampling_coef,
+    speed as default_speed,
+    fix_duration as default_fix_duration
 )
+from pathlib import Path
+from omegaconf import OmegaConf
+from datetime import datetime
 # Retrieve token from secrets
 hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
     text = " " + text + " "
     text = text.replace('"', "")
     return " ".join(text.split())
 # Load models
 @spaces.GPU
+def infer_tts(ref_audio_orig: str, ref_text_input: str, gen_text: str, speed: float = 1.0, request: gr.Request = None):
+    args = {
+        "model": "F5TTS_Base",
+        "ckpt_file": str(cached_path("hf://hynt/F5-TTS-Vietnamese-ViVoice/model_last.pt")),
+        "vocab_file": str(cached_path("hf://hynt/F5-TTS-Vietnamese-ViVoice/config.json")),
+        "ref_audio": ref_audio_orig,
+        "ref_text": ref_text_input,
+        "gen_text": gen_text,
+        "speed": speed
+    }
+    config = {} # tomli.load(open(args.config, "rb"))
+    # command-line interface parameters
+    model = args["model"] or config.get("model", "F5TTS_Base")
+    ckpt_file = args["ckpt_file"] or config.get("ckpt_file", "")
+    vocab_file = args["vocab_file"] or config.get("vocab_file", "")
+    ref_audio = args["ref_audio"] or config.get("ref_audio", "infer/examples/basic/basic_ref_en.wav")
+    ref_text = args["ref_text"] if args["ref_text"] is not None else config.get("ref_text", "Some call me nature, others call me mother nature.")
+    gen_text = args["gen_text"] or config.get("gen_text", "Here we generate something just for test.")
+    gen_file = args.get("gen_file", "") or config.get("gen_file", "")
+    output_dir = args.get("output_dir", "") or config.get("output_dir", "tests")
+    output_file = args.get("output_file", "") or config.get("output_file", f"infer_cli_{datetime.now().strftime(r'%Y%m%d_%H%M%S')}.wav")
+    save_chunk = args.get("save_chunk", False) or config.get("save_chunk", False)
+    remove_silence = args.get("remove_silence", False) or config.get("remove_silence", False)
+    load_vocoder_from_local = args.get("load_vocoder_from_local", False) or config.get("load_vocoder_from_local", False)
+    vocoder_name = args.get("vocoder_name", "") or config.get("vocoder_name", default_mel_spec_type)
+    target_rms = args.get("target_rms", None) or config.get("target_rms", default_target_rms)
+    cross_fade_duration = args.get("cross_fade_duration", None) or config.get("cross_fade_duration", default_cross_fade_duration)
+    nfe_step = args.get("nfe_step", None) or config.get("nfe_step", default_nfe_step)
+    cfg_strength = args.get("cfg_strength", None) or config.get("cfg_strength", default_cfg_strength)
+    sway_sampling_coef = args.get("sway_sampling_coef", None) or config.get("sway_sampling_coef", default_sway_sampling_coef)
+    speed = args.get("speed", None) or config.get("speed", default_speed)
+    fix_duration = args.get("fix_duration", None) or config.get("fix_duration", default_fix_duration)
+    if "infer/examples/" in ref_audio:
+        ref_audio = str(files("f5_tts").joinpath(f"{ref_audio}"))
+    if "infer/examples/" in gen_file:
+        gen_file = str(files("f5_tts").joinpath(f"{gen_file}"))
+    if "voices" in config:
+        for voice in config["voices"]:
+            voice_ref_audio = config["voices"][voice]["ref_audio"]
+            if "infer/examples/" in voice_ref_audio:
+                config["voices"][voice]["ref_audio"] = str(files("f5_tts").joinpath(f"{voice_ref_audio}"))
+    # ignore gen_text if gen_file provided
+    if gen_file:
+        gen_text = codecs.open(gen_file, "r", "utf-8").read()
+    # output path
+    wave_path = Path(output_dir) / output_file
+    # spectrogram_path = Path(output_dir) / "infer_cli_out.png"
+    if save_chunk:
+        output_chunk_dir = os.path.join(output_dir, f"{Path(output_file).stem}_chunks")
+        if not os.path.exists(output_chunk_dir):
+            os.makedirs(output_chunk_dir)
+    # load vocoder
+    if vocoder_name == "vocos":
+        vocoder_local_path = "../checkpoints/vocos-mel-24khz"
+    elif vocoder_name == "bigvgan":
+        vocoder_local_path = "../checkpoints/bigvgan_v2_24khz_100band_256x"
+    vocoder = load_vocoder(vocoder_name=vocoder_name, is_local=load_vocoder_from_local, local_path=vocoder_local_path)
+    # load TTS model
+    model_cfg = OmegaConf.load(
+        config.get("model_cfg", str(files("f5_tts").joinpath(f"configs/{model}.yaml")))
+    ).model
+    model_cls = globals()[model_cfg.backbone]
+    repo_name, ckpt_step, ckpt_type = "F5-TTS", 1250000, "safetensors"
+    if model != "F5TTS_Base":
+        assert vocoder_name == model_cfg.mel_spec.mel_spec_type
+    # override for previous models
+    if model == "F5TTS_Base":
+        if vocoder_name == "vocos":
+            ckpt_step = 1200000
+        elif vocoder_name == "bigvgan":
+            model = "F5TTS_Base_bigvgan"
+            ckpt_type = "pt"
+    elif model == "E2TTS_Base":
+        repo_name = "E2-TTS"
+        ckpt_step = 1200000
+    if not ckpt_file:
+        ckpt_file = str(cached_path(f"hf://SWivid/{repo_name}/{model}/model_{ckpt_step}.{ckpt_type}"))
+    print(f"Using {model}...")
+    ema_model = load_model(model_cls, model_cfg.arch, ckpt_file, mel_spec_type=vocoder_name, vocab_file=vocab_file)
     if not ref_audio_orig:
         raise gr.Error("Please upload a sample audio file.")
         raise gr.Error("Please enter the text content to generate voice.")
     if len(gen_text.split()) > 1000:
         raise gr.Error("Please enter text content with less than 1000 words.")
     try:
+        # Nếu người dùng nhập ref_text thì dùng, không thì để rỗng để tự động nhận diện
+        ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, ref_text_input or "")
+        gen_text_ = gen_text.strip()
         final_wave, final_sample_rate, spectrogram = infer_process(
+            ref_audio, ref_text.lower(), gen_text_, ema_model, vocoder, speed=speed
         )
         with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_spectrogram:
             spectrogram_path = tmp_spectrogram.name
             save_spectrogram(spectrogram, spectrogram_path)
         return (final_sample_rate, final_wave), spectrogram_path
     except Exception as e:
         raise gr.Error(f"Error generating voice: {e}")
 # Gradio UI
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown("""
     # 🎤 F5-TTS: Vietnamese Text-to-Speech Synthesis.
     # The model was trained with approximately 1000 hours of data on a RTX 3090 GPU.
     Enter text and upload a sample voice to generate natural speech.
     """)
     with gr.Row():
         ref_audio = gr.Audio(label="🔊 Sample Voice", type="filepath")
+        ref_text = gr.Textbox(label="📝 Reference Transcript (optional)", placeholder="Nhập transcript tiếng Việt cho sample voice nếu có...", lines=2)
         gen_text = gr.Textbox(label="📝 Text", placeholder="Enter the text to generate voice...", lines=3)
     speed = gr.Slider(0.3, 2.0, value=1.0, step=0.1, label="⚡ Speed")
     btn_synthesize = gr.Button("🔥 Generate Voice")
     with gr.Row():
         output_audio = gr.Audio(label="🎧 Generated Audio", type="numpy")
         output_spectrogram = gr.Image(label="📊 Spectrogram")
     model_limitations = gr.Textbox(
         value="""1. This model may not perform well with numerical characters, dates, special characters, etc. => A text normalization module is needed.
 2. The rhythm of some generated audios may be inconsistent or choppy => It is recommended to select clearly pronounced sample audios with minimal pauses for better synthesis quality.
         lines=4,
         interactive=False
     )
+    btn_synthesize.click(infer_tts, inputs=[ref_audio, ref_text, gen_text, speed], outputs=[output_audio, output_spectrogram])
 # Run Gradio with share=True to get a gradio.live link
 # demo.queue().launch()

examples/vocab.txt ADDED Viewed

	@@ -0,0 +1,2566 @@

+!
+"
+#
+$
+%
+&
+'
+(
+)
+*
++
+,
+-
+.
+/
+0
+1
+2
+3
+4
+5
+6
+7
+8
+9
+:
+;
+=
+>
+?
+@
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+[
+\
+]
+_
+a
+a1
+ai1
+ai2
+ai3
+ai4
+an1
+an3
+an4
+ang1
+ang2
+ang4
+ao1
+ao2
+ao3
+ao4
+b
+ba
+ba1
+ba2
+ba3
+ba4
+bai1
+bai2
+bai3
+bai4
+ban1
+ban2
+ban3
+ban4
+bang1
+bang2
+bang3
+bang4
+bao1
+bao2
+bao3
+bao4
+bei
+bei1
+bei2
+bei3
+bei4
+ben1
+ben2
+ben3
+ben4
+beng
+beng1
+beng2
+beng3
+beng4
+bi1
+bi2
+bi3
+bi4
+bian1
+bian2
+bian3
+bian4
+biao1
+biao2
+biao3
+bie1
+bie2
+bie3
+bie4
+bin1
+bin4
+bing1
+bing2
+bing3
+bing4
+bo
+bo1
+bo2
+bo3
+bo4
+bu2
+bu3
+bu4
+c
+ca1
+cai1
+cai2
+cai3
+cai4
+can1
+can2
+can3
+can4
+cang1
+cang2
+cao1
+cao2
+cao3
+ce4
+cen1
+cen2
+ceng1
+ceng2
+ceng4
+cha1
+cha2
+cha3
+cha4
+chai1
+chai2
+chan1
+chan2
+chan3
+chan4
+chang1
+chang2
+chang3
+chang4
+chao1
+chao2
+chao3
+che1
+che2
+che3
+che4
+chen1
+chen2
+chen3
+chen4
+cheng1
+cheng2
+cheng3
+cheng4
+chi1
+chi2
+chi3
+chi4
+chong1
+chong2
+chong3
+chong4
+chou1
+chou2
+chou3
+chou4
+chu1
+chu2
+chu3
+chu4
+chua1
+chuai1
+chuai2
+chuai3
+chuai4
+chuan1
+chuan2
+chuan3
+chuan4
+chuang1
+chuang2
+chuang3
+chuang4
+chui1
+chui2
+chun1
+chun2
+chun3
+chuo1
+chuo4
+ci1
+ci2
+ci3
+ci4
+cong1
+cong2
+cou4
+cu1
+cu4
+cuan1
+cuan2
+cuan4
+cui1
+cui3
+cui4
+cun1
+cun2
+cun4
+cuo1
+cuo2
+cuo4
+d
+da
+da1
+da2
+da3
+da4
+dai1
+dai2
+dai3
+dai4
+dan1
+dan2
+dan3
+dan4
+dang1
+dang2
+dang3
+dang4
+dao1
+dao2
+dao3
+dao4
+de
+de1
+de2
+dei3
+den4
+deng1
+deng2
+deng3
+deng4
+di1
+di2
+di3
+di4
+dia3
+dian1
+dian2
+dian3
+dian4
+diao1
+diao3
+diao4
+die1
+die2
+die4
+ding1
+ding2
+ding3
+ding4
+diu1
+dong1
+dong3
+dong4
+dou1
+dou2
+dou3
+dou4
+du1
+du2
+du3
+du4
+duan1
+duan2
+duan3
+duan4
+dui1
+dui4
+dun1
+dun3
+dun4
+duo1
+duo2
+duo3
+duo4
+e
+e1
+e2
+e3
+e4
+ei2
+en1
+en4
+er
+er2
+er3
+er4
+f
+fa1
+fa2
+fa3
+fa4
+fan1
+fan2
+fan3
+fan4
+fang1
+fang2
+fang3
+fang4
+fei1
+fei2
+fei3
+fei4
+fen1
+fen2
+fen3
+fen4
+feng1
+feng2
+feng3
+feng4
+fo2
+fou2
+fou3
+fu1
+fu2
+fu3
+fu4
+g
+ga1
+ga2
+ga3
+ga4
+gai1
+gai2
+gai3
+gai4
+gan1
+gan2
+gan3
+gan4
+gang1
+gang2
+gang3
+gang4
+gao1
+gao2
+gao3
+gao4
+ge1
+ge2
+ge3
+ge4
+gei2
+gei3
+gen1
+gen2
+gen3
+gen4
+geng1
+geng3
+geng4
+gong1
+gong3
+gong4
+gou1
+gou2
+gou3
+gou4
+gu
+gu1
+gu2
+gu3
+gu4
+gua1
+gua2
+gua3
+gua4
+guai1
+guai2
+guai3
+guai4
+guan1
+guan2
+guan3
+guan4
+guang1
+guang2
+guang3
+guang4
+gui1
+gui2
+gui3
+gui4
+gun3
+gun4
+guo1
+guo2
+guo3
+guo4
+h
+ha1
+ha2
+ha3
+hai1
+hai2
+hai3
+hai4
+han1
+han2
+han3
+han4
+hang1
+hang2
+hang4
+hao1
+hao2
+hao3
+hao4
+he1
+he2
+he4
+hei1
+hen2
+hen3
+hen4
+heng1
+heng2
+heng4
+hong1
+hong2
+hong3
+hong4
+hou1
+hou2
+hou3
+hou4
+hu1
+hu2
+hu3
+hu4
+hua1
+hua2
+hua4
+huai2
+huai4
+huan1
+huan2
+huan3
+huan4
+huang1
+huang2
+huang3
+huang4
+hui1
+hui2
+hui3
+hui4
+hun1
+hun2
+hun4
+huo
+huo1
+huo2
+huo3
+huo4
+i
+j
+ji1
+ji2
+ji3
+ji4
+jia
+jia1
+jia2
+jia3
+jia4
+jian1
+jian2
+jian3
+jian4
+jiang1
+jiang2
+jiang3
+jiang4
+jiao1
+jiao2
+jiao3
+jiao4
+jie1
+jie2
+jie3
+jie4
+jin1
+jin2
+jin3
+jin4
+jing1
+jing2
+jing3
+jing4
+jiong3
+jiu1
+jiu2
+jiu3
+jiu4
+ju1
+ju2
+ju3
+ju4
+juan1
+juan2
+juan3
+juan4
+jue1
+jue2
+jue4
+jun1
+jun4
+k
+ka1
+ka2
+ka3
+kai1
+kai2
+kai3
+kai4
+kan1
+kan2
+kan3
+kan4
+kang1
+kang2
+kang4
+kao1
+kao2
+kao3
+kao4
+ke1
+ke2
+ke3
+ke4
+ken3
+keng1
+kong1
+kong3
+kong4
+kou1
+kou2
+kou3
+kou4
+ku1
+ku2
+ku3
+ku4
+kua1
+kua3
+kua4
+kuai3
+kuai4
+kuan1
+kuan2
+kuan3
+kuang1
+kuang2
+kuang4
+kui1
+kui2
+kui3
+kui4
+kun1
+kun3
+kun4
+kuo4
+l
+la
+la1
+la2
+la3
+la4
+lai2
+lai4
+lan2
+lan3
+lan4
+lang1
+lang2
+lang3
+lang4
+lao1
+lao2
+lao3
+lao4
+le
+le1
+le4
+lei
+lei1
+lei2
+lei3
+lei4
+leng1
+leng2
+leng3
+leng4
+li
+li1
+li2
+li3
+li4
+lia3
+lian2
+lian3
+lian4
+liang2
+liang3
+liang4
+liao1
+liao2
+liao3
+liao4
+lie1
+lie2
+lie3
+lie4
+lin1
+lin2
+lin3
+lin4
+ling2
+ling3
+ling4
+liu1
+liu2
+liu3
+liu4
+long1
+long2
+long3
+long4
+lou1
+lou2
+lou3
+lou4
+lu1
+lu2
+lu3
+lu4
+luan2
+luan3
+luan4
+lun1
+lun2
+lun4
+luo1
+luo2
+luo3
+luo4
+lv2
+lv3
+lv4
+lve3
+lve4
+m
+ma
+ma1
+ma2
+ma3
+ma4
+mai2
+mai3
+mai4
+man1
+man2
+man3
+man4
+mang2
+mang3
+mao1
+mao2
+mao3
+mao4
+me
+mei2
+mei3
+mei4
+men
+men1
+men2
+men4
+meng
+meng1
+meng2
+meng3
+meng4
+mi1
+mi2
+mi3
+mi4
+mian2
+mian3
+mian4
+miao1
+miao2
+miao3
+miao4
+mie1
+mie4
+min2
+min3
+ming2
+ming3
+ming4
+miu4
+mo1
+mo2
+mo3
+mo4
+mou1
+mou2
+mou3
+mu2
+mu3
+mu4
+n
+n2
+na1
+na2
+na3
+na4
+nai2
+nai3
+nai4
+nan1
+nan2
+nan3
+nan4
+nang1
+nang2
+nang3
+nao1
+nao2
+nao3
+nao4
+ne
+ne2
+ne4
+nei3
+nei4
+nen4
+neng2
+ni1
+ni2
+ni3
+ni4
+nian1
+nian2
+nian3
+nian4
+niang2
+niang4
+niao2
+niao3
+niao4
+nie1
+nie4
+nin2
+ning2
+ning3
+ning4
+niu1
+niu2
+niu3
+niu4
+nong2
+nong4
+nou4
+nu2
+nu3
+nu4
+nuan3
+nuo2
+nuo4
+nv2
+nv3
+nve4
+o
+o1
+o2
+ou1
+ou2
+ou3
+ou4
+p
+pa1
+pa2
+pa4
+pai1
+pai2
+pai3
+pai4
+pan1
+pan2
+pan4
+pang1
+pang2
+pang4
+pao1
+pao2
+pao3
+pao4
+pei1
+pei2
+pei4
+pen1
+pen2
+pen4
+peng1
+peng2
+peng3
+peng4
+pi1
+pi2
+pi3
+pi4
+pian1
+pian2
+pian4
+piao1
+piao2
+piao3
+piao4
+pie1
+pie2
+pie3
+pin1
+pin2
+pin3
+pin4
+ping1
+ping2
+po1
+po2
+po3
+po4
+pou1
+pu1
+pu2
+pu3
+pu4
+q
+qi1
+qi2
+qi3
+qi4
+qia1
+qia3
+qia4
+qian1
+qian2
+qian3
+qian4
+qiang1
+qiang2
+qiang3
+qiang4
+qiao1
+qiao2
+qiao3
+qiao4
+qie1
+qie2
+qie3
+qie4
+qin1
+qin2
+qin3
+qin4
+qing1
+qing2
+qing3
+qing4
+qiong1
+qiong2
+qiu1
+qiu2
+qiu3
+qu1
+qu2
+qu3
+qu4
+quan1
+quan2
+quan3
+quan4
+que1
+que2
+que4
+qun2
+r
+ran2
+ran3
+rang1
+rang2
+rang3
+rang4
+rao2
+rao3
+rao4
+re2
+re3
+re4
+ren2
+ren3
+ren4
+reng1
+reng2
+ri4
+rong1
+rong2
+rong3
+rou2
+rou4
+ru2
+ru3
+ru4
+ruan2
+ruan3
+rui3
+rui4
+run4
+ruo4
+s
+sa1
+sa2
+sa3
+sa4
+sai1
+sai4
+san1
+san2
+san3
+san4
+sang1
+sang3
+sang4
+sao1
+sao2
+sao3
+sao4
+se4
+sen1
+seng1
+sha1
+sha2
+sha3
+sha4
+shai1
+shai2
+shai3
+shai4
+shan1
+shan3
+shan4
+shang
+shang1
+shang3
+shang4
+shao1
+shao2
+shao3
+shao4
+she1
+she2
+she3
+she4
+shei2
+shen1
+shen2
+shen3
+shen4
+sheng1
+sheng2
+sheng3
+sheng4
+shi
+shi1
+shi2
+shi3
+shi4
+shou1
+shou2
+shou3
+shou4
+shu1
+shu2
+shu3
+shu4
+shua1
+shua2
+shua3
+shua4
+shuai1
+shuai3
+shuai4
+shuan1
+shuan4
+shuang1
+shuang3
+shui2
+shui3
+shui4
+shun3
+shun4
+shuo1
+shuo4
+si1
+si2
+si3
+si4
+song1
+song3
+song4
+sou1
+sou3
+sou4
+su1
+su2
+su4
+suan1
+suan4
+sui1
+sui2
+sui3
+sui4
+sun1
+sun3
+suo
+suo1
+suo2
+suo3
+t
+ta1
+ta2
+ta3
+ta4
+tai1
+tai2
+tai4
+tan1
+tan2
+tan3
+tan4
+tang1
+tang2
+tang3
+tang4
+tao1
+tao2
+tao3
+tao4
+te4
+teng2
+ti1
+ti2
+ti3
+ti4
+tian1
+tian2
+tian3
+tiao1
+tiao2
+tiao3
+tiao4
+tie1
+tie2
+tie3
+tie4
+ting1
+ting2
+ting3
+tong1
+tong2
+tong3
+tong4
+tou
+tou1
+tou2
+tou4
+tu1
+tu2
+tu3
+tu4
+tuan1
+tuan2
+tui1
+tui2
+tui3
+tui4
+tun1
+tun2
+tun4
+tuo1
+tuo2
+tuo3
+tuo4
+u
+v
+w
+wa
+wa1
+wa2
+wa3
+wa4
+wai1
+wai3
+wai4
+wan1
+wan2
+wan3
+wan4
+wang1
+wang2
+wang3
+wang4
+wei1
+wei2
+wei3
+wei4
+wen1
+wen2
+wen3
+wen4
+weng1
+weng4
+wo1
+wo2
+wo3
+wo4
+wu1
+wu2
+wu3
+wu4
+x
+xi1
+xi2
+xi3
+xi4
+xia1
+xia2
+xia4
+xian1
+xian2
+xian3
+xian4
+xiang1
+xiang2
+xiang3
+xiang4
+xiao1
+xiao2
+xiao3
+xiao4
+xie1
+xie2
+xie3
+xie4
+xin1
+xin2
+xin4
+xing1
+xing2
+xing3
+xing4
+xiong1
+xiong2
+xiu1
+xiu3
+xiu4
+xu
+xu1
+xu2
+xu3
+xu4
+xuan1
+xuan2
+xuan3
+xuan4
+xue1
+xue2
+xue3
+xue4
+xun1
+xun2
+xun4
+y
+ya
+ya1
+ya2
+ya3
+ya4
+yan1
+yan2
+yan3
+yan4
+yang1
+yang2
+yang3
+yang4
+yao1
+yao2
+yao3
+yao4
+ye1
+ye2
+ye3
+ye4
+yi
+yi1
+yi2
+yi3
+yi4
+yin1
+yin2
+yin3
+yin4
+ying1
+ying2
+ying3
+ying4
+yo1
+yong1
+yong2
+yong3
+yong4
+you1
+you2
+you3
+you4
+yu1
+yu2
+yu3
+yu4
+yuan1
+yuan2
+yuan3
+yuan4
+yue1
+yue4
+yun1
+yun2
+yun3
+yun4
+z
+za1
+za2
+za3
+zai1
+zai3
+zai4
+zan1
+zan2
+zan3
+zan4
+zang1
+zang4
+zao1
+zao2
+zao3
+zao4
+ze2
+ze4
+zei2
+zen3
+zeng1
+zeng4
+zha1
+zha2
+zha3
+zha4
+zhai1
+zhai2
+zhai3
+zhai4
+zhan1
+zhan2
+zhan3
+zhan4
+zhang1
+zhang2
+zhang3
+zhang4
+zhao1
+zhao2
+zhao3
+zhao4
+zhe
+zhe1
+zhe2
+zhe3
+zhe4
+zhen1
+zhen2
+zhen3
+zhen4
+zheng1
+zheng2
+zheng3
+zheng4
+zhi1
+zhi2
+zhi3
+zhi4
+zhong1
+zhong2
+zhong3
+zhong4
+zhou1
+zhou2
+zhou3
+zhou4
+zhu1
+zhu2
+zhu3
+zhu4
+zhua1
+zhua2
+zhua3
+zhuai1
+zhuai3
+zhuai4
+zhuan1
+zhuan2
+zhuan3
+zhuan4
+zhuang1
+zhuang4
+zhui1
+zhui4
+zhun1
+zhun2
+zhun3
+zhuo1
+zhuo2
+zi
+zi1
+zi2
+zi3
+zi4
+zong1
+zong2
+zong3
+zong4
+zou1
+zou2
+zou3
+zou4
+zu1
+zu2
+zu3
+zuan1
+zuan3
+zuan4
+zui2
+zui3
+zui4
+zun1
+zuo
+zuo1
+zuo2
+zuo3
+zuo4
+{
+~
+¡
+¢
+£
+¥
+§
+¨
+©
+«
+®
+¯
+°
+±
+²
+³
+´
+µ
+·
+¹
+º
+»
+¼
+½
+¾
+¿
+À
+Á
+Â
+Ã
+Ä
+Å
+Æ
+Ç
+È
+É
+Ê
+Í
+Î
+Ñ
+Ó
+Ö
+×
+Ø
+Ú
+Ü
+Ý
+Þ
+ß
+à
+á
+â
+ã
+ä
+å
+æ
+ç
+è
+é
+ê
+ë
+ì
+í
+î
+ï
+ð
+ñ
+ò
+ó
+ô
+õ
+ö
+ø
+ù
+ú
+û
+ü
+ý
+Ā
+ā
+ă
+ą
+ć
+Č
+č
+Đ
+đ
+ē
+ė
+ę
+ě
+ĝ
+ğ
+ħ
+ī
+į
+İ
+ı
+Ł
+ł
+ń
+ņ
+ň
+ŋ
+Ō
+ō
+ő
+œ
+ř
+Ś
+ś
+Ş
+ş
+Š
+š
+Ť
+ť
+ũ
+ū
+ź
+Ż
+ż
+Ž
+ž
+ơ
+ư
+ǎ
+ǐ
+ǒ
+ǔ
+ǚ
+ș
+ț
+ɑ
+ɔ
+ɕ
+ə
+ɛ
+ɜ
+ɡ
+ɣ
+ɪ
+ɫ
+ɴ
+ɹ
+ɾ
+ʃ
+ʊ
+ʌ
+ʒ
+ʔ
+ʰ
+ʷ
+ʻ
+ʾ
+ʿ
+ˈ
+ː
+˙
+˜
+ˢ
+́
+̅
+Α
+Β
+Δ
+Ε
+Θ
+Κ
+Λ
+Μ
+Ξ
+Π
+Σ
+Τ
+Φ
+Χ
+Ψ
+Ω
+ά
+έ
+ή
+ί
+α
+β
+γ
+δ
+ε
+ζ
+η
+θ
+ι
+κ
+λ
+μ
+ν
+ξ
+ο
+π
+ρ
+ς
+σ
+τ
+υ
+φ
+χ
+ψ
+ω
+ϊ
+ό
+ύ
+ώ
+ϕ
+ϵ
+Ё
+А
+Б
+В
+Г
+Д
+Е
+Ж
+З
+И
+Й
+К
+Л
+М
+Н
+О
+П
+Р
+С
+Т
+У
+Ф
+Х
+Ц
+Ч
+Ш
+Щ
+Ы
+Ь
+Э
+Ю
+Я
+а
+б
+в
+г
+д
+е
+ж
+з
+и
+й
+к
+л
+м
+н
+о
+п
+р
+с
+т
+у
+ф
+х
+ц
+ч
+ш
+щ
+ъ
+ы
+ь
+э
+ю
+я
+ё
+і
+ְ
+ִ
+ֵ
+ֶ
+ַ
+ָ
+ֹ
+ּ
+־
+ׁ
+א
+ב
+ג
+ד
+ה
+ו
+ז
+ח
+ט
+י
+כ
+ל
+ם
+מ
+ן
+נ
+ס
+ע
+פ
+ק
+ר
+ש
+ת
+أ
+ب
+ة
+ت
+ج
+ح
+د
+ر
+ز
+س
+ص
+ط
+ع
+ق
+ك
+ل
+م
+ن
+ه
+و
+ي
+َ
+ُ
+ِ
+ْ
+ก
+ข
+ง
+จ
+ต
+ท
+น
+ป
+ย
+ร
+ว
+ส
+ห
+อ
+ฮ
+ั
+า
+ี
+ึ
+โ
+ใ
+ไ
+่
+้
+์
+ḍ
+Ḥ
+ḥ
+ṁ
+ṃ
+ṅ
+ṇ
+Ṛ
+ṛ
+Ṣ
+ṣ
+Ṭ
+ṭ
+ạ
+ả
+Ấ
+ấ
+ầ
+ậ
+ắ
+ằ
+ẻ
+ẽ
+ế
+ề
+ể
+ễ
+ệ
+ị
+ọ
+ỏ
+ố
+ồ
+ộ
+ớ
+ờ
+ở
+ụ
+ủ
+ứ
+ữ
+ἀ
+ἁ
+Ἀ
+ἐ
+ἔ
+ἰ
+ἱ
+ὀ
+ὁ
+ὐ
+ὲ
+ὸ
+���
+᾽
+ῆ
+ῇ
+ῶ
+‎
+‑
+‒
+–
+—
+―
+‖
+†
+‡
+•
+…
+‧
+‬
+′
+″
+⁄
+⁡
+⁰
+⁴
+⁵
+⁶
+⁷
+⁸
+⁹
+₁
+₂
+₃
+€
+₱
+₹
+₽
+℃
+ℏ
+ℓ
+№
+ℝ
+™
+⅓
+⅔
+⅛
+→
+∂
+∈
+∑
+−
+∗
+√
+∞
+∫
+≈
+≠
+≡
+≤
+≥
+⋅
+⋯
+█
+♪
+⟨
+⟩
+、
+。
+《
+》
+「
+」
+【
+】
+あ
+う
+え
+お
+か
+が
+き
+ぎ
+く
+ぐ
+け
+げ
+こ
+ご
+さ
+し
+じ
+す
+ず
+せ
+ぜ
+そ
+ぞ
+た
+だ
+ち
+っ
+つ
+で
+と
+ど
+な
+に
+ね
+の
+は
+ば
+ひ
+ぶ
+へ
+べ
+ま
+み
+む
+め
+も
+ゃ
+や
+ゆ
+ょ
+よ
+ら
+り
+る
+れ
+ろ
+わ
+を
+ん
+ァ
+ア
+ィ
+イ
+ウ
+ェ
+エ
+オ
+カ
+ガ
+キ
+ク
+ケ
+ゲ
+コ
+ゴ
+サ
+ザ
+シ
+ジ
+ス
+ズ
+セ
+ゾ
+タ
+ダ
+チ
+ッ
+ツ
+テ
+デ
+ト
+ド
+ナ
+ニ
+ネ
+ノ
+バ
+パ
+ビ
+ピ
+フ
+プ
+ヘ
+ベ
+ペ
+ホ
+ボ
+ポ
+マ
+ミ
+ム
+メ
+モ
+ャ
+ヤ
+ュ
+ユ
+ョ
+ヨ
+ラ
+リ
+ル
+レ
+ロ
+ワ
+ン
+・
+ー
+ㄋ
+ㄍ
+ㄎ
+ㄏ
+ㄓ
+ㄕ
+ㄚ
+ㄜ
+ㄟ
+ㄤ
+ㄥ
+ㄧ
+ㄱ
+ㄴ
+ㄷ
+ㄹ
+ㅁ
+ㅂ
+ㅅ
+ㅈ
+ㅍ
+ㅎ
+ㅏ
+ㅓ
+ㅗ
+ㅜ
+ㅡ
+ㅣ
+㗎
+가
+각
+간
+갈
+감
+갑
+갓
+갔
+강
+같
+개
+거
+건
+걸
+겁
+것
+겉
+게
+겠
+겨
+결
+겼
+경
+계
+고
+곤
+골
+곱
+공
+과
+관
+광
+교
+구
+국
+굴
+귀
+귄
+그
+근
+글
+금
+기
+긴
+길
+까
+깍
+깔
+깜
+깨
+께
+꼬
+꼭
+꽃
+꾸
+꿔
+끔
+끗
+끝
+끼
+나
+난
+날
+남
+납
+내
+냐
+냥
+너
+넘
+넣
+네
+녁
+년
+녕
+노
+녹
+놀
+누
+눈
+느
+는
+늘
+니
+님
+닙
+다
+닥
+단
+달
+닭
+당
+대
+더
+덕
+던
+덥
+데
+도
+독
+동
+돼
+됐
+되
+된
+될
+두
+둑
+둥
+드
+들
+등
+디
+따
+딱
+딸
+땅
+때
+떤
+떨
+떻
+또
+똑
+뚱
+뛰
+뜻
+띠
+라
+락
+란
+람
+랍
+랑
+래
+랜
+러
+런
+럼
+렇
+레
+려
+력
+렵
+렸
+로
+록
+롬
+루
+르
+른
+를
+름
+릉
+리
+릴
+림
+마
+막
+만
+많
+말
+맑
+맙
+맛
+매
+머
+먹
+멍
+메
+면
+명
+몇
+모
+목
+몸
+못
+무
+문
+물
+뭐
+뭘
+미
+민
+밌
+밑
+바
+박
+밖
+반
+받
+발
+밤
+밥
+방
+배
+백
+밸
+뱀
+버
+번
+벌
+벚
+베
+벼
+벽
+별
+병
+보
+복
+본
+볼
+봐
+봤
+부
+분
+불
+비
+빔
+빛
+빠
+빨
+뼈
+뽀
+뿅
+쁘
+사
+산
+살
+삼
+샀
+상
+새
+색
+생
+서
+선
+설
+섭
+섰
+성
+세
+셔
+션
+셨
+소
+속
+손
+송
+수
+숙
+순
+술
+숫
+숭
+숲
+쉬
+쉽
+스
+슨
+습
+슷
+시
+식
+신
+실
+싫
+심
+십
+싶
+싸
+써
+쓰
+쓴
+씌
+씨
+씩
+씬
+아
+악
+안
+않
+알
+야
+약
+얀
+양
+얘
+어
+언
+얼
+엄
+업
+없
+었
+엉
+에
+여
+역
+연
+염
+엽
+영
+옆
+예
+옛
+오
+온
+올
+옷
+옹
+와
+왔
+왜
+요
+욕
+용
+우
+운
+울
+웃
+워
+원
+월
+웠
+위
+윙
+유
+육
+윤
+으
+은
+을
+음
+응
+의
+이
+익
+인
+일
+읽
+임
+입
+있
+자
+작
+잔
+잖
+잘
+잡
+잤
+장
+재
+저
+전
+점
+정
+제
+져
+졌
+조
+족
+좀
+종
+좋
+죠
+주
+준
+줄
+중
+줘
+즈
+즐
+즘
+지
+진
+집
+짜
+짝
+쩌
+쪼
+쪽
+쫌
+쭈
+쯔
+찌
+찍
+차
+착
+찾
+책
+처
+천
+철
+체
+쳐
+쳤
+초
+촌
+추
+출
+춤
+춥
+춰
+치
+친
+칠
+침
+칩
+칼
+커
+켓
+코
+콩
+쿠
+퀴
+크
+큰
+큽
+키
+킨
+타
+태
+터
+턴
+털
+테
+토
+통
+투
+트
+특
+튼
+틀
+티
+팀
+파
+팔
+패
+페
+펜
+펭
+평
+포
+폭
+표
+품
+풍
+프
+플
+피
+필
+하
+학
+한
+할
+함
+합
+항
+해
+햇
+했
+행
+허
+험
+형
+혜
+호
+혼
+홀
+화
+회
+획
+후
+휴
+흐
+흔
+희
+히
+힘
+ﷺ
+ﷻ
+！
+，
+？
+�
+ợ
+ỹ
+ỉ
+ĩ
+ừ
+ự
+ặ
+ẹ
+ỷ
+ẳ
+ổ
+ỳ
+ẩ
+ử
+ỗ
+ẫ
+ỡ
+ẵ
+ỵ
+輪
+망
+版

requirements.txt CHANGED Viewed

@@ -1,7 +1,7 @@
-torch==1.13.1
-torchvision==0.14.1
-torchaudio==0.13.1
-numpy==1.23.5
 face_alignment==1.3.0
 imageio==2.19.3
 imageio-ffmpeg==0.4.7
@@ -21,6 +21,7 @@ facexlib
 dlib-bin
 gfpgan
 av
 safetensors
 gradio
 soundfile
@@ -42,4 +43,5 @@ torchdiffeq
 transformers_stream_generator
 vocos
 wandb
-x_transformers>=1.31.14

+torch==2.4.0
+torchaudio==2.4.0
+torchvision==0.19.0
 face_alignment==1.3.0
 imageio==2.19.3
 imageio-ffmpeg==0.4.7
 dlib-bin
 gfpgan
 av
 safetensors
 gradio
 soundfile
 transformers_stream_generator
 vocos
 wandb
+x_transformers>=1.31.14
+f5-tts

src/face3d/models/arcface_torch/inference.py CHANGED Viewed

@@ -20,8 +20,15 @@ def inference(weight, name, img):
     img = torch.from_numpy(img).unsqueeze(0).float()
     img.div_(255).sub_(0.5).div_(0.5)
     net = get_model(name, fp16=False)
-    net.load_state_dict(torch.load(weight))
     net.eval()
     feat = net(img).numpy()
     print(feat)

     img = torch.from_numpy(img).unsqueeze(0).float()
     img.div_(255).sub_(0.5).div_(0.5)
     net = get_model(name, fp16=False)
+    # Prefer weights_only=True (restricted unpickling). The TypeError fallback only covers
+    # PyTorch versions whose torch.load() rejects that keyword, not incompatible checkpoints.
+    try:
+        state_dict = torch.load(weight, weights_only=True)
+    except TypeError:
+        state_dict = torch.load(weight)
+    net.load_state_dict(state_dict)
     net.eval()
+    # Optional: For PyTorch 2.x, you can compile the model for speedup
+    # net = torch.compile(net)
     feat = net(img).numpy()
     print(feat)

src/face3d/models/networks.py CHANGED Viewed

@@ -59,11 +59,20 @@ def get_scheduler(optimizer, opt):
 def define_net_recon(net_recon, use_last_fc=False, init_path=None):
-    return ReconNetWrapper(net_recon, use_last_fc=use_last_fc, init_path=init_path)
 def define_net_recog(net_recog, pretrained_path=None):
     net = RecogNetWrapper(net_recog=net_recog, pretrained_path=pretrained_path)
     net.eval()
     return net
 class ReconNetWrapper(nn.Module):

 def define_net_recon(net_recon, use_last_fc=False, init_path=None):
+    model = ReconNetWrapper(net_recon, use_last_fc=use_last_fc, init_path=init_path)
+    # Only compile after the state_dict has finished loading!
+    return model
 def define_net_recog(net_recog, pretrained_path=None):
     net = RecogNetWrapper(net_recog=net_recog, pretrained_path=pretrained_path)
     net.eval()
+    # Use torch.compile for PyTorch 2.x+ if available
+    try:
+        import torch
+        net = torch.compile(net)
+        print("[INFO] RecogNetWrapper compiled with torch.compile for PyTorch 2.x+")
+    except AttributeError:
+        print("[INFO] torch.compile not available; running RecogNetWrapper without compilation.")
     return net
 class ReconNetWrapper(nn.Module):

src/face3d/util/util.py CHANGED Viewed

@@ -10,7 +10,6 @@ import argparse
 from argparse import Namespace
 import torchvision
 def str2bool(v):
     if isinstance(v, bool):
         return v

 from argparse import Namespace
 import torchvision
 def str2bool(v):
     if isinstance(v, bool):
         return v

src/gradio_demo.py CHANGED Viewed

@@ -165,6 +165,14 @@ class SadTalker():
         import gc; gc.collect()
-        return return_path

         import gc; gc.collect()
+        # Fix: Copy video to a temp file to avoid ffmpeg overwrite error in Gradio
+        import tempfile
+        if os.path.isfile(return_path):
+            tmp_video = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
+            tmp_video.close()
+            shutil.copy(return_path, tmp_video.name)
+            return tmp_video.name
+        else:
+            return return_path

src/test_audio2coeff.py CHANGED Viewed

@@ -14,7 +14,11 @@ from src.audio2exp_models.audio2exp import Audio2Exp
 from src.utils.safetensor_helper import load_x_from_safetensor
 def load_cpk(checkpoint_path, model=None, optimizer=None, device="cpu"):
-    checkpoint = torch.load(checkpoint_path, map_location=torch.device(device))
     if model is not None:
         model.load_state_dict(checkpoint['model'])
     if optimizer is not None:
@@ -119,5 +123,3 @@ class Audio2Coeff():
         #### relative head pose
         coeffs_pred_numpy[:, 64:70] = coeffs_pred_numpy[:, 64:70] + ( refpose_coeff[:num_frames, :] - refpose_coeff[0:1, :] )
         return coeffs_pred_numpy

 from src.utils.safetensor_helper import load_x_from_safetensor
 def load_cpk(checkpoint_path, model=None, optimizer=None, device="cpu"):
+    # Prefer weights_only=True (restricted unpickling). The TypeError fallback only covers
+    # PyTorch versions whose torch.load() rejects that keyword, not incompatible checkpoints.
+    try:
+        checkpoint = torch.load(checkpoint_path, map_location=torch.device(device), weights_only=True)
+    except TypeError:
+        checkpoint = torch.load(checkpoint_path, map_location=torch.device(device))
     if model is not None:
         model.load_state_dict(checkpoint['model'])
     if optimizer is not None:
         #### relative head pose
         coeffs_pred_numpy[:, 64:70] = coeffs_pred_numpy[:, 64:70] + ( refpose_coeff[:num_frames, :] - refpose_coeff[0:1, :] )
         return coeffs_pred_numpy

src/utils/model2safetensor.py CHANGED Viewed

@@ -44,8 +44,11 @@ def load_cpk_facevid2vid(checkpoint_path, generator=None, discriminator=None,
                         kp_detector=None, he_estimator=None, optimizer_generator=None,
                         optimizer_discriminator=None, optimizer_kp_detector=None,
                         optimizer_he_estimator=None, device="cpu"):
-    checkpoint = torch.load(checkpoint_path, map_location=torch.device(device))
     if generator is not None:
         generator.load_state_dict(checkpoint['generator'])
     if kp_detector is not None:
@@ -138,4 +141,4 @@ model = SadTalker(kp_extractor, generator, netG, audio2pose_model, net_recon)
 save_file(model.state_dict(), "checkpoints/SadTalker_V0.0.2_"+str(size)+".safetensors")
 ### test
-load_cpk_facevid2vid_safetensor('checkpoints/SadTalker_V0.0.2_'+str(size)+'.safetensors', kp_detector=kp_extractor, generator=generator, he_estimator=None)

                         kp_detector=None, he_estimator=None, optimizer_generator=None,
                         optimizer_discriminator=None, optimizer_kp_detector=None,
                         optimizer_he_estimator=None, device="cpu"):
+    # Prefer weights_only=True (restricted unpickling). The TypeError fallback only covers
+    # PyTorch versions whose torch.load() rejects that keyword, not incompatible checkpoints.
+    try:
+        checkpoint = torch.load(checkpoint_path, map_location=torch.device(device), weights_only=True)
+    except TypeError:
+        checkpoint = torch.load(checkpoint_path, map_location=torch.device(device))
     if generator is not None:
         generator.load_state_dict(checkpoint['generator'])
     if kp_detector is not None:
 save_file(model.state_dict(), "checkpoints/SadTalker_V0.0.2_"+str(size)+".safetensors")
 ### test
+load_cpk_facevid2vid_safetensor('checkpoints/SadTalker_V0.0.2_'+str(size)+'.safetensors', kp_detector=kp_extractor, generator=generator, he_estimator=None)