Spaces:
Paused
Paused
# Stdlib imports needed by the dependency bootstrap below.
import os
import subprocess
import sys
# Fix OMP_NUM_THREADS issue before any imports
# NOTE(review): OpenMP reads this env var at native-library load time, so it
# must be set before numpy/torch are imported — TODO confirm "4" is the
# intended thread count for the target hardware.
os.environ["OMP_NUM_THREADS"] = "4"
# Install dependencies programmatically to avoid conflicts
def setup_dependencies():
    """Install the dev build of transformers from GitHub, once per machine.

    A marker file under /tmp records a successful install so later restarts
    on the same container skip the slow ``--force-reinstall`` pip run.
    Failures are logged and deliberately swallowed: the app should still try
    to start with whatever transformers version is already present.
    """
    marker = '/tmp/deps_installed'  # single source of truth for the sentinel path
    try:
        # Check if already installed (marker is only written after success)
        if os.path.exists(marker):
            return
        print("Installing transformers dev version...")
        subprocess.check_call([
            sys.executable, "-m", "pip", "install", "--force-reinstall", "--no-cache-dir",
            "git+https://github.com/huggingface/transformers.git"
        ])
        # Mark as installed so the next start-up is fast
        with open(marker, 'w') as f:
            f.write('done')
    # Narrowed from a bare `except Exception`: only install/IO failures are
    # best-effort; anything else (e.g. a typo-level bug) should surface.
    except (subprocess.CalledProcessError, OSError) as e:
        print(f"Dependencies setup error: {e}")


# Run setup before the heavyweight ML imports below so they pick up the
# freshly installed transformers build.
setup_dependencies()
# Heavyweight imports — intentionally placed after setup_dependencies() so
# the just-installed transformers version is the one that gets loaded.
import spaces
import gradio as gr
# NOTE(review): `Demo` does not appear to be used in this file — confirm
# against the rest of the project before removing.
from util import Config, NemoAudioPlayer, KaniModel, Demo
import numpy as np
import torch
# Get HuggingFace token (None if unset; presumably required for gated
# model downloads — verify against KaniModel)
token_ = os.getenv('HF_TOKEN')
# Model configurations: dropdown choice -> util.Config for that checkpoint.
# 'base' keeps the default Config; the two expresso variants pin a specific
# speaker checkpoint via model_name.
models_configs = {
    'base': Config(),
    'female': Config(model_name='nineninesix/lfm-nano-codec-expresso-ex02-v.0.2'),
    'male': Config(model_name='nineninesix/lfm-nano-codec-expresso-ex01-v.0.1'),
}
# Global model registry, built exactly once at start-up so every request
# reuses the already-loaded weights.
def _load_model_bank():
    """Instantiate the shared audio player and one KaniModel per config."""
    shared_player = NemoAudioPlayer(Config())
    bank = {}
    for model_name, config in models_configs.items():
        print(f"Loading {model_name}...")
        bank[model_name] = KaniModel(config, shared_player, token_)
        print(f"{model_name} loaded!")
    print("All models loaded!")
    return shared_player, bank


player, models = _load_model_bank()
def generate_speech_gpu(text, model_choice, t, top_p, rp, max_tok):
    """Synthesize speech for *text* with the selected model.

    Returns ``((sample_rate, waveform), time_report)`` on success, or
    ``(None, message)`` when the input is invalid or generation fails.
    Sampling knobs: temperature *t*, nucleus *top_p*, repetition penalty
    *rp*, and *max_tok* generated tokens.
    """
    # Guard clauses: validate the UI inputs before touching any model state.
    if not text.strip():
        return None, "Please enter text for speech generation."
    if not model_choice:
        return None, "Please select a model."
    try:
        # Report GPU availability (informational; model placement is handled
        # inside KaniModel).
        device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {device}")
        print(f"Generating speech with {model_choice}...")
        audio, _, time_report = models[model_choice].run_model(text, t, top_p, rp, max_tok)
        # NOTE(review): sample rate is hard-coded — presumably matches the
        # NemoAudioPlayer codec output; confirm against util.Config.
        sample_rate = 22050
        print("Speech generation completed!")
        return (sample_rate, audio), time_report
    except Exception as e:
        # Broad on purpose: any failure becomes a user-visible UI message
        # instead of a stack trace in the Space.
        print(f"Error during generation: {str(e)}")
        return None, f"❌ Error during generation: {str(e)}"
# Create Gradio interface: two-column layout — controls on the left,
# generated audio and timing report on the right.
with gr.Blocks(title="😻 KaniTTS - Text to Speech", theme=gr.themes.Default()) as demo:
    gr.Markdown("# 😻 KaniTTS: Fast and Expressive Speech Generation Model")
    gr.Markdown("Select a model and enter text to generate emotional speech")
    with gr.Row():
        with gr.Column(scale=1):
            # Model picker — choices mirror the keys of models_configs;
            # first key ('base') is the default selection.
            model_dropdown = gr.Dropdown(
                choices=list(models_configs.keys()),
                value=list(models_configs.keys())[0],
                label="Selected Model",
                info="Base generates random voices"
            )
            text_input = gr.Textbox(
                label="Text",
                placeholder="Enter your text ...",
                lines=3,
                max_lines=10
            )
            # Sampling knobs, collapsed by default; defaults here match the
            # values used in the examples below.
            with gr.Accordion("Settings", open=False):
                temp = gr.Slider(
                    minimum=0.1, maximum=1.5, value=0.6, step=0.05,
                    label="Temp",
                )
                top_p = gr.Slider(
                    minimum=0.1, maximum=1.0, value=0.95, step=0.05,
                    label="Top P",
                )
                rp = gr.Slider(
                    minimum=1.0, maximum=2.0, value=1.1, step=0.05,
                    label="Repetition Penalty",
                )
                max_tok = gr.Slider(
                    minimum=100, maximum=2000, value=1200, step=100,
                    label="Max Tokens",
                )
            generate_btn = gr.Button("Run", variant="primary", size="lg")
        with gr.Column(scale=1):
            # type="numpy" matches generate_speech_gpu's
            # (sample_rate, waveform) return shape.
            audio_output = gr.Audio(
                label="Generated Audio",
                type="numpy"
            )
            time_report_output = gr.Textbox(
                label="Time Report",
                interactive=False,
                value="Ready to generate speech",
                lines=3
            )
    # GPU generation event: wire the button to the generator; input order
    # must match generate_speech_gpu's positional parameters.
    generate_btn.click(
        fn=generate_speech_gpu,
        inputs=[text_input, model_dropdown, temp, top_p, rp, max_tok],
        outputs=[audio_output, time_report_output]
    )
    with gr.Row():
        # Each example row is [text, model, temp, top_p, rp, max_tok].
        examples = [
            ["Anyway, um, so, um, tell me, tell me all about her. I mean, what's she like? Is she really, you know, pretty?", "male", 0.6, 0.95, 1.1, 1200],
            ["No, that does not make you a failure. No, sweetie, no. It just, uh, it just means that you're having a tough time...", "male", 0.6, 0.95, 1.1, 1200],
            ["I-- Oh, I am such an idiot sometimes. I'm so sorry. Um, I-I don't know where my head's at.", "male", 0.6, 0.95, 1.1, 1200],
            ["Got it. $300,000. I can definitely help you get a very good price for your property by selecting a realtor.", "female", 0.6, 0.95, 1.1, 1200],
            ["Holy fu- Oh my God! Don't you understand how dangerous it is, huh?", "male", 0.6, 0.95, 1.1, 1200],
            ["You make my days brighter, and my wildest dreams feel like reality. How do you do that?", "female", 0.6, 0.95, 1.1, 1200],
            ["Great, and just a couple quick questions so we can match you with the right buyer. Is your home address still 330 East Charleston Road?", "female", 0.6, 0.95, 1.1, 1200],
            ["Oh, yeah. I mean did you want to get a quick snack together or maybe something before you go?", "female", 0.6, 0.95, 1.1, 1200],
        ]
        # NOTE(review): cache_examples=True runs generate_speech_gpu for
        # every example at start-up to pre-render outputs — confirm this
        # start-up cost is intended.
        gr.Examples(
            examples=examples,
            inputs=[text_input, model_dropdown, temp, top_p, rp, max_tok],
            fn=generate_speech_gpu,
            outputs=[audio_output, time_report_output],
            cache_examples=True,
        )
| if __name__ == "__main__": | |
| demo.launch( | |
| server_name="0.0.0.0", | |
| server_port=7860, | |
| show_error=True | |
| ) |