""" Llama 3.2 3B Fine-tuned Chatbot Fine-tuned conversational model based on FineTome-100k Deployed on HuggingFace Spaces """ import gradio as gr from llama_cpp import Llama from huggingface_hub import hf_hub_download import os print("šŸš€ Starting Llama 3.2 3B Chatbot...") print(f"šŸ“‚ Working directory: {os.getcwd()}") # Download GGUF model (using absolute path) print("šŸ“„ Downloading model...") try: # Use absolute path in HuggingFace Spaces model_dir = "/app/models" os.makedirs(model_dir, exist_ok=True) print(f"šŸ“‚ Model directory: {model_dir}") model_path = hf_hub_download( repo_id="handsomeLiu/ID2223-llama-3.2-3b-finetune-lora_model_new", filename="llama-3.2-3b-finetuned-Q4_K_M.gguf", local_dir=model_dir, # Absolute path ) print(model_path) print(f"āœ… Model downloaded: {model_path}") # Verify file if os.path.exists(model_path): file_size = os.path.getsize(model_path) / (1024**3) print(f"šŸ“Š File size: {file_size:.2f} GB") # Check GGUF header with open(model_path, 'rb') as f: header = f.read(4) print(f"šŸ” File header: {header}") if header == b'GGUF': print("āœ… Valid GGUF header detected") else: print(f"āŒ INVALID GGUF! Expected b'GGUF', got {header}") print("šŸ’” This file is NOT valid GGUF format") else: raise FileNotFoundError(f"File not found: {model_path}") except Exception as e: print(f"āŒ Error: {e}") raise # Load model into llama.cpp print("\nšŸ”§ Loading model into llama.cpp...") try: llm = Llama( model_path=model_path, n_ctx=2048, n_threads=4, n_gpu_layers=0, verbose=True # Enable verbose logging for debugging ) print("āœ… Model loaded successfully!") except Exception as e: print(f"āŒ Load failed: {e}") print(f"šŸ“‚ File: {model_path}") print(f"šŸ“Š Size: {os.path.getsize(model_path) / (1024**3):.2f} GB") print("\nšŸ’” File downloaded but cannot be loaded by llama.cpp") print(" → The file is NOT in valid GGUF format") print("\nšŸŽÆ Solution: Use unsloth in Colab to generate correct GGUF") raise def chat(message, history, temperature, top_p, max_tokens): """ Chat function for generating responses Args: message: Current user input history: Conversation history [{'role': 'user', 'content': '...'}, ...] 

def chat(message, history, temperature, top_p, max_tokens):
    """
    Chat function for generating responses.

    Args:
        message: Current user input
        history: Conversation history, e.g. [{'role': 'user', 'content': '...'}, ...]
        temperature: Temperature parameter (controls randomness)
        top_p: Nucleus sampling parameter
        max_tokens: Maximum number of tokens to generate
    """
    # Build the prompt in the Llama 3 instruct chat format
    prompt = "<|begin_of_text|>"

    # Add the conversation history
    for msg in history:
        role = msg["role"]
        content = msg["content"]
        prompt += f"<|start_header_id|>{role}<|end_header_id|>\n\n{content}<|eot_id|>"

    # Add the current user message and open the assistant turn
    prompt += f"<|start_header_id|>user<|end_header_id|>\n\n{message}<|eot_id|>"
    prompt += "<|start_header_id|>assistant<|end_header_id|>\n\n"

    # Generate the response
    try:
        output = llm(
            prompt,
            max_tokens=int(max_tokens),
            temperature=temperature,
            top_p=top_p,
            echo=False,
            stop=["<|eot_id|>", "<|end_of_text|>"],  # Stop tokens
        )
        response = output["choices"][0]["text"].strip()
        return response
    except Exception as e:
        return f"āŒ Generation error: {str(e)}"


def chat_wrapper(message, history, temperature, top_p, max_tokens):
    """
    Wrapper around chat() for the Gradio Blocks interaction:
    echoes the user message immediately, then yields the final reply.
    """
    if not message:
        yield "", history
        return

    # Add the user message to the history and show it right away
    new_history = history + [{"role": "user", "content": message}]
    yield "", new_history

    # Generate the response
    response = chat(message, history, temperature, top_p, max_tokens)

    # Add the assistant response to the history
    new_history.append({"role": "assistant", "content": response})
    yield "", new_history


# Create the Gradio interface using Blocks for full control over the layout
with gr.Blocks(theme=gr.themes.Soft(), title="Llama 3.2 3B Chatbot") as demo:
    gr.Markdown("""
# šŸ¦™ Llama 3.2 3B Fine-tuned Chatbot

Llama 3.2 3B model fine-tuned on the **FineTome-100k** dataset

- šŸ’¾ **Model**: Llama 3.2 3B Instruct + LoRA
- šŸ“Š **Data**: 100k high-quality conversations
- āš™ļø **Quantization**: Q4_K_M (CPU optimized)
- šŸ”— **GitHub**: [ID2223_lab2](https://github.com/Jiananliu12138/ID2223_lab2)
""")

    chatbot = gr.Chatbot(
        height=500,
        show_label=False,
        avatar_images=(None, "šŸ¦™"),
        type="messages",
    )

    with gr.Row():
        msg = gr.Textbox(
            placeholder="Enter your question...",
            container=False,
            scale=7,
        )
        submit_btn = gr.Button("šŸ“¤ Send", scale=1)

    with gr.Accordion(label="āš™ļø Advanced Settings", open=False):
        temperature = gr.Slider(
            minimum=0,
            maximum=2,
            value=0.7,
            step=0.1,
            label="šŸŒ”ļø Temperature",
            info="Controls randomness: low = deterministic, high = creative",
        )
        top_p = gr.Slider(
            minimum=0,
            maximum=1,
            value=0.9,
            step=0.05,
            label="šŸŽÆ Top P",
            info="Nucleus sampling: only consider tokens within the top-P probability mass",
        )
        max_tokens = gr.Slider(
            minimum=64,
            maximum=512,
            value=256,
            step=64,
            label="šŸ“ Max Tokens",
            info="Maximum generation length",
        )

    # Bind the handler ONLY to the button click (Enter-key submission is disabled)
    submit_btn.click(
        fn=chat_wrapper,
        inputs=[msg, chatbot, temperature, top_p, max_tokens],
        outputs=[msg, chatbot],
        concurrency_limit=1,
    )

    gr.Markdown("""
---
### šŸ“Š Training Details
- **Base Model**: unsloth/Llama-3.2-3B-Instruct
- **Fine-tuning Method**: LoRA (r=16, alpha=16)
- **Training Data**: FineTome-100k
- **Training Steps**: 1 full epoch (~12,500 steps)
- **Checkpoint**: Saved every 500 steps

### šŸ’” Usage Tips
- **Temperature 0.3-0.7**: Best for factual Q&A
- **Temperature 0.7-1.2**: Best for creative writing
- **Max Tokens 128**: Best for short answers
- **Max Tokens 256-512**: Best for detailed explanations

### šŸŽÆ Project Features
āœ… Complete checkpoint mechanism (supports resuming training)
āœ… Saved to Google Drive (permanent storage)
āœ… Converted to GGUF format (optimized for CPU inference)
āœ… Deployed on HuggingFace Spaces (free access)

**Made for the ID2223 Scalable Machine Learning Course**
""")
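    # A hedged sketch, not wired up: the click binding above deliberately leaves
    # Enter-key submission disabled. If Enter-to-send were wanted as well, the same
    # handler could also be bound to the textbox's submit event:
    #
    #   msg.submit(
    #       fn=chat_wrapper,
    #       inputs=[msg, chatbot, temperature, top_p, max_tokens],
    #       outputs=[msg, chatbot],
    #       concurrency_limit=1,
    #   )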
""") # Launch the application if __name__ == "__main__": demo.queue() # Enable queue for multi-user support demo.launch( share=False, show_error=True, server_name="0.0.0.0", server_port=7860 )