"""
Llama 3.2 3B Fine-tuned Chatbot
Fine-tuned conversational model based on FineTome-100k
Deployed on HuggingFace Spaces
"""

import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os

print("🚀 Starting Llama 3.2 3B Chatbot...")
print(f"📂 Working directory: {os.getcwd()}")

# Download GGUF model (using absolute path)
print("📥 Downloading model...")
try:
    # Use absolute path in HuggingFace Spaces
    model_dir = "/app/models"
    os.makedirs(model_dir, exist_ok=True)
    print(f"📂 Model directory: {model_dir}")
    
    model_path = hf_hub_download(
        repo_id="handsomeLiu/ID2223-llama-3.2-3b-finetune-lora_model_new",
        filename="llama-3.2-3b-finetuned-Q4_K_M.gguf",
        local_dir=model_dir,  # Absolute path
    )
    print(model_path)
    print(f"✅ Model downloaded: {model_path}")
    
    # Verify file
    if os.path.exists(model_path):
        file_size = os.path.getsize(model_path) / (1024**3)
        print(f"📊 File size: {file_size:.2f} GB")
        
        # Check GGUF header
        with open(model_path, 'rb') as f:
            header = f.read(4)
            print(f"🔍 File header: {header}")
            if header == b'GGUF':
                print("✅ Valid GGUF header detected")
            else:
                print(f"❌ INVALID GGUF! Expected b'GGUF', got {header}")
                print("💡 This file is NOT valid GGUF format")
    else:
        raise FileNotFoundError(f"File not found: {model_path}")
        
except Exception as e:
    print(f"❌ Error: {e}")
    raise

# Load model into llama.cpp
print("\n🔧 Loading model into llama.cpp...")
try:
    llm = Llama(
        model_path=model_path,
        n_ctx=2048,
        n_threads=4,
        n_gpu_layers=0,
        verbose=True  # Enable verbose logging for debugging
    )
    print("✅ Model loaded successfully!")
except Exception as e:
    print(f"❌ Load failed: {e}")
    print(f"📂 File: {model_path}")
    print(f"📊 Size: {os.path.getsize(model_path) / (1024**3):.2f} GB")
    print("\n💡 File downloaded but cannot be loaded by llama.cpp")
    print("   → The file is NOT in valid GGUF format")
    print("\n🎯 Solution: Use unsloth in Colab to generate correct GGUF")
    raise

def chat(message, history, temperature, top_p, max_tokens):
    """
    Chat function for generating responses
    
    Args:
        message: Current user input
        history: Conversation history [{'role': 'user', 'content': '...'}, ...]
        temperature: Temperature parameter (controls randomness)
        top_p: Nucleus sampling parameter
        max_tokens: Maximum number of tokens to generate
    """
    # Build Llama 3.1 format prompt
    prompt = "<|begin_of_text|>"
    
    # Add conversation history
    for msg in history:
        role = msg['role']
        content = msg['content']
        prompt += f"<|start_header_id|>{role}<|end_header_id|>\n\n{content}<|eot_id|>"
    
    # Add current user message
    prompt += f"<|start_header_id|>user<|end_header_id|>\n\n{message}<|eot_id|>"
    prompt += "<|start_header_id|>assistant<|end_header_id|>\n\n"
    
    # Generate response
    try:
        output = llm(
            prompt,
            max_tokens=int(max_tokens),
            temperature=temperature,
            top_p=top_p,
            echo=False,
            stop=["<|eot_id|>", "<|end_of_text|>"]  # Stop tokens
        )
        response = output['choices'][0]['text'].strip()
        return response
    except Exception as e:
        return f"❌ Generation error: {str(e)}"

def chat_wrapper(message, history, temperature, top_p, max_tokens):
    """
    Wrapper for chat function to handle Gradio Blocks interaction
    """
    if not message:
        return "", history
    
    # Add user message to history
    new_history = history + [{"role": "user", "content": message}]
    yield "", new_history
    
    # Generate response
    response = chat(message, history, temperature, top_p, max_tokens)
    
    # Add assistant response to history
    new_history.append({"role": "assistant", "content": response})
    yield "", new_history

# Create Gradio interface using Blocks for full control
with gr.Blocks(theme=gr.themes.Soft(), title="Llama 3.2 3B Chatbot") as demo:
    gr.Markdown("""
    # 🦙 Llama 3.2 3B Fine-tuned Chatbot
    
    Llama 3.2 3B model fine-tuned on **FineTome-100k** dataset
    
    - 💾 **Model**: Llama 3.2 3B Instruct + LoRA
    - 📊 **Data**: 100k high-quality conversations
    - ⚙️ **Quantization**: Q4_K_M (CPU optimized)
    - 🔗 **GitHub**: [ID2223_lab2](https://github.com/Jiananliu12138/ID2223_lab2)
    """)
    
    chatbot = gr.Chatbot(
        height=500,
        show_label=False,
        avatar_images=(None, "🦙"),
        type="messages"
    )
    
    with gr.Row():
        msg = gr.Textbox(
            placeholder="Enter your question...",
            container=False,
            scale=7
        )
        submit_btn = gr.Button("📤 Send", scale=1)
    
    with gr.Accordion(label="⚙️ Advanced Settings", open=False):
        temperature = gr.Slider(
            minimum=0,
            maximum=2,
            value=0.7,
            step=0.1,
            label="🌡️ Temperature",
            info="Controls randomness: low=deterministic, high=creative"
        )
        top_p = gr.Slider(
            minimum=0,
            maximum=1,
            value=0.9,
            step=0.05,
            label="🎯 Top P",
            info="Nucleus sampling: only consider top P probability tokens"
        )
        max_tokens = gr.Slider(
            minimum=64,
            maximum=512,
            value=256,
            step=64,
            label="📏 Max Tokens",
            info="Maximum generation length"
        )
    
    # Bind click event ONLY to the button (disable enter key submission)
    submit_btn.click(
        fn=chat_wrapper,
        inputs=[msg, chatbot, temperature, top_p, max_tokens],
        outputs=[msg, chatbot],
        concurrency_limit=1
    )
    
    gr.Markdown("""
    ---
    ### 📊 Training Details
    
    - **Base Model**: unsloth/Llama-3.2-3B-Instruct
    - **Fine-tuning Method**: LoRA (r=16, alpha=16)
    - **Training Data**: FineTome-100k
    - **Training Steps**: 1 full epoch (~12,500 steps)
    - **Checkpoint**: Saved every 500 steps
    
    ### 💡 Usage Tips
    
    - **Temperature 0.3-0.7**: Best for factual Q&A
    - **Temperature 0.7-1.2**: Best for creative writing
    - **Max Tokens 128**: Best for short answers
    - **Max Tokens 256-512**: Best for detailed explanations
    
    ### 🎯 Project Features
    
    ✅ Complete checkpoint mechanism (supports resume training)  
    ✅ Saved to Google Drive (permanent storage)  
    ✅ Converted to GGUF format (CPU inference optimized)  
    ✅ Deployed on HuggingFace Spaces (free access)  
    
    **Made for ID2223 Scalable Machine Learning Course**
    """)

# Launch the application
if __name__ == "__main__":
    demo.queue()  # Enable queue for multi-user support
    demo.launch(
        share=False,
        show_error=True,
        server_name="0.0.0.0",
        server_port=7860
    )