```python
from fastapi import FastAPI
from pydantic import BaseModel
from llama_cpp import Llama

app = FastAPI()

# Load the model from the Hugging Face Hub
llm = Llama.from_pretrained(
    repo_id="unsloth/phi-4-GGUF",
    filename="phi-4-Q4_K_M.gguf",
    n_ctx=16384,
)

# Define the request model
class ChatRequest(BaseModel):
    system_prompt: str
    query: str

# Register the chat endpoint
@app.post("/chat")
async def chat(request: ChatRequest):
    try:
        response = llm.create_chat_completion(
            messages=[
                {"role": "system", "content": request.system_prompt},
                {"role": "user", "content": request.query},
            ]
        )
        return {"response": response}
    except Exception as e:
        # Log the error for debugging
        print("Error during model inference:", e)
        return {"error": str(e)}
```