from fastapi import FastAPI
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os

# GGUF model configuration
REPO_ID = "TheBloke/deepseek-coder-6.7B-instruct-GGUF"
FILENAME = "deepseek-coder-6.7b-instruct.Q4_K_M.gguf"

app = FastAPI()

# Download and cache the GGUF model
print(f"Downloading {FILENAME} from {REPO_ID}...")
model_path = hf_hub_download(
    repo_id=REPO_ID,
    filename=FILENAME,
    cache_dir=os.getenv("HF_HOME", "./models"),
)
print(f"Model downloaded to: {model_path}")

# Load the model with llama-cpp-python
print("Loading model into memory...")
llm = Llama(
    model_path=model_path,
    n_ctx=2048,       # Context window
    n_threads=4,      # CPU threads
    n_gpu_layers=0,   # Use CPU only (set >0 if GPU available)
    verbose=False,
)
print("Model loaded successfully!")


@app.post("/v1/chat/completions")
def chat(req: dict):
    messages = req.get("messages", [])
    max_tokens = req.get("max_tokens", 256)
    temperature = req.get("temperature", 0.7)

    # Use llama-cpp-python's built-in chat completion
    response = llm.create_chat_completion(
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
        # Stop at DeepSeek Coder's end-of-turn token and common plain-text turn markers
        stop=["<|EOT|>", "User:", "###"],
    )

    return {
        "choices": [{
            "message": {
                "role": "assistant",
                "content": response["choices"][0]["message"]["content"],
            }
        }]
    }


@app.get("/")
def root():
    return {"status": "DeepSeek API is online (GGUF)"}
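

# A minimal way to serve and exercise this app (a sketch; assumes uvicorn is
# installed and that this file is saved as main.py -- adjust the module name
# to match your filename):
#
#   uvicorn main:app --host 0.0.0.0 --port 8000
#
#   curl http://localhost:8000/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"messages": [{"role": "user", "content": "Write a function that reverses a string."}]}'

if __name__ == "__main__":
    # Optional: run directly with `python main.py` instead of the uvicorn CLI
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)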