sharath88 committed on
Commit
f06a85e
·
1 Parent(s): 75d3db4

Switch to Hugging Face InferenceClient for chat backend

Browse files

Replaced requests-based API calls with huggingface_hub's InferenceClient for model inference, updated model selection to Gemma, and refactored prompt construction and persona handling. Added CORS middleware and removed template/static serving for a pure API backend. Updated requirements.txt to include huggingface_hub.

Files changed (2) hide show
  1. main.py +98 -85
  2. requirements.txt +1 -0
main.py CHANGED
@@ -1,116 +1,129 @@
1
  import os
2
- import requests
3
- from typing import List, Literal, Optional
4
 
5
- from fastapi import FastAPI, Request
6
- from fastapi.responses import HTMLResponse, JSONResponse
7
- from fastapi.staticfiles import StaticFiles
8
- from fastapi.templating import Jinja2Templates
9
  from pydantic import BaseModel
 
10
 
11
- # -------------------- Config --------------------
12
 
13
- HF_MODEL_ID = "HuggingFaceH4/zephyr-7b-beta"
14
- HF_API_TOKEN = os.getenv("HF_API_TOKEN") # set in Space → Settings → Secrets
15
 
16
- HF_API_URL = (
17
- f"https://router.huggingface.co/hf-inference/models/"
18
- f"{HF_MODEL_ID}/v1/chat/completions"
19
- )
20
-
21
- DEFAULT_SYSTEM_PROMPT = (
22
- "You are a helpful, concise AI assistant. "
23
- "Answer clearly in plain English unless the user asks otherwise."
24
- )
25
-
26
- if HF_API_TOKEN is None:
27
  raise RuntimeError(
28
- "HF_API_TOKEN is not set. Add it in Space settings Variables & secrets."
 
29
  )
30
 
31
- # -------------------- FastAPI setup --------------------
 
32
 
33
- app = FastAPI()
34
 
35
- # serve /static and /templates
36
- app.mount("/static", StaticFiles(directory="static"), name="static")
37
- templates = Jinja2Templates(directory="templates")
38
 
 
 
 
 
 
 
 
39
 
40
- class ChatMessage(BaseModel):
41
- role: Literal["user", "assistant", "system"]
42
- content: str
43
 
 
 
 
44
 
45
  class ChatRequest(BaseModel):
46
- messages: List[ChatMessage]
47
  temperature: float = 0.7
48
- max_new_tokens: int = 256
49
- system_prompt: Optional[str] = None
50
-
51
 
52
- # ------------- Routes -------------
 
 
53
 
54
- @app.get("/", response_class=HTMLResponse)
55
- async def home(request: Request):
56
- # This renders templates/index.html instead of JSON
57
- return templates.TemplateResponse("index.html", {"request": request})
58
 
 
59
 
60
- # ------------- HF Router helper -------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
62
- def call_hf_chat(req: ChatRequest) -> str:
63
  """
64
- Call Zephyr via the new HF router chat-completions API
65
- (OpenAI-style).
66
  """
67
- system_prompt = req.system_prompt or DEFAULT_SYSTEM_PROMPT
68
-
69
- # prepend system message
70
- messages = [{"role": "system", "content": system_prompt}]
71
- for m in req.messages:
72
- messages.append({"role": m.role, "content": m.content})
73
-
74
- # clamp params to safe values
75
- temperature = max(0.1, min(req.temperature, 1.5))
76
- max_tokens = max(32, min(req.max_new_tokens, 512))
77
-
78
- headers = {
79
- "Authorization": f"Bearer {HF_API_TOKEN}",
80
- "Content-Type": "application/json",
81
- }
 
 
 
 
 
 
 
 
 
 
 
 
82
 
83
- body = {
84
- "messages": messages,
85
- "temperature": temperature,
86
- "max_tokens": max_tokens,
87
- "stream": False,
88
- }
89
 
90
- resp = requests.post(HF_API_URL, headers=headers, json=body, timeout=60)
 
 
91
 
92
- if not resp.ok:
93
- raise RuntimeError(f"Inference API error {resp.status_code}: {resp.text}")
 
 
 
 
 
94
 
95
- data = resp.json()
96
- # OpenAI-style: choices[0].message.content
97
- try:
98
- return data["choices"][0]["message"]["content"].strip()
99
- except Exception:
100
- raise RuntimeError(f"Unexpected response format: {data}")
101
 
 
 
102
 
103
- @app.post("/chat")
104
- async def chat_endpoint(payload: ChatRequest):
105
- if not payload.messages:
106
- return JSONResponse(
107
- {"reply": "", "error": "No messages provided."}, status_code=400
108
- )
109
 
110
- try:
111
- reply = call_hf_chat(payload)
112
- return {"reply": reply}
113
- except Exception as e:
114
- return JSONResponse(
115
- {"reply": "", "error": str(e)}, status_code=500
116
- )
 
1
  import os
2
+ from typing import List, Literal, Dict, Any
 
3
 
4
+ from fastapi import FastAPI, HTTPException
5
+ from fastapi.middleware.cors import CORSMiddleware
 
 
6
  from pydantic import BaseModel
7
+ from huggingface_hub import InferenceClient
8
 
9
+ # ---------- Config ----------
10
 
11
+ HF_TOKEN = os.environ.get("HF_TOKEN") # Set this in Space secrets
12
+ MODEL_ID = "google/gemma-2-2b-it" # Medium-sized instruct model
13
 
14
+ if HF_TOKEN is None:
 
 
 
 
 
 
 
 
 
 
15
  raise RuntimeError(
16
+ "HF_TOKEN is not set. Go to Space → Settings Repository secrets and "
17
+ "add HF_TOKEN with your Hugging Face access token."
18
  )
19
 
20
+ # Inference client (uses HF Inference API / router under the hood)
21
+ hf_client = InferenceClient(model=MODEL_ID, token=HF_TOKEN)
22
 
23
+ # ---------- FastAPI setup ----------
24
 
25
+ app = FastAPI(title="Zephyr Chat Demo (Gemma backend)")
 
 
26
 
27
+ app.add_middleware(
28
+ CORSMiddleware,
29
+ allow_origins=["*"], # ok for demo
30
+ allow_credentials=True,
31
+ allow_methods=["*"],
32
+ allow_headers=["*"],
33
+ )
34
 
35
+ # ---------- Data models ----------
36
+
37
+ Role = Literal["user", "assistant", "system"]
38
 
39
+ class Message(BaseModel):
40
+ role: Role
41
+ content: str
42
 
43
  class ChatRequest(BaseModel):
44
+ messages: List[Message]
45
  temperature: float = 0.7
46
+ max_tokens: int = 256
47
+ persona: str = "General Assistant"
 
48
 
49
+ class ChatResponse(BaseModel):
50
+ reply: str
51
+ messages: List[Message]
52
 
53
+ # ---------- Simple in-memory sessions ----------
 
 
 
54
 
55
+ sessions: Dict[str, List[Message]] = {}
56
 
57
+ def build_system_prompt(persona: str) -> str:
58
+ if persona == "Code Helper":
59
+ return (
60
+ "You are a helpful coding assistant. Explain things clearly, "
61
+ "show small code snippets, and avoid hallucinating libraries or APIs."
62
+ )
63
+ elif persona == "Data Tutor":
64
+ return (
65
+ "You are a teacher who explains data, statistics, and ML concepts "
66
+ "with simple examples and step-by-step reasoning."
67
+ )
68
+ else:
69
+ return (
70
+ "You are a friendly, concise AI assistant. "
71
+ "Answer clearly and avoid unsafe or speculative advice."
72
+ )
73
 
74
+ def build_prompt(messages: List[Message], persona: str) -> str:
75
  """
76
+ Convert chat history into a single text-generation prompt.
 
77
  """
78
+ system_prompt = build_system_prompt(persona)
79
+ lines = [f"System: {system_prompt}", ""]
80
+ for m in messages:
81
+ prefix = "User" if m.role == "user" else "Assistant" if m.role == "assistant" else "System"
82
+ lines.append(f"{prefix}: {m.content}")
83
+ lines.append("Assistant:")
84
+ return "\n".join(lines)
85
+
86
+ def call_llm(prompt: str, temperature: float, max_tokens: int) -> str:
87
+ """
88
+ Call HF Inference text-generation endpoint via InferenceClient.
89
+ """
90
+ try:
91
+ text = hf_client.text_generation(
92
+ prompt,
93
+ max_new_tokens=max_tokens,
94
+ temperature=temperature,
95
+ do_sample=True,
96
+ repetition_penalty=1.1,
97
+ return_full_text=False, # only new assistant text
98
+ )
99
+ return text.strip()
100
+ except Exception as e:
101
+ raise HTTPException(
102
+ status_code=500,
103
+ detail=f"Inference API error: {e}"
104
+ )
105
 
106
+ # ---------- Routes ----------
 
 
 
 
 
107
 
108
+ @app.get("/")
109
+ def health():
110
+ return {"status": "ok", "message": "Zephyr Chat Demo backend running."}
111
 
112
+ @app.post("/chat", response_model=ChatResponse)
113
+ def chat(req: ChatRequest):
114
+ """
115
+ Main chat endpoint. Frontend sends full message list each time.
116
+ """
117
+ if not req.messages:
118
+ raise HTTPException(400, "No messages provided.")
119
 
120
+ # Build prompt from conversation
121
+ prompt = build_prompt(req.messages, req.persona)
 
 
 
 
122
 
123
+ # Call model
124
+ reply_text = call_llm(prompt, req.temperature, req.max_tokens)
125
 
126
+ # Append assistant reply to conversation
127
+ new_messages = req.messages + [Message(role="assistant", content=reply_text)]
 
 
 
 
128
 
129
+ return ChatResponse(reply=reply_text, messages=new_messages)
 
 
 
 
 
 
requirements.txt CHANGED
@@ -3,3 +3,4 @@ uvicorn[standard]
3
  jinja2
4
  requests
5
  python-dotenv
 
 
3
  jinja2
4
  requests
5
  python-dotenv
6
+ huggingface_hub