"""FastAPI service that extracts item names and prices from an uploaded image.

The uploaded image is downscaled, re-encoded as a base64 data URL and sent,
together with ``PROMPT``, to a chat-completions endpoint. The first JSON array
of objects found in the model's reply is returned to the caller.
"""

from io import BytesIO
import base64
import json
import os
import re

import requests
from fastapi import FastAPI, UploadFile, File
from fastapi.concurrency import run_in_threadpool
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from PIL import Image

app = FastAPI(title="GLM-4.1V-9B-Thinking")

# Enable CORS for frontend interaction (Gradio/Spaces UI)
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

API_URL = "https://router.huggingface.co/v1/chat/completions"

# NOTE: the token is read at import time; a missing `access_token` environment
# variable raises KeyError when the module is loaded.
HEADERS = {
    "Authorization": f"Bearer {os.environ['access_token']}",
    "Content-Type": "application/json",
}

# Model requested from the endpoint. Other model ids that were previously
# listed here as alternatives:
#   meta-llama/Llama-3.2-11B-Vision-Instruct:together
#   meta-llama/Llama-4-Scout-17B-16E-Instruct:novita
#   llama3.2-vision:11b
MODEL_ID = "zai-org/GLM-4.1V-9B-Thinking:novita"

# (connect, read) timeout in seconds for the upstream call, so that a stalled
# request cannot hang the endpoint forever.
REQUEST_TIMEOUT = (10, 180)

# Instruction text sent verbatim to the model alongside the image.
PROMPT = (
    " You are an AI assistant. Extract item names and their prices from the following image. "
    "Your task is to extract item names and their corresponding prices from the image provided. "
    "Return ONLY a clean JSON array in this format: "
    '[ {"item": "", "price": ""}, ... ] '
    "⚠️ Guidelines: "
    "- Do not include any explanation or text before/after the JSON. "
    "- Include only entries that have both item and price. "
    "- Preserve original spellings and formatting from the image. "
    "- If prices are written in ₹, Rs., or INR, keep the symbol as is. "
    "- Handle both packaged labels (like chips or snacks) and printed/handwritten menus. "
    "- If there are duplicates or unclear text, skip them. "
    "Only return the final JSON output, No explanation. "
    "Make sure each entry has both item and price, and preserve the original spelling. \n"
)

# A bracketed span that opens with an object and closes right after one.
_JSON_ARRAY_RE = re.compile(r"\[\s*{.*?}\s*\]", re.DOTALL)
# Every position where an array of objects could begin.
_ARRAY_START_RE = re.compile(r"\[\s*{")


def resize_image(image: Image.Image, max_size=(1024, 1024)) -> Image.Image:
    """Shrink *image* in place so it fits within *max_size* and return it.

    ``Image.thumbnail`` modifies the image it is called on, so the returned
    object is the same one that was passed in.
    """
    image.thumbnail(max_size)
    return image


async def encode_image_to_data_url(file: UploadFile = File(...)) -> str:
    """Read an uploaded image and return it as a base64 ``data:`` URL.

    The image is resized with ``resize_image`` and re-encoded in its original
    format with ``quality=80``. The MIME type in the URL is taken from the
    decoded image format, falling back to the content type declared by the
    client when the format has no registered MIME type.

    Raises whatever Pillow raises if the upload cannot be opened or saved as
    an image.
    """
    image = Image.open(BytesIO(await file.read()))
    # Captured before resizing so the re-encode uses the format that was
    # actually detected from the uploaded bytes.
    image_format = image.format

    # Preprocessing
    image = resize_image(image)

    # Compress and convert to bytes
    buffered = BytesIO()
    image.save(buffered, quality=80, format=image_format)

    # Encode to base64
    base64_image = base64.b64encode(buffered.getvalue()).decode("utf-8")

    # Prefer the MIME type of the bytes we produced: the client-declared
    # content type may be missing or may not match the real image format.
    mime_type = Image.MIME.get(image_format) or file.content_type
    return f"data:{mime_type};base64,{base64_image}"


def _reply_text(result) -> str:
    """Return the assistant message text from a chat-completions response.

    Raises ValueError with the upstream error detail when *result* does not
    have the expected ``choices[0].message.content`` string.
    """
    try:
        reply = result["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError):
        detail = result.get("error", result) if isinstance(result, dict) else result
        raise ValueError(
            f"Unexpected response from model endpoint: {detail}"
        ) from None
    if not isinstance(reply, str):
        raise ValueError("Model response did not contain any text content")
    return reply


def _parse_menu_items(reply: str):
    """Locate and decode the JSON array of objects embedded in *reply*.

    Returns a ``(items, raw)`` pair. ``raw`` is the first array-looking span
    in the text, or ``None`` when there is none. ``items`` is the decoded
    list, or ``None`` when no candidate could be decoded.
    """
    first = _JSON_ARRAY_RE.search(reply)
    if first is None:
        return None, None

    # Try every array opening rather than only the first regex hit: the reply
    # may echo a non-JSON example (e.g. "[ {...}, ... ]") before the real
    # answer, and the decoder finds the true end of each array by itself.
    decoder = json.JSONDecoder()
    for opening in _ARRAY_START_RE.finditer(reply):
        try:
            value, _ = decoder.raw_decode(reply[opening.start():])
        except json.JSONDecodeError:
            continue
        if isinstance(value, list):
            return value, first.group(0)
    return None, first.group(0)


@app.get("/")
def root():
    """Health-check endpoint reporting that the service is up."""
    return {"message": "GLM 4.1V API for menu extraction is running."}


@app.post("/extract/")
async def extract(file: UploadFile = File(...)):
    """Extract menu items and prices from an uploaded image.

    Responses:
        200: ``{"menu_items": [...]}`` with the array decoded from the reply.
        400: ``{"error": ...}`` if the image could not be processed or the
            upstream request failed or returned an unexpected payload.
        404: ``{"error": ..., "model_response": ...}`` if the reply contains
            no JSON array of objects.
        500: ``{"error": ..., "raw": ...}`` if an array-looking span was found
            but nothing could be decoded as JSON.
    """
    try:
        # Convert uploaded image to base64 URL format
        image_data_url = await encode_image_to_data_url(file)

        # Create chat-style payload
        payload = {
            "model": MODEL_ID,
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": PROMPT},
                        {"type": "image_url", "image_url": {"url": image_data_url}},
                    ],
                }
            ],
        }

        # `requests` is synchronous; run it in the threadpool so the event
        # loop keeps serving other requests while the model is responding.
        response = await run_in_threadpool(
            requests.post,
            API_URL,
            headers=HEADERS,
            json=payload,
            timeout=REQUEST_TIMEOUT,
        )
        result = response.json()
        print("result :", result)

        reply = _reply_text(result)
    except Exception as e:
        # Request boundary: report any failure to the client instead of
        # letting it surface as an unhandled server error.
        return JSONResponse(content={"error": str(e)}, status_code=400)

    items, raw = _parse_menu_items(reply)
    if raw is None:
        return JSONResponse(
            status_code=404,
            content={
                "error": "No JSON array found in response",
                "model_response": reply,
            },
        )
    if items is None:
        return JSONResponse(
            status_code=500,
            content={"error": "Failed to parse JSON", "raw": raw},
        )
    return JSONResponse(content={"menu_items": items})