#!/usr/bin/env python3
"""
Example usage of MonkeyOCR Vision GGUF model
"""

from llama_cpp import Llama
import base64
from PIL import Image
import io


def encode_image_to_base64(image_path):
    """Convert image to base64 string"""
    with Image.open(image_path) as img:
        # Convert to RGB if necessary
        if img.mode != 'RGB':
            img = img.convert('RGB')

        # Save to bytes
        buffer = io.BytesIO()
        img.save(buffer, format='JPEG')
        img_bytes = buffer.getvalue()

        # Encode to base64
        return base64.b64encode(img_bytes).decode('utf-8')


def main():
    # Initialize the model
    print("Loading MonkeyOCR Vision model...")
    llm = Llama(
        model_path="MonkeyOCR-pro-1.2B-Recognition.gguf",
        chat_format="qwen2vl",  # Important: specify vision chat format
        n_ctx=2048,
        verbose=False
    )
    print("Model loaded successfully!")

    # Example 1: Text-only query
    print("\n=== Text-only Example ===")
    response = llm.create_chat_completion(
        messages=[{
            "role": "user",
            "content": "Hello! What can you help me with?"
        }]
    )
    print("Response:", response['choices'][0]['message']['content'])

    # Example 2: Vision + text query (uncomment if you have an image)
    """
    print("\n=== Vision Example ===")
    # Replace with your image path
    image_path = "your_image.jpg"

    response = llm.create_chat_completion(
        messages=[{
            "role": "user",
            "content": [
                {"type": "text", "text": "What text do you see in this image? Please extract all visible text."},
                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encode_image_to_base64(image_path)}"}}
            ]
        }]
    )
    print("OCR Response:", response['choices'][0]['message']['content'])
    """


if __name__ == "__main__":
    main()
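
# ---------------------------------------------------------------------------
# Note (a hedged sketch, not part of the original example): in many
# llama-cpp-python versions, image input for GGUF vision models requires
# loading the model's multimodal projector (mmproj) through a chat handler,
# not just setting `chat_format`. Llava15ChatHandler is the handler documented
# by llama-cpp-python for LLaVA-style models; whether it applies to this
# Qwen2-VL-based model, and the mmproj filename below, are assumptions --
# check which handlers your installed llama_cpp.llama_chat_format provides:
#
#     from llama_cpp.llama_chat_format import Llava15ChatHandler
#
#     chat_handler = Llava15ChatHandler(
#         clip_model_path="mmproj-MonkeyOCR.gguf"  # assumed filename
#     )
#     llm = Llama(
#         model_path="MonkeyOCR-pro-1.2B-Recognition.gguf",
#         chat_handler=chat_handler,
#         n_ctx=2048,
#         verbose=False,
#     )
#
# With a handler in place, the commented-out vision example above should work
# unchanged, since it already sends the image as a base64 data URL.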