Spaces:

cpg716
/

staffmanager-llama4-scout

Runtime error

App Files Files Community

cpg716 committed on Apr 9

Commit

e85540a

verified ·

1 Parent(s): 01596b5

Update app.py

Browse files

Files changed (1) hide show

app.py +116 -106

app.py CHANGED Viewed

@@ -1,6 +1,5 @@
 import gradio as gr
 import torch
-from transformers import pipeline
 from PIL import Image
 import io
 import json
@@ -11,6 +10,7 @@ import base64
 from huggingface_hub import login
 import traceback
 import sys
 # Print Python and library versions for debugging
 print(f"Python version: {sys.version}")
@@ -33,15 +33,27 @@ except Exception as e:
     print(f"Error logging in: {e}")
 # Global variables
-llama_pipeline = None
-# Initialize Llama 4 Scout pipeline
-def load_llama_pipeline():
-    global llama_pipeline
-    if llama_pipeline is None:
         try:
-            print("Loading Llama 4 Scout pipeline...")
             # Use 4-bit quantization to reduce memory usage
             from transformers import BitsAndBytesConfig
@@ -52,83 +64,43 @@ def load_llama_pipeline():
                 bnb_4bit_quant_type="nf4"
             )
-            # Try different pipeline types for Llama 4 Scout
-            pipeline_types = [
-                "image-to-text",
-                "image-text-to-text",
-                "visual-question-answering"
-            ]
-            for pipeline_type in pipeline_types:
-                try:
-                    print(f"Trying pipeline type: {pipeline_type}")
-                    llama_pipeline = pipeline(
-                        pipeline_type,
-                        model="meta-llama/Llama-4-Scout-17B-16E-Instruct",
-                        device_map="auto",
-                        model_kwargs={"quantization_config": quantization_config},
-                        token=token
-                    )
-                    print(f"Successfully loaded Llama 4 Scout with pipeline type: {pipeline_type}")
-                    break
-                except Exception as pipeline_error:
-                    print(f"Failed to load with pipeline type {pipeline_type}: {pipeline_error}")
-            if llama_pipeline is None:
-                # If all pipeline types fail, try loading with AutoModel classes
-                print("Trying to load with AutoModel classes...")
-                from transformers import AutoProcessor, AutoModelForVision2Seq
-                processor = AutoProcessor.from_pretrained(
-                    "meta-llama/Llama-4-Scout-17B-16E-Instruct",
-                    token=token
-                )
-                model = AutoModelForVision2Seq.from_pretrained(
-                    "meta-llama/Llama-4-Scout-17B-16E-Instruct",
-                    token=token,
-                    quantization_config=quantization_config,
-                    device_map="auto"
-                )
-                # Create a custom pipeline function
-                def custom_pipeline(image, prompt, max_new_tokens=300):
-                    inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device)
-                    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
-                    return processor.decode(outputs[0], skip_special_tokens=True)
-                llama_pipeline = custom_pipeline
-                print("Successfully created custom Llama 4 Scout pipeline")
-            # If still None, fall back to LLaVA
-            if llama_pipeline is None:
-                print("All Llama 4 Scout loading attempts failed, falling back to LLaVA...")
-                llama_pipeline = pipeline(
-                    "image-to-text",
-                    model="llava-hf/llava-1.5-7b-hf",
-                    device_map="auto",
-                    model_kwargs={"quantization_config": quantization_config}
-                )
-                print("LLaVA pipeline loaded as fallback")
         except Exception as e:
-            print(f"Error loading pipeline: {e}")
             print(traceback.format_exc())
-            # Final fallback to LLaVA if everything else fails
             try:
-                print("Falling back to LLaVA after error...")
-                llama_pipeline = pipeline(
-                    "image-to-text",
-                    model="llava-hf/llava-1.5-7b-hf",
                     device_map="auto"
                 )
-                print("LLaVA pipeline loaded as fallback after error")
             except Exception as fallback_error:
                 print(f"Even fallback failed: {fallback_error}")
                 raise
-    return llama_pipeline
 # Simple caching mechanism
 cache = {}
@@ -160,38 +132,57 @@ def verify_document(img, doc_type, verification_info):
         return f"[CACHED] {cache[cache_key]}"
     try:
-        # Load pipeline
-        pipeline = load_llama_pipeline()
         # Create prompt
         prompt = f"""This is a {doc_type} document.
 Verify if it's authentic and extract the following information: {verification_info}
 Provide your analysis in a structured format."""
-        # Process with pipeline (with timeout)
         start_time = time.time()
         print(f"Starting document verification at {start_time}")
-        # Handle different pipeline types
-        if callable(pipeline) and not hasattr(pipeline, 'task'):  # Custom pipeline
-            result_text = pipeline(image=img, prompt=prompt, max_new_tokens=300)
-        elif hasattr(pipeline, 'task') and pipeline.task == "visual-question-answering":
-            result = pipeline(image=img, question=prompt, max_new_tokens=300)
-            result_text = result[0]["answer"] if isinstance(result, list) else result["answer"]
-        else:  # Standard pipeline
-            result = pipeline(image=img, prompt=prompt, max_new_tokens=300)
-            if isinstance(result, list):
-                result_text = result[0].get('generated_text', str(result))
-            else:
-                result_text = str(result)
         end_time = time.time()
         print(f"Completed document verification in {end_time - start_time:.2f} seconds")
         # Save to cache
-        cache[cache_key] = result_text
-        return result_text
     except Exception as e:
         error_details = traceback.format_exc()
         print(f"Error in verify_document: {e}")
@@ -212,8 +203,8 @@ def check_workplace(img, industry):
         return f"[CACHED] {cache[cache_key]}"
     try:
-        # Load pipeline
-        pipeline = load_llama_pipeline()
         # Create prompt
         prompt = f"""This is a workplace in the {industry} industry.
@@ -230,30 +221,49 @@ Format your response as a detailed assessment with:
 - Severity level for each issue
 - Recommendations for correction"""
-        # Process with pipeline (with timeout)
         start_time = time.time()
         print(f"Starting workplace compliance check at {start_time}")
-        # Handle different pipeline types
-        if callable(pipeline) and not hasattr(pipeline, 'task'):  # Custom pipeline
-            result_text = pipeline(image=img, prompt=prompt, max_new_tokens=300)
-        elif hasattr(pipeline, 'task') and pipeline.task == "visual-question-answering":
-            result = pipeline(image=img, question=prompt, max_new_tokens=300)
-            result_text = result[0]["answer"] if isinstance(result, list) else result["answer"]
-        else:  # Standard pipeline
-            result = pipeline(image=img, prompt=prompt, max_new_tokens=300)
-            if isinstance(result, list):
-                result_text = result[0].get('generated_text', str(result))
-            else:
-                result_text = str(result)
         end_time = time.time()
         print(f"Completed workplace compliance check in {end_time - start_time:.2f} seconds")
         # Save to cache
-        cache[cache_key] = result_text
-        return result_text
     except Exception as e:
         error_details = traceback.format_exc()
         print(f"Error in check_workplace: {e}")

 import gradio as gr
 import torch
 from PIL import Image
 import io
 import json
 from huggingface_hub import login
 import traceback
 import sys
+import requests
 # Print Python and library versions for debugging
 print(f"Python version: {sys.version}")
     print(f"Error logging in: {e}")
 # Global variables
+model = None
+processor = None
+# Initialize Llama 4 Scout model
+def load_llama4_model():
+    global model, processor
+    if model is None or processor is None:
         try:
+            print("Loading Llama 4 Scout model...")
+            # Import the correct classes for Llama 4
+            from transformers import AutoProcessor, Llama4ForConditionalGeneration
+            model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
+            # Load processor and model
+            processor = AutoProcessor.from_pretrained(
+                model_id,
+                token=token
+            )
             # Use 4-bit quantization to reduce memory usage
             from transformers import BitsAndBytesConfig
                 bnb_4bit_quant_type="nf4"
             )
+            model = Llama4ForConditionalGeneration.from_pretrained(
+                model_id,
+                token=token,
+                device_map="auto",
+                torch_dtype=torch.bfloat16,
+                quantization_config=quantization_config
+            )
+            print("Llama 4 Scout model loaded successfully!")
         except Exception as e:
+            print(f"Error loading Llama 4 Scout model: {e}")
             print(traceback.format_exc())
+            # Fall back to LLaVA if Llama 4 fails
             try:
+                print("Falling back to LLaVA...")
+                from transformers import AutoProcessor, AutoModelForVision2Seq
+                processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")
+                model = AutoModelForVision2Seq.from_pretrained(
+                    "llava-hf/llava-1.5-7b-hf",
                     device_map="auto"
                 )
+                print("LLaVA model loaded as fallback")
             except Exception as fallback_error:
                 print(f"Even fallback failed: {fallback_error}")
                 raise
+    return model, processor
+# Function to convert PIL Image to base64
+def image_to_base64(img):
+    buffered = io.BytesIO()
+    img.save(buffered, format="PNG")
+    img_str = base64.b64encode(buffered.getvalue()).decode()
+    return f"data:image/png;base64,{img_str}"
 # Simple caching mechanism
 cache = {}
         return f"[CACHED] {cache[cache_key]}"
     try:
+        # Load model and processor
+        model, processor = load_llama4_model()
         # Create prompt
         prompt = f"""This is a {doc_type} document.
 Verify if it's authentic and extract the following information: {verification_info}
 Provide your analysis in a structured format."""
+        # Process with model
         start_time = time.time()
         print(f"Starting document verification at {start_time}")
+        # Convert image to base64 URL
+        img_url = image_to_base64(img)
+        # Create messages format
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image", "url": img_url},
+                    {"type": "text", "text": prompt},
+                ]
+            },
+        ]
+        # Process input using the chat template
+        inputs = processor.apply_chat_template(
+            messages,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            return_tensors="pt",
+        ).to(model.device)
+        # Generate output
+        outputs = model.generate(
+            **inputs,
+            max_new_tokens=300,
+        )
+        # Decode output
+        result = processor.batch_decode(outputs[:, inputs["input_ids"].shape[-1]:])[0]
         end_time = time.time()
         print(f"Completed document verification in {end_time - start_time:.2f} seconds")
         # Save to cache
+        cache[cache_key] = result
+        return result
     except Exception as e:
         error_details = traceback.format_exc()
         print(f"Error in verify_document: {e}")
         return f"[CACHED] {cache[cache_key]}"
     try:
+        # Load model and processor
+        model, processor = load_llama4_model()
         # Create prompt
         prompt = f"""This is a workplace in the {industry} industry.
 - Severity level for each issue
 - Recommendations for correction"""
+        # Process with model
         start_time = time.time()
         print(f"Starting workplace compliance check at {start_time}")
+        # Convert image to base64 URL
+        img_url = image_to_base64(img)
+        # Create messages format
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image", "url": img_url},
+                    {"type": "text", "text": prompt},
+                ]
+            },
+        ]
+        # Process input using the chat template
+        inputs = processor.apply_chat_template(
+            messages,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            return_tensors="pt",
+        ).to(model.device)
+        # Generate output
+        outputs = model.generate(
+            **inputs,
+            max_new_tokens=300,
+        )
+        # Decode output
+        result = processor.batch_decode(outputs[:, inputs["input_ids"].shape[-1]:])[0]
         end_time = time.time()
         print(f"Completed workplace compliance check in {end_time - start_time:.2f} seconds")
         # Save to cache
+        cache[cache_key] = result
+        return result
     except Exception as e:
         error_details = traceback.format_exc()
         print(f"Error in check_workplace: {e}")