Y Phung Nguyen committed
Commit 03d8100 · 1 Parent(s): e52570b

Update model efficiency and GPU task assignment


ZeroGPU tagging: Each MedSwin task runs under the @spaces.GPU(max_duration=120) decorator
No batching: Tasks execute individually (respects token limits)
Retry logic: Automatic retry with exponential backoff for GPU errors
Sequential delays: Short delays between sequential GPU requests to prevent ZeroGPU scheduler conflicts
Model status tracking: Real-time model status updates in the UI
UI protection: Submission is blocked while a model is loading
Auto-loading: Models load automatically when selected

Files changed (5)
  1. config.py +9 -0
  2. models.py +49 -12
  3. pipeline.py +6 -0
  4. supervisor.py +43 -3
  5. ui.py +97 -2
config.py CHANGED

```diff
@@ -131,6 +131,15 @@ CSS = """
     background: #f3e5f5;
     color: #7b1fa2;
 }
+.model-status {
+    margin-top: 5px;
+    padding: 8px;
+    border-radius: 5px;
+    font-size: 13px;
+    font-weight: 500;
+    background-color: #f5f5f5;
+    border: 1px solid #e0e0e0;
+}
 @media (min-width: 768px) {
     .main-container {
         display: flex;
```
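Side note: the new `.model-status` rule pairs with the `elem_classes="model-status"` component added in ui.py below. A minimal, self-contained sketch of how Gradio connects the two (illustrative only; the CSS here is abbreviated from the rule above):

```python
import gradio as gr

# Gradio injects the css string into the page and matches elem_classes
# against it, so the Textbox below renders with the .model-status styling.
css = """
.model-status {
    padding: 8px;
    border-radius: 5px;
    background-color: #f5f5f5;
    border: 1px solid #e0e0e0;
}
"""

with gr.Blocks(css=css) as demo:
    status = gr.Textbox(
        value="Checking model status...",
        label="Model Status",
        interactive=False,
        elem_classes="model-status",  # ties this component to the CSS rule
    )

if __name__ == "__main__":
    demo.launch()
```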
models.py CHANGED

```diff
@@ -1,5 +1,6 @@
 """Model initialization and management"""
 import torch
+import threading
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from llama_index.llms.huggingface import HuggingFaceLLM
 from llama_index.embeddings.huggingface import HuggingFaceEmbedding
@@ -13,23 +14,59 @@ except ImportError:
     TTS_AVAILABLE = False
     TTS = None

+# Model loading state tracking
+_model_loading_states = {}
+_model_loading_lock = threading.Lock()
+
+
+def set_model_loading_state(model_name: str, state: str):
+    """Set model loading state: 'loading', 'loaded', 'error'"""
+    with _model_loading_lock:
+        _model_loading_states[model_name] = state
+        logger.debug(f"Model {model_name} state set to: {state}")
+
+
+def get_model_loading_state(model_name: str) -> str:
+    """Get model loading state: 'loading', 'loaded', 'error', or 'unknown'"""
+    with _model_loading_lock:
+        return _model_loading_states.get(model_name, "unknown")
+
+
+def is_model_loaded(model_name: str) -> bool:
+    """Check if model is loaded and ready"""
+    with _model_loading_lock:
+        return (model_name in config.global_medical_models and
+                config.global_medical_models[model_name] is not None and
+                _model_loading_states.get(model_name) == "loaded")
+

 def initialize_medical_model(model_name: str):
     """Initialize medical model (MedSwin) - download on demand"""
     if model_name not in config.global_medical_models or config.global_medical_models[model_name] is None:
+        set_model_loading_state(model_name, "loading")
         logger.info(f"Initializing medical model: {model_name}...")
-        model_path = config.MEDSWIN_MODELS[model_name]
-        tokenizer = AutoTokenizer.from_pretrained(model_path, token=config.HF_TOKEN)
-        model = AutoModelForCausalLM.from_pretrained(
-            model_path,
-            device_map="auto",
-            trust_remote_code=True,
-            token=config.HF_TOKEN,
-            torch_dtype=torch.float16
-        )
-        config.global_medical_models[model_name] = model
-        config.global_medical_tokenizers[model_name] = tokenizer
-        logger.info(f"Medical model {model_name} initialized successfully")
+        try:
+            model_path = config.MEDSWIN_MODELS[model_name]
+            tokenizer = AutoTokenizer.from_pretrained(model_path, token=config.HF_TOKEN)
+            model = AutoModelForCausalLM.from_pretrained(
+                model_path,
+                device_map="auto",
+                trust_remote_code=True,
+                token=config.HF_TOKEN,
+                torch_dtype=torch.float16
+            )
+            config.global_medical_models[model_name] = model
+            config.global_medical_tokenizers[model_name] = tokenizer
+            set_model_loading_state(model_name, "loaded")
+            logger.info(f"Medical model {model_name} initialized successfully")
+        except Exception as e:
+            set_model_loading_state(model_name, "error")
+            logger.error(f"Failed to initialize medical model {model_name}: {e}")
+            raise
+    else:
+        # Model already loaded, ensure state is set
+        if get_model_loading_state(model_name) != "loaded":
+            set_model_loading_state(model_name, "loaded")
     return config.global_medical_models[model_name], config.global_medical_tokenizers[model_name]
```
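The lock is there because Gradio event handlers and the on-demand loader may touch the state dict from different threads. A self-contained sketch of the same pattern, with the model and config specifics stripped out:

```python
import threading

_states = {}
_lock = threading.Lock()

def set_state(name: str, state: str) -> None:
    # Serialize writes so concurrent callbacks never interleave updates
    with _lock:
        _states[name] = state

def get_state(name: str) -> str:
    with _lock:
        return _states.get(name, "unknown")

# Two threads racing on the same key still leave a consistent value:
# the winner is one of the written states, never a torn update.
writers = [
    threading.Thread(target=set_state, args=("medswin-ta", s))
    for s in ("loading", "loaded")
]
for t in writers:
    t.start()
for t in writers:
    t.join()
print(get_state("medswin-ta"))  # "loading" or "loaded", depending on scheduling
```

Single dict reads and writes are already atomic under CPython's GIL; the lock earns its keep in compound check-then-act operations like `is_model_loaded`, which reads the model registry and the state dict as one unit.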
pipeline.py CHANGED

```diff
@@ -571,6 +571,12 @@ def stream_chat(
         if len(rag_contexts) > 1 and idx <= len(rag_contexts):
             task_context = rag_contexts[idx - 1] if idx <= len(rag_contexts) else combined_context

+        # Add small delay between GPU requests to prevent ZeroGPU scheduler conflicts
+        if idx > 1:
+            delay = 0.5  # 500ms delay between sequential GPU requests
+            logger.debug(f"[MEDSWIN] Waiting {delay}s before next GPU request to avoid scheduler conflicts...")
+            time.sleep(delay)
+
         try:
             task_answer = execute_medswin_task(
                 medical_model_obj=medical_model_obj,
```
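One thing to verify when applying this hunk: it calls `time.sleep` without adding an import, so pipeline.py presumably already imports `time` at module level. In isolation, the pacing pattern looks like this (a runnable sketch; the callables stand in for `execute_medswin_task` calls):

```python
import time

def run_sequentially(tasks, delay: float = 0.5):
    """Run GPU-bound callables one at a time, sleeping between requests.

    Mirrors the change above: no pause before the first task, a short pause
    before each later one so the ZeroGPU scheduler isn't hit back-to-back.
    """
    results = []
    for idx, task in enumerate(tasks, start=1):
        if idx > 1:
            time.sleep(delay)  # sequential delay between GPU requests
        results.append(task())
    return results

# Each "task" stands in for one execute_medswin_task call.
print(run_sequentially([lambda: "answer 1", lambda: "answer 2"]))
```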
supervisor.py CHANGED

```diff
@@ -559,8 +559,7 @@ def gemini_supervisor_rag_brainstorm(query: str, retrieved_docs: str, time_elaps
     }


-@spaces.GPU(max_duration=120)
-def execute_medswin_task(
+def _execute_medswin_core(
     medical_model_obj,
     medical_tokenizer,
     task_instruction: str,
@@ -572,7 +571,7 @@ def execute_medswin_task(
     top_k: int,
     penalty: float
 ) -> str:
-    """MedSwin Specialist: Execute a single task assigned by Gemini Supervisor"""
+    """Core MedSwin execution logic (without GPU decorator for retry logic)"""
     if context:
         full_prompt = f"{system_prompt_base}\n\nContext:\n{context}\n\nTask: {task_instruction}\n\nAnswer concisely with key bullet points (Markdown format, no tables):"
     else:
@@ -622,6 +621,47 @@ def execute_medswin_task(
     return response


+@spaces.GPU(max_duration=120)
+def execute_medswin_task(
+    medical_model_obj,
+    medical_tokenizer,
+    task_instruction: str,
+    context: str,
+    system_prompt_base: str,
+    temperature: float,
+    max_new_tokens: int,
+    top_p: float,
+    top_k: int,
+    penalty: float
+) -> str:
+    """
+    MedSwin Specialist: Execute a single task assigned by Gemini Supervisor (with ZeroGPU tag)
+    Includes retry logic with exponential backoff to handle GPU task aborted errors
+    """
+    import time
+    max_retries = 3
+    base_delay = 1.0  # Base delay in seconds
+
+    for attempt in range(max_retries):
+        try:
+            return _execute_medswin_core(
+                medical_model_obj, medical_tokenizer, task_instruction, context,
+                system_prompt_base, temperature, max_new_tokens, top_p, top_k, penalty
+            )
+        except Exception as e:
+            error_msg = str(e).lower()
+            is_gpu_error = 'gpu task aborted' in error_msg or 'gpu' in error_msg or 'zerogpu' in error_msg
+
+            if is_gpu_error and attempt < max_retries - 1:
+                delay = base_delay * (2 ** attempt)  # Exponential backoff: 1s, 2s, 4s
+                logger.warning(f"[MEDSWIN] GPU task aborted (attempt {attempt + 1}/{max_retries}), retrying after {delay}s...")
+                time.sleep(delay)
+                continue
+            else:
+                logger.error(f"[MEDSWIN] Task failed after {attempt + 1} attempts: {e}")
+                raise
+
+
 async def gemini_supervisor_synthesize_async(query: str, medswin_answers: list, rag_contexts: list, search_contexts: list, breakdown: dict) -> str:
     """Gemini Supervisor: Synthesize final answer from all MedSwin responses"""
     context_summary = ""
```
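The rename to `_execute_medswin_core` keeps the retry loop inside the single `@spaces.GPU` wrapper, so a retry re-invokes the undecorated core rather than queuing a brand-new ZeroGPU task. The same backoff policy, factored into a reusable decorator as a sketch (not part of this commit; the error-string match copies the diff above):

```python
import functools
import logging
import time

logger = logging.getLogger(__name__)

def retry_on_gpu_error(max_retries: int = 3, base_delay: float = 1.0):
    """Retry a callable on GPU-related errors with exponential backoff (1s, 2s, 4s)."""
    def decorator(fn):
        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            for attempt in range(max_retries):
                try:
                    return fn(*args, **kwargs)
                except Exception as e:
                    msg = str(e).lower()
                    gpu_error = ("gpu task aborted" in msg or "gpu" in msg
                                 or "zerogpu" in msg)
                    if gpu_error and attempt < max_retries - 1:
                        delay = base_delay * (2 ** attempt)
                        logger.warning("GPU error (attempt %d/%d), retrying in %.0fs",
                                       attempt + 1, max_retries, delay)
                        time.sleep(delay)
                    else:
                        raise
        return wrapper
    return decorator

@retry_on_gpu_error(max_retries=3, base_delay=1.0)
def run_core_task():
    # Stand-in for a call into _execute_medswin_core(...)
    return "ok"

print(run_core_task())
```

Note that the `'gpu' in error_msg` test is deliberately broad: any exception whose message merely mentions the GPU, such as a CUDA out-of-memory error, will also be retried with backoff.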
ui.py CHANGED

```diff
@@ -5,6 +5,7 @@ from config import TITLE, DESCRIPTION, CSS, MEDSWIN_MODELS, DEFAULT_MEDICAL_MODE
 from indexing import create_or_update_index
 from pipeline import stream_chat
 from voice import transcribe_audio, generate_speech
+from models import initialize_medical_model, is_model_loaded, get_model_loading_state, set_model_loading_state


 def create_demo():
@@ -176,6 +177,13 @@ def create_demo():
         label="Medical Model",
         info="MedSwin TA (default), others download on first use"
     )
+    model_status = gr.Textbox(
+        value="Checking model status...",
+        label="Model Status",
+        interactive=False,
+        visible=True,
+        elem_classes="model-status"
+    )

     system_prompt = gr.Textbox(
         value="As a medical specialist, provide detailed and accurate answers based on the provided medical documents and context. Ensure all information is clinically accurate and cite sources when available.",
@@ -250,8 +258,95 @@ def create_demo():
         outputs=[agentic_thoughts_box, show_thoughts_state]
     )

+    def load_model_and_update_status(model_name):
+        """Load model and update status, return status text and whether model is ready"""
+        try:
+            if is_model_loaded(model_name):
+                return "✅ The model has been loaded successfully", True
+
+            state = get_model_loading_state(model_name)
+            if state == "loading":
+                return "⏳ The model is being loaded, please wait...", False
+            elif state == "error":
+                return "❌ Error loading model. Please try again.", False
+
+            # Start loading
+            set_model_loading_state(model_name, "loading")
+            try:
+                initialize_medical_model(model_name)
+                return "✅ The model has been loaded successfully", True
+            except Exception as e:
+                set_model_loading_state(model_name, "error")
+                return f"❌ Error loading model: {str(e)[:100]}", False
+        except Exception as e:
+            return f"❌ Error: {str(e)[:100]}", False
+
+    def check_model_status(model_name):
+        """Check current model status without loading"""
+        if is_model_loaded(model_name):
+            return "✅ The model has been loaded successfully", True
+        state = get_model_loading_state(model_name)
+        if state == "loading":
+            return "⏳ The model is being loaded, please wait...", False
+        elif state == "error":
+            return "❌ Error loading model. Please try again.", False
+        else:
+            return "⚠️ Model not loaded. Click to load or it will load on first use.", False
+
+    # Initialize status on load
+    def init_model_status():
+        status_text, is_ready = check_model_status(DEFAULT_MEDICAL_MODEL)
+        return status_text
+
+    # Handle model selection change
+    def on_model_change(model_name):
+        status_text, is_ready = load_model_and_update_status(model_name)
+        submit_enabled = is_ready
+        return (
+            status_text,
+            gr.update(interactive=submit_enabled),
+            gr.update(interactive=submit_enabled)
+        )
+
+    medical_model.change(
+        fn=on_model_change,
+        inputs=[medical_model],
+        outputs=[model_status, submit_button, message_input]
+    )
+
+    # Initialize status
+    demo.load(
+        fn=init_model_status,
+        outputs=[model_status]
+    )
+
+    # Wrap stream_chat to check model status before execution
+    def stream_chat_with_model_check(
+        message, history, system_prompt, temperature, max_new_tokens,
+        top_p, top_k, penalty, retriever_k, merge_threshold,
+        use_rag, medical_model_name, use_web_search,
+        enable_clinical_intake, disable_agentic_reasoning, show_thoughts, request: gr.Request
+    ):
+        # Check if model is loaded
+        if not is_model_loaded(medical_model_name):
+            # Try to load it
+            status_text, is_ready = load_model_and_update_status(medical_model_name)
+            if not is_ready:
+                error_msg = "⚠️ Model is not ready. Please wait for the model to finish loading before sending messages."
+                yield history + [{"role": "assistant", "content": error_msg}], ""
+                return
+
+        # Model is ready, proceed with chat
+        for result in stream_chat(
+            message, history, system_prompt, temperature, max_new_tokens,
+            top_p, top_k, penalty, retriever_k, merge_threshold,
+            use_rag, medical_model_name, use_web_search,
+            enable_clinical_intake, disable_agentic_reasoning, show_thoughts, request
+        ):
+            yield result
+
     submit_button.click(
-        fn=stream_chat,
+        fn=stream_chat_with_model_check,
         inputs=[
             message_input,
             chatbot,
@@ -274,7 +369,7 @@ def create_demo():
     )

     message_input.submit(
-        fn=stream_chat,
+        fn=stream_chat_with_model_check,
         inputs=[
             message_input,
             chatbot,
```
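Because `stream_chat` is a generator, the wrapper must be a generator too or Gradio would lose streaming; the explicit `for result in ...: yield result` loop is equivalent to `yield from`. A minimal sketch of the guard-then-delegate shape (the stand-in functions are illustrative, not from the commit):

```python
def model_ready() -> bool:
    # Stand-in for is_model_loaded(...) plus the load attempt in the wrapper above
    return True

def stream_chat(message, history):
    # Stand-in for the real streaming generator in pipeline.py
    for text in ("partial...", "final answer"):
        yield history + [{"role": "assistant", "content": text}], ""

def stream_chat_guarded(message, history):
    """Guard, then delegate: yield one error update if the model isn't ready,
    otherwise pass the stream through unchanged."""
    if not model_ready():
        yield history + [{"role": "assistant", "content": "⚠️ Model is not ready."}], ""
        return
    yield from stream_chat(message, history)  # same effect as the for/yield loop

for update in stream_chat_guarded("hi", []):
    print(update)
```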