Spaces:

MCP-1st-Birthday
/

MedLLM-Agent

Running on Zero

App Files Files Community

Y Phung Nguyen committed 20 days ago

Commit

acc39fd

1 Parent(s): 020a4b5

Fix GPU quota err

Browse files

Files changed (1) hide show

ui.py +39 -12

ui.py CHANGED Viewed

@@ -461,12 +461,21 @@ def create_demo():
                     except Exception as e:
                         error_msg = str(e)
-                        # Check if it's a ZeroGPU quota/rate limit error - re-raise for retry
-                        if ("429" in error_msg or "Too Many Requests" in error_msg or
                             "quota" in error_msg.lower() or "ZeroGPU" in error_msg or
-                            "runnning out" in error_msg.lower() or "running out" in error_msg.lower()):
                             logger.warning(f"[STARTUP] ZeroGPU quota/rate limit error detected: {error_msg[:100]}")
-                            raise  # Re-raise to trigger retry logic in wrapper
                         logger.error(f"[STARTUP] ❌ Error in model loading startup: {e}")
                         import traceback
                         logger.debug(f"[STARTUP] Full traceback: {traceback.format_exc()}")
@@ -579,6 +588,20 @@ def create_demo():
                                 logger.info(f"[STARTUP] ✅ Model loaded successfully on attempt {attempt}")
                                 return status_text, gr.update(interactive=is_ready), gr.update(interactive=is_ready)
                             else:
                                 # Model didn't load, but no exception - might be a state issue
                                 logger.warning(f"[STARTUP] Model not ready after attempt {attempt}, but no error")
                                 if attempt < max_retries:
@@ -587,11 +610,13 @@ def create_demo():
                                     time.sleep(delay)
                                     continue
                                 else:
-                                    return status_text, gr.update(interactive=False), gr.update(interactive=False)
                         except Exception as e:
                             error_msg = str(e)
                             is_quota_error = ("429" in error_msg or "Too Many Requests" in error_msg or
-                                            "quota" in error_msg.lower() or "ZeroGPU" in error_msg)
                             if is_quota_error and attempt < max_retries:
                                 delay = base_delay * attempt  # Linear backoff (base_delay * attempt): 5s, 10s, 15s
@@ -605,16 +630,18 @@ def create_demo():
                                 logger.debug(f"[STARTUP] Full traceback: {traceback.format_exc()}")
                                 if is_quota_error:
-                                    error_display = "⚠️ ZeroGPU quota/rate limit reached. Please wait or try again later."
                                 else:
                                     error_display = f"⚠️ Startup error: {str(e)[:100]}"
-                                if attempt >= max_retries:
-                                    logger.error(f"[STARTUP] Failed after {max_retries} attempts")
-                                    return error_display, gr.update(interactive=False), gr.update(interactive=False)
                     # Should not reach here, but just in case
-                    return "⚠️ Startup failed after retries", gr.update(interactive=False), gr.update(interactive=False)
                 demo.load(
                     fn=load_startup_and_update_ui,

                     except Exception as e:
                         error_msg = str(e)
+                        # Check if it's a ZeroGPU quota/rate limit error
+                        is_quota_error = ("429" in error_msg or "Too Many Requests" in error_msg or
                             "quota" in error_msg.lower() or "ZeroGPU" in error_msg or
+                            "runnning out" in error_msg.lower() or "running out" in error_msg.lower())
+                        if is_quota_error:
                             logger.warning(f"[STARTUP] ZeroGPU quota/rate limit error detected: {error_msg[:100]}")
+                            # Return status message indicating quota error (will be handled by retry logic)
+                            status_messages.append("⚠️ ZeroGPU quota error - will retry")
+                            status_text = "\n".join(status_messages)
+                            # Also add ASR status
+                            if WHISPER_AVAILABLE:
+                                status_text += "\n⏳ ASR (Whisper): will load on first use"
+                            return status_text  # Return status instead of raising, let wrapper handle retry
                         logger.error(f"[STARTUP] ❌ Error in model loading startup: {e}")
                         import traceback
                         logger.debug(f"[STARTUP] Full traceback: {traceback.format_exc()}")
                                 logger.info(f"[STARTUP] ✅ Model loaded successfully on attempt {attempt}")
                                 return status_text, gr.update(interactive=is_ready), gr.update(interactive=is_ready)
                             else:
+                                # Check if status text indicates quota error
+                                if status_text and ("quota" in status_text.lower() or "ZeroGPU" in status_text or
+                                                   "429" in status_text or "runnning out" in status_text.lower() or
+                                                   "running out" in status_text.lower()):
+                                    if attempt < max_retries:
+                                        delay = base_delay * attempt
+                                        logger.warning(f"[STARTUP] Quota error detected in status, retrying in {delay} seconds...")
+                                        time.sleep(delay)
+                                        continue
+                                    else:
+                                        # Quota exhausted after retries - allow user to proceed, model will load on-demand
+                                        status_msg = "⚠️ ZeroGPU quota exhausted.\n⏳ Model will load automatically when you send a message.\n💡 You can also select a model from the dropdown."
+                                        logger.info("[STARTUP] Quota exhausted after retries - allowing user to proceed with on-demand loading")
+                                        return status_msg, gr.update(interactive=True), gr.update(interactive=True)
                                 # Model didn't load, but no exception - might be a state issue
                                 logger.warning(f"[STARTUP] Model not ready after attempt {attempt}, but no error")
                                 if attempt < max_retries:
                                     time.sleep(delay)
                                     continue
                                 else:
+                                    # Even if model didn't load, allow user to try selecting another model
+                                    return status_text + "\n⚠️ Model not loaded. Please select a model from dropdown.", gr.update(interactive=True), gr.update(interactive=True)
                         except Exception as e:
                             error_msg = str(e)
                             is_quota_error = ("429" in error_msg or "Too Many Requests" in error_msg or
+                                            "quota" in error_msg.lower() or "ZeroGPU" in error_msg or
+                                            "runnning out" in error_msg.lower() or "running out" in error_msg.lower())
                             if is_quota_error and attempt < max_retries:
                                 delay = base_delay * attempt  # Linear backoff (base_delay * attempt): 5s, 10s, 15s
                                 logger.debug(f"[STARTUP] Full traceback: {traceback.format_exc()}")
                                 if is_quota_error:
+                                    # If quota exhausted, allow user to proceed - model will load on-demand
+                                    error_display = "⚠️ ZeroGPU quota exhausted.\n⏳ Model will load automatically when you send a message.\n💡 You can also select a model from the dropdown."
+                                    logger.info("[STARTUP] Quota exhausted - allowing user to proceed with on-demand loading")
+                                    return error_display, gr.update(interactive=True), gr.update(interactive=True)
                                 else:
                                     error_display = f"⚠️ Startup error: {str(e)[:100]}"
+                                    if attempt >= max_retries:
+                                        logger.error(f"[STARTUP] Failed after {max_retries} attempts")
+                                        return error_display, gr.update(interactive=False), gr.update(interactive=False)
                     # Should not reach here, but just in case
+                    return "⚠️ Startup failed after retries. Please select a model from dropdown.", gr.update(interactive=True), gr.update(interactive=True)
                 demo.load(
                     fn=load_startup_and_update_ui,