Y Phung Nguyen committed
Commit ab36fa0 · Parent(s): 8056774

Upd first time loader #3

Files changed (1): ui.py (+86 −40)
ui.py CHANGED
```diff
@@ -277,7 +277,7 @@ def create_demo():
             <p style="margin-top: 10px; margin-bottom: 0; font-size: 11px; color: #666;">Click any model name to view details on Hugging Face</p>
             </div>
             """
-        )
+        )
 
         show_thoughts_state = gr.State(value=False)
 
@@ -377,10 +377,10 @@ def create_demo():
         if is_model_loaded(model_name):
             status_lines.append(f"✅ MedSwin ({model_name}): loaded and ready")
         else:
-            state = get_model_loading_state(model_name)
-            if state == "loading":
+            state = get_model_loading_state(model_name)
+            if state == "loading":
                 status_lines.append(f"⏳ MedSwin ({model_name}): loading...")
-            elif state == "error":
+            elif state == "error":
                 status_lines.append(f"❌ MedSwin ({model_name}): error loading")
             else:
                 status_lines.append(f"⚠️ MedSwin ({model_name}): not loaded")
```
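Here `is_model_loaded` and `get_model_loading_state` are helpers defined elsewhere in the app, outside this diff. As a rough sketch, they could be backed by a small thread-safe registry like the following (the registry and `set_model_loading_state` are illustrative names, not taken from ui.py):

```python
# Hypothetical backing store for the status helpers referenced above;
# the real implementations live elsewhere in the repo and may differ.
import threading
from typing import Optional

_model_states = {}  # model_name -> "loading" | "loaded" | "error"
_state_lock = threading.Lock()

def set_model_loading_state(model_name: str, state: str) -> None:
    with _state_lock:
        _model_states[model_name] = state

def get_model_loading_state(model_name: str) -> Optional[str]:
    with _state_lock:
        return _model_states.get(model_name)

def is_model_loaded(model_name: str) -> bool:
    return get_model_loading_state(model_name) == "loaded"
```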
```diff
@@ -460,6 +460,13 @@ def create_demo():
                 return status_text
 
         except Exception as e:
+            error_msg = str(e)
+            # Check if it's a ZeroGPU quota/rate limit error - re-raise for retry
+            if ("429" in error_msg or "Too Many Requests" in error_msg or
+                    "quota" in error_msg.lower() or "ZeroGPU" in error_msg or
+                    "runnning out" in error_msg.lower() or "running out" in error_msg.lower()):
+                logger.warning(f"[STARTUP] ZeroGPU quota/rate limit error detected: {error_msg[:100]}")
+                raise  # Re-raise to trigger retry logic in wrapper
             logger.error(f"[STARTUP] ❌ Error in model loading startup: {e}")
             import traceback
             logger.debug(f"[STARTUP] Full traceback: {traceback.format_exc()}")
```
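This except branch re-raises only quota and rate-limit errors so that the retry wrapper added further down can catch them; everything else falls through to normal error logging. Note the check matches both "running out" and the misspelled "runnning out", presumably to cover the exact upstream message text. Factored into a standalone predicate, the heuristic looks like this (`is_zerogpu_quota_error` is an illustrative name, not a function from this commit):

```python
def is_zerogpu_quota_error(exc: Exception) -> bool:
    """Heuristic string match for ZeroGPU 429/quota errors (illustrative)."""
    msg = str(exc)
    return (
        "429" in msg
        or "Too Many Requests" in msg
        or "quota" in msg.lower()
        or "ZeroGPU" in msg
        or "running out" in msg.lower()
        or "runnning out" in msg.lower()  # both spellings, mirroring the commit
    )

print(is_zerogpu_quota_error(RuntimeError("429 Too Many Requests")))  # True
print(is_zerogpu_quota_error(ValueError("bad config")))               # False
```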
```diff
@@ -487,7 +494,7 @@ def create_demo():
             result = check_model_status(model_name)
             if result and isinstance(result, tuple) and len(result) == 2:
                 status_text, is_ready = result
-                return status_text
+                return status_text
             else:
                 return "⚠️ Unable to check model status"
         except Exception as e:
@@ -500,12 +507,12 @@ def create_demo():
             result = load_model_and_update_status(model_name)
             if result and isinstance(result, tuple) and len(result) == 2:
                 status_text, is_ready = result
-                submit_enabled = is_ready
-                return (
-                    status_text,
-                    gr.update(interactive=submit_enabled),
-                    gr.update(interactive=submit_enabled)
-                )
+                submit_enabled = is_ready
+                return (
+                    status_text,
+                    gr.update(interactive=submit_enabled),
+                    gr.update(interactive=submit_enabled)
+                )
             else:
                 error_msg = "⚠️ Unable to load model status"
                 return (
```
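The three-element return value feeds one status component plus two buttons, so a single handler can both report status and gate interactivity. A minimal self-contained Gradio sketch of the same pattern (component names here are illustrative, not the ones in ui.py):

```python
import gradio as gr

def refresh_status():
    is_ready = True  # stand-in for a real readiness check
    status = "✅ model ready" if is_ready else "⏳ model loading..."
    # One return value per output component, in order
    return status, gr.update(interactive=is_ready), gr.update(interactive=is_ready)

with gr.Blocks() as demo:
    status_box = gr.Markdown("⏳ model loading...")
    submit = gr.Button("Submit", interactive=False)
    retry = gr.Button("Retry", interactive=False)
    demo.load(fn=refresh_status, outputs=[status_box, submit, retry])

if __name__ == "__main__":
    demo.launch()
```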
```diff
@@ -555,20 +562,59 @@ def create_demo():
             return f"❌ ASR: error - {str(e)[:100]}"
 
     # Load medical model on startup and update status
-    # Use a wrapper to handle GPU context properly
+    # Use a wrapper to handle GPU context properly with retry logic
     def load_startup_and_update_ui():
-        """Load model on startup and return status with UI updates"""
-        try:
-            status_text = load_medical_model_on_startup()
-            # Check if model is ready and update submit button state
-            is_ready = is_model_loaded(DEFAULT_MEDICAL_MODEL)
-            return status_text, gr.update(interactive=is_ready), gr.update(interactive=is_ready)
-        except Exception as e:
-            logger.error(f"Error in load_startup_and_update_ui: {e}")
-            import traceback
-            logger.debug(f"Full traceback: {traceback.format_exc()}")
-            error_msg = f"⚠️ Startup error: {str(e)[:100]}"
-            return error_msg, gr.update(interactive=False), gr.update(interactive=False)
+        """Load model on startup with retry logic (max 3 attempts) and return status with UI updates"""
+        import time
+        max_retries = 3
+        base_delay = 5.0  # Start with 5 seconds delay
+
+        for attempt in range(1, max_retries + 1):
+            try:
+                logger.info(f"[STARTUP] Attempt {attempt}/{max_retries} to load medical model...")
+                status_text = load_medical_model_on_startup()
+                # Check if model is ready and update submit button state
+                is_ready = is_model_loaded(DEFAULT_MEDICAL_MODEL)
+                if is_ready:
+                    logger.info(f"[STARTUP] ✅ Model loaded successfully on attempt {attempt}")
+                    return status_text, gr.update(interactive=is_ready), gr.update(interactive=is_ready)
+                else:
+                    # Model didn't load, but no exception - might be a state issue
+                    logger.warning(f"[STARTUP] Model not ready after attempt {attempt}, but no error")
+                    if attempt < max_retries:
+                        delay = base_delay * attempt  # Linear backoff: 5s, 10s, 15s
+                        logger.info(f"[STARTUP] Retrying in {delay} seconds...")
+                        time.sleep(delay)
+                        continue
+                    else:
+                        return status_text, gr.update(interactive=False), gr.update(interactive=False)
+            except Exception as e:
+                error_msg = str(e)
+                is_quota_error = ("429" in error_msg or "Too Many Requests" in error_msg or
+                                  "quota" in error_msg.lower() or "ZeroGPU" in error_msg)
+
+                if is_quota_error and attempt < max_retries:
+                    delay = base_delay * attempt  # Linear backoff: 5s, 10s, 15s
+                    logger.warning(f"[STARTUP] ZeroGPU rate limit/quota error on attempt {attempt}/{max_retries}")
+                    logger.info(f"[STARTUP] Retrying in {delay} seconds...")
+                    time.sleep(delay)
+                    continue
+                else:
+                    logger.error(f"[STARTUP] Error in load_startup_and_update_ui (attempt {attempt}/{max_retries}): {e}")
+                    import traceback
+                    logger.debug(f"[STARTUP] Full traceback: {traceback.format_exc()}")
+
+                    if is_quota_error:
+                        error_display = "⚠️ ZeroGPU quota/rate limit reached. Please wait or try again later."
+                    else:
+                        error_display = f"⚠️ Startup error: {str(e)[:100]}"
+
+                    if attempt >= max_retries:
+                        logger.error(f"[STARTUP] Failed after {max_retries} attempts")
+                        return error_display, gr.update(interactive=False), gr.update(interactive=False)
+
+        # Should not reach here, but just in case
+        return "⚠️ Startup failed after retries", gr.update(interactive=False), gr.update(interactive=False)
 
     demo.load(
         fn=load_startup_and_update_ui,
```
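The retry loop is inlined in `load_startup_and_update_ui`; `base_delay * attempt` gives a linear 5 s / 10 s / 15 s schedule rather than a true exponential one. The same behavior, sketched as a reusable decorator (the decorator and its predicate hook are illustrative, not part of this commit):

```python
import functools
import time

def retry_on(predicate, max_retries: int = 3, base_delay: float = 5.0):
    """Retry while predicate(exc) is true, sleeping base_delay * attempt between tries."""
    def decorator(fn):
        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            for attempt in range(1, max_retries + 1):
                try:
                    return fn(*args, **kwargs)
                except Exception as e:
                    if attempt == max_retries or not predicate(e):
                        raise
                    time.sleep(base_delay * attempt)  # linear backoff: 5s, 10s, 15s
        return wrapper
    return decorator

# Usage sketch, assuming the quota predicate from earlier:
# @retry_on(is_zerogpu_quota_error)
# def load_medical_model_on_startup(): ...
```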
```diff
@@ -596,25 +642,25 @@ def create_demo():
             updated_history = history + [{"role": "assistant", "content": error_msg}]
             yield updated_history, ""
             return
-
+
         # If request is None, create a mock request for compatibility
         if request is None:
-            class MockRequest:
-                session_hash = "anonymous"
-            request = MockRequest()
-
+            class MockRequest:
+                session_hash = "anonymous"
+            request = MockRequest()
+
         # Let stream_chat handle model loading (it's GPU-decorated and can load on-demand)
-        for result in stream_chat(
-            message, history, system_prompt, temperature, max_new_tokens,
-            top_p, top_k, penalty, retriever_k, merge_threshold,
-            use_rag, medical_model_name, use_web_search,
-            enable_clinical_intake, disable_agentic_reasoning, show_thoughts, request
-        ):
-            yield result
+        for result in stream_chat(
+            message, history, system_prompt, temperature, max_new_tokens,
+            top_p, top_k, penalty, retriever_k, merge_threshold,
+            use_rag, medical_model_name, use_web_search,
+            enable_clinical_intake, disable_agentic_reasoning, show_thoughts, request
+        ):
+            yield result
         # If we get here, stream_chat completed successfully
         return
 
-    except Exception as e:
+    except Exception as e:
         error_msg_lower = str(e).lower()
         is_gpu_error = 'gpu task aborted' in error_msg_lower or 'gpu' in error_msg_lower or 'zerogpu' in error_msg_lower
 
```
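The chat handler is itself a generator: it substitutes a stand-in request when Gradio passes `None`, then forwards every chunk from the GPU-decorated `stream_chat`. A stripped-down sketch of that delegation pattern (the stub `stream_chat` below only echoes; the real one takes the full parameter list shown in the diff):

```python
from types import SimpleNamespace

def stream_chat(message, history, request):
    # Stub standing in for the real GPU-decorated generator
    yield history + [{"role": "assistant", "content": f"echo: {message}"}], ""

def chat_wrapper(message, history, request=None):
    if request is None:
        # Same idea as the MockRequest shim in the commit
        request = SimpleNamespace(session_hash="anonymous")
    yield from stream_chat(message, history, request)

for history, textbox_value in chat_wrapper("hello", []):
    print(history, repr(textbox_value))
```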
 
```diff
@@ -636,10 +682,10 @@ def create_demo():
         if is_gpu_error:
             error_msg = f"⚠️ GPU task was aborted. This can happen if:\n- The request took too long\n- Multiple GPU requests conflicted\n- GPU quota was exceeded\n\nPlease try again or select a different model."
         else:
-            error_msg = f"⚠️ An error occurred: {str(e)[:200]}"
+            error_msg = f"⚠️ An error occurred: {str(e)[:200]}"
 
-        updated_history = history + [{"role": "assistant", "content": error_msg}]
-        yield updated_history, ""
+        updated_history = history + [{"role": "assistant", "content": error_msg}]
+        yield updated_history, ""
         return
 
     submit_button.click(
```
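The fallback branch converts any exception into an assistant message so the stream ends gracefully instead of crashing the UI. The classification could be isolated into a helper like this (hypothetical name, mirroring the strings in the diff):

```python
def format_stream_error(e: Exception) -> str:
    """Map an exception to a user-facing chat message (illustrative helper)."""
    msg = str(e).lower()
    if "gpu task aborted" in msg or "gpu" in msg or "zerogpu" in msg:
        return ("⚠️ GPU task was aborted. This can happen if:\n"
                "- The request took too long\n"
                "- Multiple GPU requests conflicted\n"
                "- GPU quota was exceeded\n\n"
                "Please try again or select a different model.")
    return f"⚠️ An error occurred: {str(e)[:200]}"

print(format_stream_error(RuntimeError("ZeroGPU task aborted")))
```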
 