""" Gradio app to explore pancreas cancer clinical report annotations. Loads data from rntc/biomed-fr-pancreas-annotations on HuggingFace. """ import gradio as gr from datasets import load_dataset from difflib import SequenceMatcher # Load the dataset print("Loading dataset from HuggingFace...") dataset = load_dataset("rntc/biomed-fr-pancreas-annotations", split="train") print(f"Loaded {len(dataset)} samples") def fuzzy_find_span(text: str, span: str, threshold: float = 0.85) -> tuple: """ Find a span in text with fuzzy matching. Returns (start, end) or None if not found. """ # First try exact match idx = text.find(span) if idx != -1: return (idx, idx + len(span)) # Try fuzzy match with sliding window span_len = len(span) if span_len < 10 or span_len > len(text): return None best_ratio = 0 best_pos = None # Use a window slightly larger than span window_size = min(span_len + 20, len(text)) for i in range(0, len(text) - span_len + 1, max(1, span_len // 4)): window = text[i:i + window_size] ratio = SequenceMatcher(None, span, window[:span_len]).ratio() if ratio > best_ratio and ratio >= threshold: best_ratio = ratio best_pos = i if best_pos is not None: return (best_pos, best_pos + span_len) return None def escape_html(text: str) -> str: """Escape HTML special characters.""" if not text: return "" return (str(text) .replace("&", "&") .replace("<", "<") .replace(">", ">") .replace('"', """)) # Soft pastel colors for better readability COLORS = [ "#FFE082", # amber "#A5D6A7", # green "#90CAF9", # blue "#FFAB91", # deep orange "#CE93D8", # purple "#80DEEA", # cyan "#C5E1A5", # light green "#FFCC80", # orange "#B39DDB", # deep purple "#81D4FA", # light blue "#EF9A9A", # red "#FFF59D", # yellow "#F48FB1", # pink "#80CBC4", # teal "#BCAAA4", # brown ] def highlight_spans_in_text(cr_text: str, annotation: dict) -> str: """ Highlight spans in the CR text based on annotations. Returns HTML with highlighted spans. """ if not cr_text or not annotation: return f"
No annotations
" # Group variables by category (simple heuristic based on name) categories = { "Patient Info": ["date_of_birth", "age_at_cancer_diagnosis", "biological_gender", "vital_status", "date_of_death"], "Diagnosis": ["date_of_cancer_diagnostic", "primary_tumor_localisation", "ctnm_stage", "stage_as_per_ehr", "histological_type", "epithelial_tumor_subtype"], "Tumor Characteristics": ["resectability_status", "two_largest_diameters", "metastasis_localisation", "number_of_metastatic_sites"], "Lab Results": ["crp_at_diagnosis", "albumin_at_diagnosis", "alanine_transaminase", "aspartate_aminotransferase", "conjugated_bilirubin", "ca19_9"], "Treatment": ["surgery", "loco_regional_radiotherapy", "immunotherapy", "targeted_therapy", "full_course_of_initial_treatment"], "Molecular": ["germline_mutation", "tumor_molecular_profiling"], "Progression": ["date_of_first_progression", "type_of_first_progression", "treatment_at_first_progression", "best_response", "reason_for_treatment_end"], } def get_category(var_name): for cat, keywords in categories.items(): for kw in keywords: if kw in var_name.lower(): return cat return "Other" # Group rows by category categorized = {} for var_name, var_data in annotation.items(): if var_data and isinstance(var_data, dict): value = var_data.get("value") if value: cat = get_category(var_name) if cat not in categorized: categorized[cat] = [] categorized[cat].append((var_name, var_data)) if not categorized: return "No extracted values
" html_parts = [] for category in ["Patient Info", "Diagnosis", "Tumor Characteristics", "Lab Results", "Treatment", "Molecular", "Progression", "Other"]: if category not in categorized: continue html_parts.append(f"| {escape_html(var_label)} | {escape_html(str(value))} | {escape_html(span_preview) if span_preview else '-'} |
Invalid sample index
", "Invalid" sample = dataset[int(sample_idx)] cr_text = sample.get("CR", "") annotation = sample.get("annotation", {}) highlighted_html = highlight_spans_in_text(cr_text, annotation) annotations_html = format_annotations_table(annotation) stats = get_stats(annotation) return highlighted_html, annotations_html, stats def search_samples(query: str): """Search samples by text content.""" if not query or len(query) < 3: # Return first 20 samples return [[i, dataset[i]["CR"][:80] + "..."] for i in range(min(20, len(dataset)))] results = [] query_lower = query.lower() for i, sample in enumerate(dataset): cr = sample.get("CR", "") if query_lower in cr.lower(): results.append([i, cr[:80] + "..."]) if len(results) >= 50: break if not results: return [["No results", f"No samples found containing '{query}'"]] return results # Custom CSS for better styling custom_css = """ .cr-text { font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; font-size: 14px; line-height: 1.8; padding: 20px; background: #fafafa; border-radius: 8px; white-space: pre-wrap; max-height: 500px; overflow-y: auto; } .entity { padding: 2px 6px; border-radius: 4px; cursor: help; position: relative; display: inline; transition: all 0.2s; } .entity:hover { filter: brightness(0.9); box-shadow: 0 2px 8px rgba(0,0,0,0.15); } .entity-label { display: none; position: absolute; bottom: 100%; left: 0; background: #333; color: white; padding: 4px 8px; border-radius: 4px; font-size: 11px; white-space: nowrap; z-index: 100; } .entity:hover .entity-label { display: block; } .category { margin-bottom: 20px; } .category h4 { color: #1976d2; border-bottom: 2px solid #1976d2; padding-bottom: 8px; margin-bottom: 12px; } .annotations-table { width: 100%; border-collapse: collapse; font-size: 13px; } .annotations-table tr:nth-child(even) { background: #f5f5f5; } .annotations-table td { padding: 10px 12px; border-bottom: 1px solid #e0e0e0; vertical-align: top; } .var-name { font-weight: 600; color: #333; width: 30%; } .var-value { color: #1976d2; font-weight: 500; width: 25%; } .var-span { color: #666; font-style: italic; font-size: 12px; width: 45%; } .no-data { color: #999; font-style: italic; padding: 20px; text-align: center; } .stats-badge { background: #e3f2fd; color: #1976d2; padding: 8px 16px; border-radius: 20px; font-weight: 500; display: inline-block; } """ # Build the Gradio interface with gr.Blocks( title="Pancreas Cancer Annotations Explorer", theme=gr.themes.Soft(primary_hue="blue"), css=custom_css ) as demo: gr.Markdown(""" # š¬ Pancreas Cancer Clinical Report Annotations Explorer Explore structured annotations extracted from synthetic French clinical reports about pancreas cancer. **How to use:** - Use the slider or search to navigate samples - Hover over highlighted text to see extracted variables - View the complete annotation table below """) with gr.Row(): with gr.Column(scale=2): sample_slider = gr.Slider( minimum=0, maximum=len(dataset) - 1, step=1, value=0, label=f"š Sample Index (0 - {len(dataset) - 1})", info="Drag to browse samples" ) with gr.Column(scale=1): stats_display = gr.Markdown("", elem_classes=["stats-badge"]) with gr.Row(): with gr.Column(scale=1): search_box = gr.Textbox( label="š Search", placeholder="Type to search in clinical reports...", info="Min 3 characters" ) search_results = gr.Dataframe( headers=["#", "Preview"], label="Results", interactive=False, height=200 ) gr.Markdown("---") gr.Markdown("### š Clinical Report with Entity Highlighting") gr.Markdown("*Hover over colored text to see the extracted variable and value*") cr_display = gr.HTML() gr.Markdown("---") gr.Markdown("### š Extracted Annotations") annotations_display = gr.HTML() # Event handlers sample_slider.change( fn=display_sample, inputs=[sample_slider], outputs=[cr_display, annotations_display, stats_display] ) search_box.change( fn=search_samples, inputs=[search_box], outputs=[search_results] ) def on_select(evt: gr.SelectData, data): if data is not None and len(data) > 0: try: selected_idx = int(data[evt.index[0]][0]) return selected_idx except (ValueError, IndexError, TypeError): pass return 0 search_results.select( fn=on_select, inputs=[search_results], outputs=[sample_slider] ) # Load first sample on start demo.load( fn=display_sample, inputs=[sample_slider], outputs=[cr_display, annotations_display, stats_display] ) if __name__ == "__main__": demo.launch()