import gc

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline

available_models = {
    "Llama 3.2": "unsloth/Llama-3.2-3B-Instruct",
    "Microsoft Phi-4 Mini": "microsoft/Phi-4-mini-instruct",
    # Note: this is a GGUF repo; AutoModelForCausalLM may need a regular (non-GGUF)
    # checkpoint or the gguf_file argument to load it.
    "Google Gemma 3": "unsloth/gemma-3-4b-it-GGUF",
}

# --- Global State (or use gr.State in Blocks) ---
# Keeps track of the currently loaded model/pipeline.
current_model_id = None
current_pipeline = None

print(f"Models available for selection: {list(available_models.keys())}")


# Define a function to load/switch models
def load_llm_model(model_name):
    """Loads the selected LLM, unloading the previous one."""
    global current_model_id, current_pipeline, tokenizer, model

    new_model_id = available_models.get(model_name)
    if not new_model_id:
        return "Invalid model selected.", None  # Error message and None pipeline

    if new_model_id == current_model_id and current_pipeline is not None:
        print(f"Model {model_name} is already loaded.")
        # Indicate success but don't reload
        return f"{model_name} already loaded.", current_pipeline

    print(f"Switching to model: {model_name} ({new_model_id})...")

    # Unload the previous model (important for GPU memory):
    # drop all references, run garbage collection, then clear the CUDA cache.
    current_pipeline = None
    for name in ("model", "tokenizer", "pipe"):
        if name in globals():
            del globals()[name]
    gc.collect()
    torch.cuda.empty_cache()
    print("Previous model unloaded (if any).")

    # --- Load the new model ---
    print(f"Loading {model_name}...")
    try:
        # Load tokenizer
        tokenizer = AutoTokenizer.from_pretrained(new_model_id, trust_remote_code=True)

        # Load model, quantized to 4 bits to fit in limited GPU memory.
        # Recent transformers versions expect the quantization settings via
        # BitsAndBytesConfig rather than a bare load_in_4bit=True kwarg.
        model = AutoModelForCausalLM.from_pretrained(
            new_model_id,
            torch_dtype="auto",  # or torch.float16 / torch.bfloat16 if available
            quantization_config=BitsAndBytesConfig(load_in_4bit=True),
            device_map="auto",
            trust_remote_code=True,
        )

        # Create pipeline
        loaded_pipeline = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            torch_dtype="auto",
            device_map="auto",
        )

        print(f"Model {model_name} loaded successfully!")
        current_model_id = new_model_id
        current_pipeline = loaded_pipeline  # Update global state
        # Returning the pipeline as well makes it easy to wire up gr.State instead of globals.
        return f"{model_name} loaded successfully!", loaded_pipeline  # Status message and pipeline object

    except Exception as e:
        print(f"Error loading model {model_name}: {e}")
        current_model_id = None
        current_pipeline = None
        return f"Error loading {model_name}: {e}", None  # Error message and None pipeline


# --- Function to handle Q&A submission ---
# This function relies on the globally managed 'current_pipeline'.
# In a more robust Gradio app, you'd pass the pipeline via gr.State (see the sketch at the end).
def handle_submit(question):
    """Handles the user submitting a question."""
    if not current_pipeline:
        return "Error: No model is currently loaded. Please select a model."
    if not pdf_text:
        return "Error: PDF text is not loaded. Please run Section 4."
    if not question:
        return "Please enter a question."

    print(f"Handling submission for question: '{question}' using {current_model_id}")
    # Call the Q&A function defined in Section 5
    answer = answer_question_from_pdf(pdf_text, question, current_pipeline)
    return answer


# --- Build Gradio interface using Blocks ---
print("Building Gradio interface...")
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        f"""
        # PDF Q&A Bot Using Hugging Face Open-Source Models
        Ask questions about the document ('{pdf_filename}' if loaded, {len(pdf_text)} chars).
        Select an open-source LLM to answer your question.
        **Note:** Switching models takes time, as the new model needs to be downloaded and loaded onto the GPU.
        """
    )

    # Store the pipeline in Gradio state for better practice (optional for this simple version)
    # llm_pipeline_state = gr.State(None)

    with gr.Row():
        model_dropdown = gr.Dropdown(
            choices=list(available_models.keys()),
            label="🤖 Select LLM Model",
            value=list(available_models.keys())[0],  # Default to the first model
        )
        status_textbox = gr.Textbox(label="Model Status", interactive=False)

    question_textbox = gr.Textbox(
        label="❓ Your Question",
        lines=2,
        placeholder="Enter your question about the document here...",
    )
    submit_button = gr.Button("Submit Question", variant="primary")
    answer_textbox = gr.Textbox(label="💡 Answer", lines=5, interactive=False)

    # --- Event handlers ---
    # When the dropdown changes, load the selected model.
    # load_llm_model returns (status, pipeline); with only the status Textbox as an
    # output, keep just the first value. With gr.State you would return both.
    model_dropdown.change(
        fn=lambda name: load_llm_model(name)[0],
        inputs=[model_dropdown],
        outputs=[status_textbox],
        # fn=load_llm_model, outputs=[status_textbox, llm_pipeline_state]  # If using gr.State
    )

    # When the button is clicked, call the submit handler
    submit_button.click(
        fn=handle_submit,
        inputs=[question_textbox],
        outputs=[answer_textbox],
        # inputs=[question_textbox, llm_pipeline_state],  # Pass state if using it
    )

# --- Initial model load ---
# Simpler here: manually load the first model *before* launching Gradio.
initial_model_name = list(available_models.keys())[0]
print(f"Performing initial load of default model: {initial_model_name}...")
status, _ = load_llm_model(initial_model_name)
status_textbox.value = status  # Set initial status shown in the UI
print("Initial load complete.")

# --- Launch the Gradio app ---
print("Launching Gradio demo...")
demo.launch(debug=True)  # debug=True provides more detailed logs
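

# --- Optional: passing the pipeline via gr.State instead of globals ---
# A minimal sketch (not wired into the demo above) of the gr.State approach hinted
# at in the comments: the loaded pipeline is threaded through per-session state
# rather than module-level globals. The helper names used here (build_state_demo,
# handle_submit_with_state) are illustrative, not part of the tutorial code; it
# still assumes pdf_text and answer_question_from_pdf from Sections 4 and 5.
def build_state_demo():
    """Builds an alternative Blocks app that keeps the pipeline in gr.State."""
    with gr.Blocks() as state_demo:
        llm_pipeline_state = gr.State(None)  # Holds the currently loaded pipeline

        model_dd = gr.Dropdown(
            choices=list(available_models.keys()),
            label="Select LLM Model",
            value=list(available_models.keys())[0],
        )
        status_tb = gr.Textbox(label="Model Status", interactive=False)
        question_tb = gr.Textbox(label="Your Question", lines=2)
        submit_btn = gr.Button("Submit Question")
        answer_tb = gr.Textbox(label="Answer", lines=5, interactive=False)

        # load_llm_model already returns (status_message, pipeline), which maps
        # directly onto [status_tb, llm_pipeline_state].
        model_dd.change(
            fn=load_llm_model,
            inputs=[model_dd],
            outputs=[status_tb, llm_pipeline_state],
        )

        def handle_submit_with_state(question, llm_pipeline):
            """Same Q&A logic as handle_submit, but reads the pipeline from state."""
            if llm_pipeline is None:
                return "Error: No model is currently loaded. Please select a model."
            if not question:
                return "Please enter a question."
            return answer_question_from_pdf(pdf_text, question, llm_pipeline)

        submit_btn.click(
            fn=handle_submit_with_state,
            inputs=[question_tb, llm_pipeline_state],
            outputs=[answer_tb],
        )
    return state_demo

# To try it, launch build_state_demo() instead of `demo` above.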