Niranjan Sathish committed on
Commit 40d2f99 · 1 Parent(s): a337894

Initial Commit
.gitattributes CHANGED
@@ -1,35 +1,9 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
  *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.idx filter=lfs diff=lfs merge=lfs -text
+ *.csv filter=lfs diff=lfs merge=lfs -text
+ Data/* filter=lfs diff=lfs merge=lfs -text
+ Data/*.pkl filter=lfs diff=lfs merge=lfs -text
+ Data/*.npy filter=lfs diff=lfs merge=lfs -text
+ Data/*.idx filter=lfs diff=lfs merge=lfs -text
+ Data/*.csv filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,50 @@
+ # Python
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ env/
+ venv/
+ ENV/
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+
+ # Environment
+ .env
+ Chatbot.venv/
+
+ # IDE
+ .vscode/
+ .idea/
+ *.swp
+ *.swo
+
+ # OS
+ .DS_Store
+ Thumbs.db
+
+ # Data (don't commit large files)
+ *.pkl
+ *.npy
+ *.idx
+ !Data/*.pkl
+ !Data/*.npy
+ !Data/*.idx
+
+ # Model cache
+ .cache/
+ model_cache/
Data/Dataset.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cc38f7e5bfad6d7c2865ed7c94d483c8b9b887a47853e4a3c16ce957ce1f06a0
+ size 35120734
Data/doc_metadata.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:800157a95b50080634fdce730014af49a8e0cf01d2dbb484785b15936dc9abff
+ size 53368209
Data/doc_vectors.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f54da3cd890cf384fdc3b7abcd6ed5f840c0f53da30615fd417fc8256fd1b5ca
+ size 70190720
Data/faiss_index.idx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:58d68a5ccb27c94e357ab12eec21d5d54d903949ae37648202643eb33387156b
+ size 70190637
Data/flattened_drug_dataset_cleaned.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0669d5d7366973a342a3cc35321366a02837c66ac5e7c28c3bf0569897db5b84
+ size 31338099
Evaluation/Evaluation_metrics_score.py ADDED
@@ -0,0 +1,107 @@
+ """
+ Evaluation Script for Retrieval-based QA Chatbot
+ =================================================
+
+ This module handles:
+ 1. Loading evaluation questions and expected chunk IDs
+ 2. Preprocessing queries and retrieving top chunks
+ 3. Calculating Precision@3, Recall@3, F1-Score@3, and Success Rate@3
+ """
+
+ import os
+ import sys
+ import pandas as pd
+
+ # Make the modules in Scripts/ importable when this file is run from Evaluation/
+ sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'Scripts'))
+
+ from Query_processing import preprocess_query
+ from Retrieval import Retrieval_averagedQP
+
+ # -------------------------------
+ # File Paths
+ # -------------------------------
+
+ # Get the directory of the current script
+ script_dir = os.path.dirname(os.path.abspath(__file__))
+
+ # Path to evaluation dataset
+ csv_path = os.path.join(script_dir, 'custom_drug_eval_set_id.csv')
+
+ # -------------------------------
+ # Load Evaluation Dataset
+ # -------------------------------
+
+ df = pd.read_csv(csv_path)
+
+ # -------------------------------
+ # Evaluation Storage
+ # -------------------------------
+
+ all_precisions = []
+ all_recalls = []
+ all_f1s = []
+ all_successes = []
+
+ # -------------------------------
+ # Evaluation Loop
+ # -------------------------------
+
+ for _, row in df.iterrows():
+     question = row['question']
+     expected_ids = set(map(int, filter(None, str(row['relevant_chunk']).split(';'))))
+
+     print(f"\n[Evaluation] Question: {question}")
+     print(f"[Expected Chunk IDs] {expected_ids}")
+
+     # Preprocess the query
+     intent, entities = preprocess_query(question)
+
+     # Retrieve top-k chunk predictions
+     retrieved_df = Retrieval_averagedQP(question, intent, entities, top_k=10, alpha=0.8)
+     retrieved_df = retrieved_df.head(3)  # Limit to top 3 results
+     retrieved_ids = set(retrieved_df['chunk_id'].astype(int).tolist())
+
+     print(f"[Retrieved Chunk IDs] {retrieved_ids}")
+
+     # Evaluation Metrics Calculation
+     tp = len(retrieved_ids & expected_ids)
+     fp = len(retrieved_ids - expected_ids)
+     fn = len(expected_ids - retrieved_ids)
+
+     print(f"[Metrics] TP: {tp}, FP: {fp}, FN: {fn}")
+
+     success = 1 if tp > 0 else 0
+     precision = tp / (tp + fp) if (tp + fp) > 0 else 0
+     recall = tp / (tp + fn) if (tp + fn) > 0 else 0
+     f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
+
+     all_precisions.append(precision)
+     all_recalls.append(recall)
+     all_f1s.append(f1)
+     all_successes.append(success)
+
+ # -------------------------------
+ # Aggregate Results
+ # -------------------------------
+
+ mean_precision = sum(all_precisions) / len(all_precisions)
+ mean_recall = sum(all_recalls) / len(all_recalls)
+ mean_f1 = sum(all_f1s) / len(all_f1s)
+ mean_success = sum(all_successes) / len(all_successes)
+
+ # -------------------------------
+ # Display Final Metrics
+ # -------------------------------
+
+ print("\n========= Final Evaluation Metrics =========")
+ print(f"Success Rate@3: {mean_success:.4f}")
+ print(f"Precision@3: {mean_precision:.4f}")
+ print(f"Recall@3: {mean_recall:.4f}")
+ print(f"F1 Score@3: {mean_f1:.4f}")
Evaluation/custom_drug_eval_set_id.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a32b1282d7fd5e6d55b73499ee314410cffa69b456a7372983225a71da6b5674
+ size 4001
README.md CHANGED
@@ -1,12 +1,76 @@
  ---
- title: DrugBot Retrieval Based
- emoji: 🐠
- colorFrom: gray
- colorTo: green
+ title: Medical Drug QA Chatbot
+ emoji: 💊
+ colorFrom: blue
+ colorTo: indigo
  sdk: gradio
- sdk_version: 5.49.1
- app_file: app.py
+ sdk_version: 4.0.0
+ app_file: Scripts/app.py
  pinned: false
+ license: mit
  ---
 
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # 💊 Medical Drug QA Chatbot
+
+ An intelligent chatbot that answers questions about medications using advanced NLP techniques.
+
+ ## Features
+
+ - 🔍 **Smart Query Processing**: BioBERT-based NER for drug entity extraction
+ - 📚 **Hybrid Retrieval**: FAISS + BioBERT semantic reranking
+ - 🤖 **AI-Powered Answers**: Groq Llama-4 for natural language generation
+ - 💾 **Comprehensive Database**: Mayo Clinic drug information
+
+ ## Usage
+
+ Simply ask questions about:
+ - Side effects and warnings
+ - Dosage and usage instructions
+ - Drug interactions
+ - Storage guidelines
+ - Precautions for specific conditions
+
+ ## Example Questions
+
+ - "What are the side effects of Aspirin?"
+ - "How should I store Insulin?"
+ - "What precautions should I take with Lisinopril?"
+ - "Can I take Metformin with alcohol?"
+
+ ## Tech Stack
+
+ - **Frontend**: Gradio
+ - **NER**: BioBERT (alvaroalon2/biobert_chemical_ner)
+ - **Embeddings**: MiniLM-V6, BioBERT
+ - **Vector DB**: FAISS
+ - **LLM**: Llama-4 via Groq API
+
+ ## ⚠️ Disclaimer
+
+ This chatbot provides educational information only. Always consult healthcare professionals for medical advice.
+
+ ## Setup
+
+ 1. Clone the repository
+ 2. Install dependencies: `pip install -r requirements.txt`
+ 3. Set the `GROQ_API_KEY` environment variable
+ 4. Build the FAISS index: `python Scripts/Retrieval.py`
+ 5. Run: `python Scripts/app.py`
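+
+ ## Quick Start (Python)
+
+ A minimal sketch of the full pipeline (it mirrors `Scripts/demo.py`), assuming the FAISS index in `Data/` is already built and `GROQ_API_KEY` is set:
+
+ ```python
+ import sys
+ sys.path.insert(0, "Scripts")  # make the project modules importable
+
+ from Query_processing import preprocess_query
+ from Retrieval import Retrieval_averagedQP
+ from Answer_Generation import answer_generation
+
+ question = "What are the side effects of Aspirin?"
+ intent, entities = preprocess_query(question)
+ chunks = Retrieval_averagedQP(question, intent, entities, top_k=10, alpha=0.8)
+ print(answer_generation(question, chunks, top_k=3))
+ ```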
Scripts/Answer_Generation.py ADDED
@@ -0,0 +1,140 @@
+ """
+ Answer Generation Module for Retrieval-based Medical QA Chatbot
+ =================================================================
+ This module handles answer generation using Groq API with proper error handling.
+ """
+
+ import os
+ from openai import OpenAI
+
+ # Get API key from environment
+ GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
+
+ if GROQ_API_KEY is None:
+     print("[Warning] GROQ_API_KEY not set!")
+     client = None
+ else:
+     client = OpenAI(
+         api_key=GROQ_API_KEY,
+         base_url="https://api.groq.com/openai/v1"
+     )
+
+ # -------------------------------
+ # Function: Query Groq API
+ # -------------------------------
+
+ def query_groq(prompt, model="meta-llama/llama-4-scout-17b-16e-instruct", max_tokens=300):
+     """
+     Sends a prompt to Groq API and returns the generated response.
+
+     Parameters:
+     prompt (str): The text prompt for the model.
+     model (str): Model name deployed on Groq API.
+     max_tokens (int): Maximum tokens allowed in the output.
+
+     Returns:
+     str: Model-generated response text.
+     """
+     if client is None:
+         return "⚠️ Error: API key not configured. Please contact the administrator."
+
+     try:
+         response = client.chat.completions.create(
+             model=model,
+             messages=[
+                 {"role": "system", "content": "You are a helpful biomedical assistant providing accurate drug information."},
+                 {"role": "user", "content": prompt}
+             ],
+             temperature=0.7,
+             max_tokens=max_tokens
+         )
+         return response.choices[0].message.content.strip()
+     except Exception as e:
+         print(f"[Answer Generation] Error calling Groq API: {e}")
+         return f"⚠️ Error generating answer: {str(e)}"
+
+ # -------------------------------
+ # Function: Build Prompt
+ # -------------------------------
+
+ def build_prompt(question, context):
+     """
+     Constructs a prompt for the model combining the user question and retrieved context.
+
+     Parameters:
+     question (str): User's question.
+     context (str): Retrieved relevant text chunks.
+
+     Returns:
+     str: Complete prompt text.
+     """
+     return f"""Based strictly on the following medical information, answer the question clearly and concisely.
+
+ Question: {question}
+
+ Context:
+ {context}
+
+ Instructions:
+ - Provide a direct, accurate answer based only on the context
+ - Use clear, simple language
+ - If the context doesn't contain enough information, say so
+ - Do not add information not present in the context
+ """
+
+ # -------------------------------
+ # Function: Answer Generation
+ # -------------------------------
+
+ def answer_generation(question, top_chunks, top_k=3):
+     """
+     Generates an answer based on retrieved top chunks.
+
+     Parameters:
+     question (str): User's question.
+     top_chunks (DataFrame): Retrieved top chunks with context.
+     top_k (int): Number of top chunks to use for answer generation.
+
+     Returns:
+     str: Final generated answer.
+     """
+     try:
+         # Select top-k chunks
+         top_chunks = top_chunks.head(top_k)
+         print(f"[Answer Generation] Using top {len(top_chunks)} chunks")
+
+         if top_chunks.empty:
+             return "⚠️ No relevant information found. Please try rephrasing your question."
+
+         # Join context
+         context = "\n\n".join([
+             f"Drug: {row['drug_name']}\n"
+             f"Section: {row['section']}\n"
+             f"Info: {row['chunk_text']}"
+             for _, row in top_chunks.iterrows()
+         ])
+
+         # Build prompt and query Groq
+         prompt = build_prompt(question, context)
+         answer = query_groq(prompt)
+
+         return answer
+
+     except Exception as e:
+         print(f"[Answer Generation] Error: {e}")
+         return f"⚠️ Error generating answer: {str(e)}"
Scripts/Query_processing.py ADDED
@@ -0,0 +1,181 @@
+ """
+ Query Processing Pipeline for Retrieval-based QA Chatbot
+ ========================================================
+
+ This module handles:
+ 1. Query preprocessing
+ 2. Intent classification
+ 3. Named Entity Recognition (NER) using lightweight BioBERT
+
+ Uses: alvaroalon2/biobert_chemical_ner (~140MB, optimized for drugs/chemicals)
+ """
+
+ import re
+ from typing import List, Tuple
+ from transformers import pipeline
+ import torch
+
+ # -------------------------------
+ # Initialize Lightweight NER Model
+ # -------------------------------
+
+ print("[NER] Loading lightweight BioBERT NER model...")
+
+ try:
+     # This model is specifically trained for chemical/drug entity recognition
+     ner_model = pipeline(
+         "ner",
+         model="alvaroalon2/biobert_chemical_ner",
+         aggregation_strategy="simple",
+         device=0 if torch.cuda.is_available() else -1
+     )
+     print("[NER] ✓ Model loaded successfully\n")
+ except Exception as e:
+     print(f"[NER] ✗ Failed to load model: {e}")
+     ner_model = None
+
+ # -------------------------------
+ # Named Entity Extraction
+ # -------------------------------
+
+ def extract_entities_BERT(question: str) -> List[str]:
+     """
+     Extract biomedical entities using lightweight BioBERT NER.
+
+     Parameters:
+     question (str): User query
+
+     Returns:
+     List[str]: Extracted entities (drugs, chemicals, etc.)
+     """
+     if ner_model is None:
+         print("[NER] Model not available, returning empty list")
+         return []
+
+     try:
+         # Run NER pipeline
+         entities = ner_model(question)
+
+         # Filter and clean entities
+         extracted = []
+         for ent in entities:
+             # Only keep high-confidence entities (>70%)
+             if ent['score'] > 0.7:
+                 # Clean up subword tokens (remove ##)
+                 entity_text = ent['word'].replace('##', '').strip()
+
+                 # Filter out very short entities and common words
+                 if len(entity_text) > 2 and entity_text.lower() not in ['the', 'and', 'for', 'with']:
+                     extracted.append(entity_text)
+
+         # Remove duplicates while preserving order
+         unique_entities = []
+         seen = set()
+         for ent in extracted:
+             ent_lower = ent.lower()
+             if ent_lower not in seen:
+                 seen.add(ent_lower)
+                 unique_entities.append(ent)
+
+         return unique_entities
+
+     except Exception as e:
+         print(f"[NER] Extraction failed: {e}")
+         return []
+
+
+ # -------------------------------
+ # Rule-Based Intent Classification
+ # -------------------------------
+
+ def classify_intent(question: str) -> str:
+     """
+     Classify the user's query into a high-level intent based on keywords.
+
+     Parameters:
+     question (str): The user's question.
+
+     Returns:
+     str: One of ['description', 'before_using', 'proper_use', 'precautions', 'side_effects']
+     """
+     q = question.lower()
+
+     if re.search(r"\bwhat is\b|\bused for\b|\bdefine\b", q):
+         return "description"
+     elif re.search(r"\bbefore using\b|\bshould I tell\b|\bdoctor know\b", q):
+         return "before_using"
+     elif re.search(r"\bhow to\b|\bdosage\b|\btake\b|\binstructions\b", q):
+         return "proper_use"
+     elif re.search(r"\bprecaution\b|\bpregnan\b|\bbreastfeed\b|\brisk\b", q):
+         return "precautions"
+     elif re.search(r"\bside effect\b|\badverse\b|\bnausea\b|\bdizziness\b", q):
+         return "side_effects"
+     else:
+         return "description"  # default fallback
+
+
+ # -------------------------------
+ # Query Preprocessing Wrapper
+ # -------------------------------
+
+ def preprocess_query(raw_query: str) -> Tuple[str, List[str]]:
+     """
+     Main preprocessing function that extracts:
+     - Intent (rule-based keyword classification)
+     - Named entities (BioBERT NER)
+
+     Parameters:
+     raw_query (str): The raw user question.
+
+     Returns:
+     Tuple[str, List[str]]: (intent, list of entities)
+     """
+     try:
+         intent = classify_intent(raw_query)
+         entities = extract_entities_BERT(raw_query)
+
+         if not entities:
+             print("[NER fallback] No entities found. Using raw query.")
+             return (intent or ""), []
+
+         print(f"[Query Processed] Intent = {intent} | Entities = {entities}")
+         return (intent or ""), entities
+
+     except Exception as e:
+         print(f"[Preprocessing failed] {e}")
+         return "", []
+
+
+ # -------------------------------
+ # Optional: Test Function
+ # -------------------------------
+
+ if __name__ == "__main__":
+     """Test the NER with sample queries."""
+
+     test_queries = [
+         "What are the side effects of Azithromycin?",
+         "How much dosage of aspirin should I take for headache?",
+         "Can I take Lisinopril during pregnancy?",
+         "What is Metformin used for?",
+         "Are there interactions between Warfarin and Ibuprofen?",
+         "How should I store insulin?",
+     ]
+
+     print("\n" + "="*70)
+     print("TESTING LIGHTWEIGHT TRANSFORMER NER")
+     print("="*70 + "\n")
+
+     for i, query in enumerate(test_queries, 1):
+         print(f"[Test {i}] Query: {query}")
+         print("-" * 70)
+
+         intent, entities = preprocess_query(query)
+
+         print(f" Intent: {intent}")
+         print(f" Entities: {entities if entities else 'None detected'}")
+         print("-" * 70 + "\n")
+
+     print("="*70)
+     print("TESTING COMPLETE")
+     print("="*70)
Scripts/Retrieval.py ADDED
@@ -0,0 +1,190 @@
+ """
+ Retrieval and FAISS Embedding Module for Medical QA Chatbot
+ ============================================================
+
+ This module handles:
+ 1. Embedding documents
+ 2. Building and saving FAISS index
+ 3. Retrieval with initial FAISS search + reranking using BioBERT similarity
+ """
+
+ import faiss
+ import pandas as pd
+ import numpy as np
+ import torch
+ from sentence_transformers import SentenceTransformer, util
+ from sklearn.preprocessing import normalize
+ from Query_processing import preprocess_query
+ import os
+
+ # -------------------------------
+ # File Paths
+ # -------------------------------
+
+ # Get the directory of the current script
+ script_dir = os.path.dirname(os.path.abspath(__file__))
+
+ # Go up one level to project root, then into Data folder
+ PROJECT_ROOT = os.path.dirname(script_dir)  # Go up from Scripts/ to project root
+ DATA_FOLDER = os.path.join(PROJECT_ROOT, 'Data')
+
+ # Define all paths
+ csv_path = os.path.join(DATA_FOLDER, 'flattened_drug_dataset_cleaned.csv')
+ faiss_index_path = os.path.join(DATA_FOLDER, 'faiss_index.idx')
+ doc_metadata_path = os.path.join(DATA_FOLDER, 'doc_metadata.pkl')
+ doc_vectors_path = os.path.join(DATA_FOLDER, 'doc_vectors.npy')
+
+ # Load the dataset
+ df = pd.read_csv(csv_path).dropna(subset=['chunk_text'])
+
+ # -------------------------------
+ # Model Initialization
+ # -------------------------------
+
+ fast_embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
+ biobert = SentenceTransformer('pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb')
+
+ # -------------------------------
+ # Function: Embed and Build FAISS Index
+ # -------------------------------
+
+ def Embed_and_FAISS():
+     """
+     Embeds the drug dataset and builds a FAISS index for fast retrieval.
+     Saves the index, metadata, and document vectors to disk.
+     """
+     print("Embedding document chunks using fast embedder...")
+
+     # Build full context strings
+     df['full_text'] = df.apply(lambda x: f"{x['drug_name']} | {x['section']} > {x['subsection']} | {x['chunk_text']}", axis=1)
+
+     full_texts = df['full_text'].tolist()
+     doc_embeddings = fast_embedder.encode(full_texts, convert_to_numpy=True, show_progress_bar=True)
+
+     # Normalize embeddings and build index
+     doc_embeddings = normalize(doc_embeddings, axis=1, norm='l2')
+     dimension = doc_embeddings.shape[1]
+     index = faiss.IndexFlatIP(dimension)
+     index.add(doc_embeddings)
+
+     # Save index and metadata
+     faiss.write_index(index, faiss_index_path)
+     df.to_pickle(doc_metadata_path)
+     np.save(doc_vectors_path, doc_embeddings)
+
+     print("FAISS index built and saved successfully.")
+
+ # -------------------------------
+ # Function: Retrieve with Context and Averaged Embeddings
+ # -------------------------------
+
+ def retrieve_with_context_averagedembeddings(query, top_k=10, predicted_intent=None, detected_entities=None, alpha=0.8):
+     """
+     Retrieve top chunks using FAISS followed by reranking with BioBERT similarity.
+
+     Parameters:
+     query (str): User query text.
+     top_k (int): Number of top results to retrieve.
+     predicted_intent (str, optional): Detected intent to adjust retrieval.
+     detected_entities (list, optional): List of named entities.
+     alpha (float): Weight for combining query and intent embeddings.
+
+     Returns:
+     pd.DataFrame: Retrieved chunks with metadata and reranked scores.
+     """
+     print(f"[Retrieval Pipeline Started] Query: {query}")
+
+     # Embed and normalize the query (the index stores L2-normalized vectors,
+     # so the query is normalized too, keeping inner products on the cosine scale)
+     query_vec = fast_embedder.encode([query], convert_to_numpy=True)
+     query_vec = normalize(query_vec, axis=1)
+
+     if predicted_intent:
+         intent_vec = fast_embedder.encode([predicted_intent], convert_to_numpy=True)
+         query_vec = normalize((alpha * query_vec + (1 - alpha) * intent_vec), axis=1)
+
+     # Load FAISS index and search
+     index = faiss.read_index(faiss_index_path)
+     D, I = index.search(query_vec, top_k)
+
+     df_meta = pd.read_pickle(doc_metadata_path)
+     retrieved_df = df_meta.iloc[I[0]].copy()  # FAISS ids are positional, so use iloc
+     retrieved_df['faiss_score'] = D[0]
+
+     # BioBERT reranking
+     query_emb = biobert.encode(query, convert_to_tensor=True)
+     chunk_embs = biobert.encode(retrieved_df['full_text'].tolist(), convert_to_tensor=True)
+     cos_scores = util.pytorch_cos_sim(query_emb, chunk_embs)[0]
+     reranked_idx = torch.argsort(cos_scores, descending=True)
+
+     # Boost scores based on intent, subsection match, or entity presence
+     results = []
+     for idx in reranked_idx:
+         idx = int(idx)
+         row = retrieved_df.iloc[idx]
+         score = cos_scores[idx].item()
+
+         section = row['section'][0] if isinstance(row['section'], tuple) else row['section']
+         subsection = row['subsection'][0] if isinstance(row['subsection'], tuple) else row['subsection']
+         if isinstance(predicted_intent, tuple):
+             predicted_intent = predicted_intent[0]
+
+         if predicted_intent and section.strip().lower() == predicted_intent.strip().lower():
+             score += 0.05
+         if predicted_intent and predicted_intent.lower() in subsection.strip().lower():
+             score += 0.03
+         if detected_entities:
+             if any(ent.lower() in row['chunk_text'].lower() for ent in detected_entities):
+                 score += 0.1
+
+         results.append({
+             'chunk_id': row['chunk_id'],
+             'drug_name': row['drug_name'],
+             'section': row['section'],
+             'subsection': row['subsection'],
+             'chunk_text': row['chunk_text'],
+             'faiss_score': row['faiss_score'],
+             'semantic_similarity_score': score
+         })
+
+     return pd.DataFrame(results)
+
+ # -------------------------------
+ # Function: Retrieval Wrapper
+ # -------------------------------
+
+ def Retrieval_averagedQP(raw_query, intent, entities, top_k=10, alpha=0.8):
+     """
+     Wrapper to retrieve top-k chunks given a raw user query.
+
+     Parameters:
+     raw_query (str): The user query.
+     intent (str): Predicted intent from query processing.
+     entities (list): Detected biomedical entities.
+     top_k (int): Number of top results to return.
+     alpha (float): Weighting between query and intent embeddings.
+
+     Returns:
+     pd.DataFrame: Top retrieved chunks with scores.
+     """
+     results_df = retrieve_with_context_averagedembeddings(
+         raw_query,
+         top_k=top_k,
+         predicted_intent=intent,
+         detected_entities=entities,
+         alpha=alpha
+     )
+     return results_df[['chunk_id', 'drug_name', 'section', 'subsection', 'chunk_text', 'faiss_score', 'semantic_similarity_score']]
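+
+ # -------------------------------
+ # Optional: Build Index / Usage Sketch
+ # -------------------------------
+
+ if __name__ == "__main__":
+     # Build the index on first run (README step 4), then run a small retrieval
+     # smoke test; assumes the cleaned CSV is present in Data/.
+     if not os.path.exists(faiss_index_path):
+         Embed_and_FAISS()
+     q = "What are the side effects of Aspirin?"
+     intent, entities = preprocess_query(q)
+     print(Retrieval_averagedQP(q, intent, entities, top_k=5, alpha=0.8).head(3))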
Scripts/app.py ADDED
@@ -0,0 +1,236 @@
+ """
+ Medical Drug QA Chatbot - Gradio Interface
+ Optimized for Hugging Face Spaces Deployment
+ """
+
+ import gradio as gr
+ import os
+ import sys
+
+ # Make the sibling modules in Scripts/ importable
+ current_dir = os.path.dirname(os.path.abspath(__file__))
+ sys.path.insert(0, current_dir)
+
+ # Lazy imports - only load when needed
+ _query_processor = None
+ _retrieval_system = None
+ _answer_generator = None
+
+ def initialize_models():
+     """Lazy loading of models to speed up startup."""
+     global _query_processor, _retrieval_system, _answer_generator
+
+     if _query_processor is None:
+         print("[App] Loading query processor...")
+         from Query_processing import preprocess_query
+         _query_processor = preprocess_query
+
+     if _retrieval_system is None:
+         print("[App] Loading retrieval system...")
+         from Retrieval import Retrieval_averagedQP
+         _retrieval_system = Retrieval_averagedQP
+
+     if _answer_generator is None:
+         print("[App] Loading answer generator...")
+         from Answer_Generation import answer_generation
+         _answer_generator = answer_generation
+
+     return _query_processor, _retrieval_system, _answer_generator
+
+
+ def chat_agent(message: str, history: list) -> tuple:
+     """
+     Main chat function with error handling and loading states.
+
+     Parameters:
+     message (str): User's question
+     history (list): Chat history
+
+     Returns:
+     tuple: (empty string, updated history)
+     """
+     if not message or message.strip() == "":
+         return "", history
+
+     try:
+         # Initialize models
+         preprocess_query, Retrieval_averagedQP, answer_generation = initialize_models()
+
+         # Step 1: Query Processing
+         print(f"[Chat] Processing query: {message}")
+         intent, entities = preprocess_query(message)
+
+         # Step 2: Retrieval
+         print(f"[Chat] Retrieving relevant chunks...")
+         chunks = Retrieval_averagedQP(message, intent, entities, top_k=10, alpha=0.8)
+
+         if chunks.empty:
+             error_msg = "⚠️ Sorry, I couldn't find relevant information in the database. Please try rephrasing your question."
+             history.append({"role": "user", "content": message})
+             history.append({"role": "assistant", "content": error_msg})
+             return "", history
+
+         # Step 3: Answer Generation
+         print(f"[Chat] Generating answer...")
+         answer = answer_generation(message, chunks, top_k=3)
+
+         # Format context for display
+         context = "\n\n".join([
+             f"**{row['drug_name']} | {row['section']} > {row['subsection']}**\n"
+             f"{row['chunk_text'][:200]}{'...' if len(row['chunk_text']) > 200 else ''}\n"
+             f"*Relevance Score: {round(row['semantic_similarity_score'], 3)}*"
+             for i, row in chunks.head(3).iterrows()
+         ])
+
+         # Add to history
+         history.append({"role": "user", "content": message})
+         history.append({"role": "assistant", "content": answer})
+         history.append({
+             "role": "assistant",
+             "content": f"<details><summary>📚 View Source Chunks</summary>\n\n{context}\n\n</details>"
+         })
+
+         print(f"[Chat] ✓ Response generated successfully")
+         return "", history
+
+     except Exception as e:
+         print(f"[Chat] ERROR: {e}")
+         import traceback
+         traceback.print_exc()
+
+         error_msg = f"❌ An error occurred: {str(e)}\n\nPlease try again or rephrase your question."
+         history.append({"role": "user", "content": message})
+         history.append({"role": "assistant", "content": error_msg})
+         return "", history
+
+
+ # Build Gradio Interface
+ with gr.Blocks(
+     theme=gr.themes.Soft(primary_hue="cyan"),
+     title="Medical Drug QA Chatbot",
+     css="""
+     .info-container, .info-footer {
+         width: 90%;
+         max-width: 1000px;
+         margin: 0 auto;
+     }
+     details.info-section, details.about-section {
+         background: white;
+         border-radius: 12px;
+         box-shadow: 0 2px 8px rgba(0,0,0,0.1);
+         margin: 1em 0;
+         padding: 0;
+     }
+     details > summary {
+         padding: 1em 1.5em;
+         font-size: 1.1em;
+         font-weight: bold;
+         color: #00838f;
+         cursor: pointer;
+         border-radius: 12px;
+         transition: background-color 0.3s ease;
+     }
+     details > summary:hover {
+         background-color: #e0f7fa;
+     }
+     .disclaimer {
+         background: #fff3cd;
+         border: 1px solid #ffc107;
+         border-radius: 8px;
+         padding: 1em;
+         margin: 1em 0;
+     }
+     """
+ ) as demo:
+
+     # Header
+     gr.Markdown("# 💊 Medical Drug QA Chatbot")
+     gr.Markdown("_Ask questions about medications and get reliable answers from trusted medical sources._")
+
+     # Instructions
+     with gr.Accordion("🤔 How to Use", open=False):
+         gr.Markdown("""
+         Simply type your question about any medication. You can ask about:
+         - **Side effects** and warnings
+         - **Dosage** and usage instructions
+         - **Drug interactions**
+         - **Storage** and handling
+         - **Precautions** for specific conditions
+
+         ### 💡 Example Questions:
+         - "What are the common side effects of Aspirin?"
+         - "How should I store Insulin?"
+         - "What precautions should I take with Lisinopril?"
+         - "Can I drink alcohol while taking Metformin?"
+         """)
+
+     # Chatbot
+     chatbot = gr.Chatbot(
+         type="messages",
+         height=500,
+         label="Chat",
+         show_label=False,
+         avatar_images=(None, "🤖")
+     )
+
+     # Input
+     with gr.Row():
+         msg = gr.Textbox(
+             placeholder="Ask your medical question here...",
+             scale=9,
+             container=False,
+             show_label=False
+         )
+         submit = gr.Button("Send", scale=1, variant="primary")
+
+     with gr.Row():
+         clear = gr.Button("🗑️ Clear Chat", scale=1)
+
+     # Event handlers
+     msg.submit(
+         fn=chat_agent,
+         inputs=[msg, chatbot],
+         outputs=[msg, chatbot],
+     )
+
+     submit.click(
+         fn=chat_agent,
+         inputs=[msg, chatbot],
+         outputs=[msg, chatbot],
+     )
+
+     clear.click(
+         fn=lambda: (None, []),
+         inputs=None,
+         outputs=[msg, chatbot],
+     )
+
+     # About section
+     with gr.Accordion("📚 About This Project", open=False):
+         gr.Markdown("""
+         This Medical Drug QA system uses advanced NLP technologies:
+
+         - **Data Source**: Mayo Clinic's comprehensive drug database
+         - **NER**: BioBERT for chemical/drug entity recognition
+         - **Retrieval**: Hybrid system with MiniLM-V6 + BioBERT reranking
+         - **Answer Generation**: Llama-4 via Groq API
+
+         **Technologies**: Transformers, FAISS, Sentence-BERT, Gradio
+         """)
+
+     # Disclaimer
+     gr.HTML("""
+     <div class="disclaimer">
+         <strong>⚠️ Medical Disclaimer</strong>: This chatbot provides educational information only.
+         It should NOT be used as a substitute for professional medical advice, diagnosis, or treatment.
+         Always consult a qualified healthcare provider for medical decisions.
+     </div>
+     """)
+
+ # Launch
+ if __name__ == "__main__":
+     demo.queue()  # Enable queuing for better performance
+     demo.launch(
+         share=False,  # Set to False for HF Spaces
+         show_error=True
+     )
Scripts/demo.py ADDED
@@ -0,0 +1,53 @@
+ """
+ Main Execution Script for Retrieval-based Medical QA Chatbot
+ ============================================================
+
+ This script handles:
+ 1. Query preprocessing
+ 2. Information retrieval
+ 3. Answer generation
+ """
+
+ import warnings
+ warnings.filterwarnings("ignore", category=UserWarning)
+
+ from Query_processing import preprocess_query
+ from Retrieval import Retrieval_averagedQP
+ from Answer_Generation import answer_generation
+ from Retrieval import Embed_and_FAISS
+
+ # -------------------------------
+ # Optional: Embed and Store FAISS Index
+ # -------------------------------
+ # Uncomment the line below to generate embeddings and build the FAISS index if not already done.
+ # Embed_and_FAISS()
+
+ # -------------------------------
+ # Define User Question
+ # -------------------------------
+
+ Question = "how much dosage of ibuprofen should be taken for treatment of fever?"
+
+ # -------------------------------
+ # Step 1: Query Preprocessing
+ # -------------------------------
+
+ intent, entities = preprocess_query(Question)
+
+ # -------------------------------
+ # Step 2: Retrieve Relevant Chunks
+ # -------------------------------
+
+ top_chunks = Retrieval_averagedQP(Question, intent, entities, top_k=10, alpha=0.8)
+
+ # -------------------------------
+ # Step 3: Answer Generation
+ # -------------------------------
+
+ Generated_answer = answer_generation(Question, top_chunks, top_k=3)
+
+ # -------------------------------
+ # Display Generated Answer
+ # -------------------------------
+
+ print("Generated Answer:", Generated_answer)
requirements.txt ADDED
@@ -0,0 +1,22 @@
+ # Web Framework
+ gradio>=4.0.0
+
+ # Data Processing
+ pandas>=2.0.0
+ numpy>=1.24.0
+
+ # NLP & ML
+ torch>=2.0.0
+ transformers>=4.35.0
+ sentence-transformers>=2.2.0
+ scikit-learn>=1.3.0
+
+ # Vector Search
+ faiss-cpu>=1.7.4
+
+ # API Client
+ openai>=1.0.0
+
+ # Optional Performance
+ accelerate>=0.24.0
+ sentencepiece>=0.1.99