import hashlib
import tempfile
from pathlib import Path

import streamlit as st
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import faiss
import numpy as np

# Load PDF and extract text
@st.cache_data
def load_pdf_text(pdf_path):
    """Extract the text of every page of a PDF and return it as one string.

    The result is cached by Streamlit keyed on the value of ``pdf_path``:
    if the file behind an unchanged path is replaced, the previously cached
    text is returned. Callers should use a distinct path per document.

    Args:
        pdf_path: Location of the PDF, passed straight to ``PdfReader``.

    Returns:
        The text of all pages joined with newlines. Pages that yield no
        text contribute an empty string.
    """
    reader = PdfReader(pdf_path)
    # `or ''` guards against pages for which no text comes back, and the
    # newline separator keeps the last word of one page from fusing with
    # the first word of the next.
    return '\n'.join(page.extract_text() or '' for page in reader.pages)

# Split text into chunks
def chunk_text(text, max_len=500):
    """Group the sentences of ``text`` into chunks of roughly ``max_len`` chars.

    Sentences are delimited by ``'. '``. Consecutive sentences are packed
    into the current chunk until adding the next one would push it past
    ``max_len``; that sentence then starts a new chunk. A single sentence
    longer than ``max_len`` is kept whole as its own (oversized) chunk.

    Args:
        text: The text to split.
        max_len: Soft upper bound on chunk length, in characters.

    Returns:
        A list of non-empty, whitespace-stripped chunks. Empty or
        whitespace-only input yields an empty list.
    """
    sentences = text.split('. ')
    last_index = len(sentences) - 1
    chunks = []
    current = ''
    for position, sentence in enumerate(sentences):
        # Restore the separator consumed by split(), except after the final
        # piece, which never had one (avoids a spurious trailing '. ').
        piece = sentence if position == last_index else sentence + '. '
        # Only flush a non-empty chunk, so an oversized first sentence does
        # not produce an empty leading chunk.
        if current and len(current) + len(sentence) > max_len:
            chunks.append(current.strip())
            current = piece
        else:
            current += piece
    tail = current.strip()
    if tail:
        chunks.append(tail)
    return chunks

# Embed text using SentenceTransformer
@st.cache_resource
def _load_sentence_model(model_name='all-MiniLM-L6-v2'):
    """Load the SentenceTransformer once so it is shared across documents."""
    return SentenceTransformer(model_name)


@st.cache_resource
def embed_chunks(chunks):
    """Encode text chunks into embedding vectors.

    The model is obtained from a separately cached loader, so embedding a
    new list of chunks (a cache miss here) does not reload the model.

    Args:
        chunks: List of text strings to embed.

    Returns:
        A ``(embeddings, model)`` tuple: the output of ``model.encode`` for
        ``chunks`` and the model that produced it, so callers can encode
        queries with the same model.
    """
    model = _load_sentence_model()
    embeddings = model.encode(chunks)
    return embeddings, model

# RAG-style QA using FAISS and Transformers
def answer_query(query, embeddings, chunks, model, qa_pipeline, top_k=3):
    """Answer ``query`` from the chunks most similar to it.

    The query is embedded with ``model``, the nearest chunk embeddings are
    found with an exact L2 FAISS index, and the matching chunks are joined
    into the context handed to ``qa_pipeline``.

    Args:
        query: The question text.
        embeddings: One embedding row per entry of ``chunks``.
        chunks: The text chunks the embeddings were computed from.
        model: Object with an ``encode(list_of_str)`` method used to embed
            the query; should be the model that produced ``embeddings``.
        qa_pipeline: Callable accepting ``question=`` and ``context=`` and
            returning a mapping with an ``'answer'`` key.
        top_k: Maximum number of chunks to retrieve as context.

    Returns:
        The ``'answer'`` value from the pipeline result.

    Raises:
        ValueError: If ``chunks`` is empty or ``top_k`` is less than 1.
    """
    if not chunks:
        raise ValueError("no text chunks available to search")
    if top_k < 1:
        raise ValueError("top_k must be at least 1")

    # FAISS expects C-contiguous float32 matrices.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    query_vector = np.ascontiguousarray(model.encode([query]), dtype=np.float32)

    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)

    # Never ask for more neighbours than exist: FAISS pads missing results
    # with -1, which would silently index the last chunk.
    k = min(top_k, len(chunks))
    _, neighbours = index.search(query_vector, k)
    retrieved = [chunks[i] for i in neighbours[0] if i >= 0]

    context = "\n".join(retrieved)
    result = qa_pipeline(question=query, context=context)
    return result['answer']

# Streamlit UI
@st.cache_resource
def _load_qa_pipeline():
    """Build the extractive QA pipeline once instead of on every rerun."""
    return pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")


st.title("📄 PDF QA with RAG")
uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])

if uploaded_file:
    # getvalue() returns the whole payload regardless of the stream position.
    pdf_bytes = uploaded_file.getvalue()

    # Store the upload under a content-derived name. A fixed file name would
    # make the path-keyed text cache return the previous document's text and
    # let concurrent sessions overwrite each other's file.
    digest = hashlib.sha256(pdf_bytes).hexdigest()
    pdf_path = Path(tempfile.gettempdir()) / f"pdfqa_{digest}.pdf"
    pdf_path.write_bytes(pdf_bytes)

    raw_text = load_pdf_text(str(pdf_path))
    # Drop empty chunks so nothing meaningless is embedded or retrieved.
    chunks = [chunk for chunk in chunk_text(raw_text) if chunk]

    if not chunks:
        # E.g. a scanned PDF with no text layer: nothing to search.
        st.warning("No extractable text was found in this PDF.")
    else:
        embeddings, embedder = embed_chunks(chunks)
        qa = _load_qa_pipeline()

        query = st.text_input("Ask a question about the PDF:")
        if query:
            answer = answer_query(query, embeddings, chunks, embedder, qa)
            st.success(f"Answer: {answer}")