"""Streamlit app: ask questions about an uploaded PDF (retrieval + extractive QA).

Pipeline: extract the PDF text, split it into sentence-based chunks, embed the
chunks with a SentenceTransformer, retrieve the chunks nearest to the question
with a FAISS L2 index, and run an extractive question-answering model over the
retrieved context.
"""

import io

import faiss
import numpy as np
import streamlit as st
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
from transformers import pipeline

EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
QA_MODEL_NAME = "distilbert-base-uncased-distilled-squad"


def _extract_text(reader):
    """Return the text of every page of *reader*, one page per line group."""
    # extract_text() can yield nothing for image-only pages; treat that as ''.
    # Pages are joined with a newline so the last word of one page does not
    # fuse with the first word of the next.
    return "\n".join((page.extract_text() or '') for page in reader.pages)


# Load PDF and extract text
@st.cache_data
def load_pdf_text(pdf_path):
    """Extract the text of the PDF stored at *pdf_path*.

    The result is cached by Streamlit keyed on the value of ``pdf_path``, so
    a file whose contents change while its path stays the same will return
    the previously cached text. For uploads, use the bytes-keyed helper
    ``_load_pdf_text_from_bytes`` instead.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The concatenated text of all pages.
    """
    return _extract_text(PdfReader(pdf_path))


@st.cache_data
def _load_pdf_text_from_bytes(pdf_bytes):
    """Extract text from in-memory PDF bytes; cached on the content itself."""
    return _extract_text(PdfReader(io.BytesIO(pdf_bytes)))


# Split text into chunks
def chunk_text(text, max_len=500):
    """Split *text* into chunks of whole sentences of at most *max_len* chars.

    Sentences are delimited by ``'. '``. Consecutive sentences are packed
    into one chunk, separated by a single space, while the chunk stays within
    ``max_len``. A single sentence longer than ``max_len`` is kept intact as
    its own (oversized) chunk rather than being cut mid-sentence.

    Args:
        text: The text to split.
        max_len: Maximum chunk length in characters.

    Returns:
        A list of non-empty chunk strings; empty when *text* has no content.
    """
    pieces = text.split('. ')
    # Every piece except the last lost its terminating period to the split.
    sentences = [piece.strip() + '.' for piece in pieces[:-1] if piece.strip()]
    tail = pieces[-1].strip()
    if tail:
        sentences.append(tail)

    chunks, current = [], ''
    for sentence in sentences:
        candidate = f"{current} {sentence}" if current else sentence
        if len(candidate) <= max_len:
            current = candidate
        else:
            if current:
                chunks.append(current)
            current = sentence
    if current:
        chunks.append(current)
    return chunks


@st.cache_resource
def _load_embedder():
    """Load the sentence-embedding model once and share it across reruns."""
    return SentenceTransformer(EMBEDDING_MODEL_NAME)


@st.cache_resource
def _load_qa_pipeline():
    """Load the extractive QA pipeline once and share it across reruns."""
    return pipeline("question-answering", model=QA_MODEL_NAME)


# Embed text using SentenceTransformer
@st.cache_resource
def embed_chunks(chunks):
    """Embed *chunks* and return ``(embeddings, model)``.

    Args:
        chunks: List of text chunks to embed.

    Returns:
        A tuple of the embeddings produced by ``model.encode(chunks)`` and
        the SentenceTransformer model that produced them (needed later to
        embed the query in the same vector space).
    """
    model = _load_embedder()
    embeddings = model.encode(chunks)
    return embeddings, model


# RAG-style QA using FAISS and Transformers
def answer_query(query, embeddings, chunks, model, qa_pipeline, top_k=3):
    """Answer *query* from the chunks most similar to it.

    Args:
        query: The user's question.
        embeddings: Chunk embeddings, one row per entry of *chunks*.
        chunks: The text chunks that were embedded.
        model: Object whose ``encode`` embeds the query like the chunks.
        qa_pipeline: Callable taking ``question=`` and ``context=`` and
            returning a mapping with an ``'answer'`` key.
        top_k: Number of nearest chunks to use as context.

    Returns:
        The answer string produced by *qa_pipeline*.

    Raises:
        ValueError: If *chunks* is empty, since there is nothing to search.
    """
    if not chunks:
        raise ValueError("No text chunks available to search.")

    # FAISS expects C-contiguous float32 matrices.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    query_vector = np.ascontiguousarray(model.encode([query]), dtype=np.float32)

    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)

    # Never ask for more neighbours than exist: FAISS pads missing results
    # with -1, which would silently index the last chunk.
    k = max(1, min(top_k, len(chunks)))
    _, neighbours = index.search(query_vector, k=k)
    context = "\n".join(chunks[i] for i in neighbours[0] if i >= 0)

    result = qa_pipeline(question=query, context=context)
    return result['answer']


# Streamlit UI
st.title("📄 PDF QA with RAG")
uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])

if uploaded_file:
    # getvalue() returns the full content regardless of the stream position,
    # and keying the cache on the bytes means a new upload is never answered
    # from a previous document's cached text.
    raw_text = _load_pdf_text_from_bytes(uploaded_file.getvalue())
    chunks = chunk_text(raw_text)

    if not chunks:
        st.warning("No extractable text was found in this PDF.")
    else:
        embeddings, embedder = embed_chunks(chunks)
        qa = _load_qa_pipeline()

        query = st.text_input("Ask a question about the PDF:")
        if query and query.strip():
            answer = answer_query(query, embeddings, chunks, embedder, qa)
            st.success(f"Answer: {answer}")