|
|
import streamlit as st |
|
|
from PyPDF2 import PdfReader |
|
|
from sentence_transformers import SentenceTransformer |
|
|
from transformers import pipeline |
|
|
import faiss |
|
|
import numpy as np |
|
|
|
|
|
|
|
|
@st.cache_data
def load_pdf_text(pdf_path):
    """Extract and concatenate the text of every page in a PDF.

    Args:
        pdf_path: Filesystem path to the PDF file.

    Returns:
        A single string with the extracted text of all pages, in page order.
        Pages that yield no text (e.g. scanned/image-only pages) contribute
        an empty string instead of crashing.
    """
    reader = PdfReader(pdf_path)
    # extract_text() may return None for pages without a text layer;
    # `or ''` prevents a TypeError when concatenating.
    return ''.join(page.extract_text() or '' for page in reader.pages)
|
|
|
|
|
|
|
|
def chunk_text(text, max_len=500): |
|
|
sentences = text.split('. ') |
|
|
chunks, chunk = [], '' |
|
|
for sentence in sentences: |
|
|
if len(chunk) + len(sentence) <= max_len: |
|
|
chunk += sentence + '. ' |
|
|
else: |
|
|
chunks.append(chunk.strip()) |
|
|
chunk = sentence + '. ' |
|
|
chunks.append(chunk.strip()) |
|
|
return chunks |
|
|
|
|
|
|
|
|
@st.cache_resource
def embed_chunks(chunks):
    """Embed text chunks with a MiniLM sentence encoder.

    Args:
        chunks: List of chunk strings to embed.

    Returns:
        Tuple of (embedding matrix, the loaded SentenceTransformer), so the
        same encoder can be reused for query embedding.
    """
    encoder = SentenceTransformer('all-MiniLM-L6-v2')
    vectors = encoder.encode(chunks)
    return vectors, encoder
|
|
|
|
|
|
|
|
def answer_query(query, embeddings, chunks, model, qa_pipeline):
    """Answer a question using retrieval-augmented extractive QA.

    Embeds the query, retrieves the top-k most similar chunks via a FAISS
    L2 index, and runs an extractive QA model over the concatenated context.

    Args:
        query: The user's natural-language question.
        embeddings: 2-D array of chunk embeddings (one row per chunk).
        chunks: The chunk strings aligned with ``embeddings`` rows.
        model: The SentenceTransformer used to embed the query.
        qa_pipeline: A Hugging Face question-answering pipeline.

    Returns:
        The extracted answer string.
    """
    query_embedding = model.encode([query])
    # NOTE(review): the index is rebuilt on every call; fine for small docs,
    # but worth hoisting to the caller if latency matters.
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(np.array(embeddings))
    # Clamp k so FAISS never pads results with -1 ids when there are
    # fewer than 3 chunks (chunks[-1] would silently duplicate context).
    k = min(3, len(chunks))
    _, I = index.search(np.array(query_embedding), k=k)
    context = "\n".join(chunks[i] for i in I[0] if i >= 0)
    result = qa_pipeline(question=query, context=context)
    return result['answer']
|
|
|
|
|
|
|
|
st.title("π PDF QA with RAG")

uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])


@st.cache_resource
def _load_qa_pipeline():
    """Load the extractive QA model once per session (Streamlit re-runs the
    whole script on every interaction; without caching the model would be
    reloaded on each keystroke)."""
    return pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")


if uploaded_file:
    # Persist the upload so PdfReader can open it by path.
    with open("document.pdf", "wb") as f:
        f.write(uploaded_file.read())

    raw_text = load_pdf_text("document.pdf")
    chunks = chunk_text(raw_text)
    embeddings, embedder = embed_chunks(chunks)
    qa = _load_qa_pipeline()

    query = st.text_input("Ask a question about the PDF:")
    if query:
        answer = answer_query(query, embeddings, chunks, embedder, qa)
        st.success(f"Answer: {answer}")
|
|
|