dataset / app.py
rahideer's picture
Create app.py
1506f8e verified
raw
history blame
2.01 kB
import hashlib
import os
import tempfile

import faiss
import numpy as np
import streamlit as st
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
from transformers import pipeline
# Load PDF and extract text
@st.cache_data
def load_pdf_text(pdf_path):
    """Return the text of every page of the PDF at *pdf_path*, one page per line group.

    The result is memoised by ``st.cache_data``, keyed on ``pdf_path``.

    Args:
        pdf_path: Path (or any value ``PdfReader`` accepts) of the PDF to read.

    Returns:
        A single string with the extracted text of all pages joined by
        newlines. Pages that yield no text contribute an empty string.
    """
    reader = PdfReader(pdf_path)
    # NOTE(review): depending on the PyPDF2 version, extract_text() may give
    # None or '' for pages without a text layer (e.g. scanned images), so
    # fall back to '' instead of letting string concatenation fail.
    page_texts = [page.extract_text() or '' for page in reader.pages]
    # Join with a newline so the last word of one page is not fused with the
    # first word of the next page.
    return '\n'.join(page_texts)
# Split text into chunks
def chunk_text(text, max_len=500):
    """Split *text* into chunks of whole sentences, each at most *max_len* chars.

    Sentences are delimited by ``'. '``. Consecutive sentences are packed
    into the same chunk (separated by a single space) while the chunk stays
    within ``max_len`` characters. A single sentence longer than ``max_len``
    is kept intact as its own oversized chunk rather than being cut.

    Args:
        text: The text to split.
        max_len: Maximum chunk length in characters (default 500).

    Returns:
        A list of non-empty chunk strings; ``[]`` when *text* contains no
        sentence content.
    """
    pieces = text.split('. ')
    last_index = len(pieces) - 1

    sentences = []
    for position, piece in enumerate(pieces):
        piece = piece.strip()
        if not piece:
            continue
        # split('. ') removed the period after every piece except the last
        # one; restore it only there so text that already ends in '.' does
        # not come out as '..'.
        sentences.append(piece + '.' if position < last_index else piece)

    chunks = []
    current = ''
    for sentence in sentences:
        candidate = f'{current} {sentence}' if current else sentence
        if not current or len(candidate) <= max_len:
            # Either the chunk is still empty (an oversized sentence must go
            # somewhere) or the sentence fits alongside what is already there.
            current = candidate
        else:
            chunks.append(current)
            current = sentence
    if current:
        chunks.append(current)
    return chunks
# Embed text using SentenceTransformer
@st.cache_resource
def _load_embedding_model():
    """Instantiate the 'all-MiniLM-L6-v2' sentence-embedding model (cached)."""
    return SentenceTransformer('all-MiniLM-L6-v2')


@st.cache_resource
def embed_chunks(chunks):
    """Encode *chunks* into embedding vectors.

    The result is cached by ``st.cache_resource`` keyed on ``chunks``. The
    model itself is obtained from a separately cached loader so that a new
    set of chunks does not instantiate another copy of the model.

    Args:
        chunks: List of text chunks to embed.

    Returns:
        A tuple ``(embeddings, model)``: the output of ``model.encode(chunks)``
        and the model that produced it (callers reuse it to embed queries).
    """
    model = _load_embedding_model()
    embeddings = model.encode(chunks)
    return embeddings, model
# RAG-style QA using FAISS and Transformers
def answer_query(query, embeddings, chunks, model, qa_pipeline):
    """Answer *query* from the chunks most similar to it.

    Builds a flat L2 FAISS index over ``embeddings``, retrieves up to three
    nearest chunks for the embedded query, joins them with newlines as the
    context, and runs ``qa_pipeline`` on that context.

    Args:
        query: The question text.
        embeddings: Chunk embeddings, one row per entry of ``chunks``.
        chunks: The text chunks the embeddings were computed from.
        model: Object whose ``encode`` method embeds a list of strings.
        qa_pipeline: Callable taking ``question=`` and ``context=`` and
            returning a mapping with an ``'answer'`` key.

    Returns:
        The ``'answer'`` value produced by ``qa_pipeline``, or ``''`` when
        there are no chunks to search.
    """
    if len(chunks) == 0:
        # Nothing to retrieve from; an empty index has no dimensionality.
        return ''

    # FAISS expects contiguous float32 matrices.
    chunk_vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    query_vector = np.ascontiguousarray(model.encode([query]), dtype=np.float32)

    index = faiss.IndexFlatL2(chunk_vectors.shape[1])
    index.add(chunk_vectors)

    # Never ask for more neighbours than exist: FAISS pads missing results
    # with -1, which as a Python index would silently pick the last chunk.
    top_k = min(3, index.ntotal, len(chunks))
    _, neighbour_ids = index.search(query_vector, k=top_k)

    context = "\n".join(
        chunks[i] for i in neighbour_ids[0] if 0 <= i < len(chunks)
    )
    result = qa_pipeline(question=query, context=context)
    return result['answer']
# Streamlit UI
@st.cache_resource
def load_qa_pipeline():
    """Build the extractive question-answering pipeline once and reuse it.

    Streamlit re-executes this script on every interaction; caching avoids
    reconstructing the pipeline each time.
    """
    return pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")


st.title("📄 PDF QA with RAG")
uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])
if uploaded_file:
    # getvalue() returns the full upload regardless of the stream position.
    pdf_bytes = uploaded_file.getvalue()

    # load_pdf_text is cached by path, so give each distinct upload its own
    # content-addressed path; a fixed file name would keep serving the text
    # of the first PDF after a different one is uploaded.
    digest = hashlib.sha256(pdf_bytes).hexdigest()
    pdf_path = os.path.join(tempfile.gettempdir(), f"document_{digest}.pdf")
    if not os.path.exists(pdf_path):
        with open(pdf_path, "wb") as f:
            f.write(pdf_bytes)

    raw_text = load_pdf_text(pdf_path)
    if not raw_text.strip():
        # E.g. a scanned PDF with no text layer: there is nothing to index.
        st.warning("No extractable text was found in this PDF.")
    else:
        chunks = chunk_text(raw_text)
        embeddings, embedder = embed_chunks(chunks)
        qa = load_qa_pipeline()
        query = st.text_input("Ask a question about the PDF:")
        if query:
            answer = answer_query(query, embeddings, chunks, embedder, qa)
            st.success(f"Answer: {answer}")