|
|
import streamlit as st |
|
|
from PyPDF2 import PdfReader |
|
|
from sentence_transformers import SentenceTransformer |
|
|
from transformers import pipeline |
|
|
import faiss |
|
|
import numpy as np |
|
|
|
|
|
|
|
|
@st.cache_data
def load_pdf_text(pdf_path):
    """Extract and concatenate the text of every page in a PDF.

    Args:
        pdf_path: Filesystem path to the PDF file.

    Returns:
        A single string with the extracted text of all pages, in page order.
        Pages that yield no text (e.g. scanned/image-only pages) contribute
        an empty string instead of crashing.
    """
    reader = PdfReader(pdf_path)
    # extract_text() may return None for pages without a text layer;
    # `or ''` prevents a TypeError when concatenating.
    return ''.join(page.extract_text() or '' for page in reader.pages)
|
|
|
|
|
|
|
|
def chunk_text(text, max_len=500): |
|
|
sentences = text.split('. ') |
|
|
chunks, chunk = [], '' |
|
|
for sentence in sentences: |
|
|
if len(chunk) + len(sentence) <= max_len: |
|
|
chunk += sentence + '. ' |
|
|
else: |
|
|
chunks.append(chunk.strip()) |
|
|
chunk = sentence + '. ' |
|
|
chunks.append(chunk.strip()) |
|
|
return chunks |
|
|
|
|
|
|
|
|
@st.cache_resource
def embed_chunks(chunks):
    """Embed text chunks with a MiniLM sentence encoder.

    Args:
        chunks: List of chunk strings to embed.

    Returns:
        Tuple of (embedding matrix, the loaded SentenceTransformer), so the
        same encoder can be reused for query embedding.
    """
    encoder = SentenceTransformer('all-MiniLM-L6-v2')
    vectors = encoder.encode(chunks)
    return vectors, encoder
|
|
|
|
|
|
|
|
def answer_query(query, embeddings, chunks, model, qa_pipeline):
    """Answer a question using retrieval-augmented extractive QA.

    Embeds the query, retrieves the top-k most similar chunks via a FAISS
    L2 index, and runs an extractive QA model over the concatenated context.

    Args:
        query: The user's natural-language question.
        embeddings: 2-D array of chunk embeddings (one row per chunk).
        chunks: The chunk strings aligned with ``embeddings`` rows.
        model: The SentenceTransformer used to embed the query.
        qa_pipeline: A Hugging Face question-answering pipeline.

    Returns:
        The extracted answer string.
    """
    query_embedding = model.encode([query])
    # NOTE(review): the index is rebuilt on every call; fine for small docs,
    # but worth hoisting to the caller if latency matters.
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(np.array(embeddings))
    # Clamp k so FAISS never pads results with -1 ids when there are
    # fewer than 3 chunks (chunks[-1] would silently duplicate context).
    k = min(3, len(chunks))
    _, I = index.search(np.array(query_embedding), k=k)
    context = "\n".join(chunks[i] for i in I[0] if i >= 0)
    result = qa_pipeline(question=query, context=context)
    return result['answer']
|
|
|
|
|
|
|
|
st.title("π PDF QA with RAG")

uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])


@st.cache_resource
def _load_qa_pipeline():
    """Load the extractive QA model once per session (Streamlit re-runs the
    whole script on every interaction; without caching the model would be
    reloaded on each keystroke)."""
    return pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")


if uploaded_file:
    # Persist the upload so PdfReader can open it by path.
    with open("document.pdf", "wb") as f:
        f.write(uploaded_file.read())

    raw_text = load_pdf_text("document.pdf")
    chunks = chunk_text(raw_text)
    embeddings, embedder = embed_chunks(chunks)
    qa = _load_qa_pipeline()

    query = st.text_input("Ask a question about the PDF:")
    if query:
        answer = answer_query(query, embeddings, chunks, embedder, qa)
        st.success(f"Answer: {answer}")
|
|
|