Spaces:

rahideer
/

dataset

Sleeping

App Files Files Community

rahideer committed on Apr 18

Commit

1506f8e

verified ·

1 Parent(s): b87a5cd

Create app.py

Browse files

Files changed (1) hide show

app.py +63 -0

app.py ADDED Viewed

	@@ -0,0 +1,63 @@

+import streamlit as st
+from PyPDF2 import PdfReader
+from sentence_transformers import SentenceTransformer
+from transformers import pipeline
+import faiss
+import numpy as np
+# Load PDF and extract text
+@st.cache_data
+def _extract_pdf_text(pdf_bytes):
+    """Return the text of every page of the PDF held in *pdf_bytes*.
+
+    The cache is keyed on the document's bytes rather than on a file name,
+    so two different documents never share a cache entry.
+    """
+    import io  # local import: only this helper needs it
+
+    reader = PdfReader(io.BytesIO(pdf_bytes))
+    # A page may yield no text (e.g. a scanned image); treat that as empty.
+    # Pages are newline-separated so words do not fuse across page breaks.
+    return "\n".join(page.extract_text() or "" for page in reader.pages)
+
+
+def load_pdf_text(pdf_path):
+    """Read the PDF at *pdf_path* and return its extracted text.
+
+    The file is read on every call and the extraction is cached on its
+    contents, so rewriting the same path with a new document returns the
+    new document's text instead of a stale cached result.
+
+    Args:
+        pdf_path: Path of the PDF file to read.
+
+    Returns:
+        The text of all pages, joined with newlines.
+    """
+    with open(pdf_path, "rb") as pdf_file:
+        return _extract_pdf_text(pdf_file.read())
+# Split text into chunks
+def chunk_text(text, max_len=500):
+    """Split *text* into chunks of whole sentences up to *max_len* characters.
+
+    Sentences are delimited by ``'. '``. Consecutive sentences are packed
+    into one chunk, separated by a single space, while the chunk stays
+    within *max_len*. A single sentence longer than *max_len* is kept whole
+    as its own chunk rather than being cut mid-sentence.
+
+    Args:
+        text: The text to split.
+        max_len: Maximum chunk length in characters.
+
+    Returns:
+        A list of non-empty chunks; empty for empty or whitespace-only text.
+    """
+    pieces = text.split('. ')
+    # split() drops the delimiter: restore the period on every piece except
+    # the last, which keeps whatever ending the source text had.
+    sentences = [piece.strip() + '.' for piece in pieces[:-1] if piece.strip()]
+    tail = pieces[-1].strip()
+    if tail:
+        sentences.append(tail)
+
+    chunks = []
+    current = ''
+    for sentence in sentences:
+        if not current:
+            # Always start a chunk, even with an over-long sentence.
+            current = sentence
+        elif len(current) + 1 + len(sentence) <= max_len:
+            current = f'{current} {sentence}'
+        else:
+            chunks.append(current)
+            current = sentence
+    if current:
+        chunks.append(current)
+    return chunks
+# Embed text using SentenceTransformer
+@st.cache_resource
+def _load_embedder():
+    """Load the sentence-embedding model once and share it across reruns."""
+    return SentenceTransformer('all-MiniLM-L6-v2')
+
+
+@st.cache_data
+def _encode_chunks(chunks):
+    """Return the embeddings of *chunks*, cached on the chunk contents."""
+    return _load_embedder().encode(chunks)
+
+
+def embed_chunks(chunks):
+    """Embed *chunks* and return the embeddings with the model used.
+
+    The model is cached separately from the embeddings: caching both under
+    one key derived from *chunks* would construct and retain a new model
+    for every distinct document.
+
+    Args:
+        chunks: List of text chunks to embed.
+
+    Returns:
+        A ``(embeddings, model)`` tuple, where *embeddings* is the result of
+        ``model.encode(chunks)`` and *model* is the shared embedder.
+    """
+    model = _load_embedder()
+    embeddings = _encode_chunks(chunks)
+    return embeddings, model
+# RAG-style QA using FAISS and Transformers
+def answer_query(query, embeddings, chunks, model, qa_pipeline, top_k=3):
+    """Answer *query* from the chunks most similar to it.
+
+    The query is embedded with *model*, the nearest chunk embeddings are
+    found with an exact L2 FAISS index, and the matching chunks are joined
+    with newlines as the context passed to *qa_pipeline*.
+
+    Args:
+        query: The question to answer.
+        embeddings: Embeddings of *chunks*, one row per chunk.
+        chunks: The text chunks the embeddings were computed from.
+        model: Object whose ``encode`` method embeds a list of strings.
+        qa_pipeline: Callable taking ``question`` and ``context`` keyword
+            arguments and returning a mapping with an ``'answer'`` key.
+        top_k: Maximum number of chunks to use as context.
+
+    Returns:
+        The ``'answer'`` entry of the pipeline result.
+
+    Raises:
+        ValueError: If *chunks* is empty.
+    """
+    if not chunks:
+        raise ValueError("no text chunks to search")
+
+    # FAISS expects C-contiguous float32 matrices.
+    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
+    query_vector = np.ascontiguousarray(model.encode([query]), dtype=np.float32)
+
+    index = faiss.IndexFlatL2(vectors.shape[1])
+    index.add(vectors)
+
+    # Never request more neighbours than there are chunks: FAISS pads the
+    # missing results with -1, which would index the last chunk.
+    k = max(1, min(top_k, len(chunks)))
+    _, neighbours = index.search(query_vector, k)
+    context = "\n".join(
+        chunks[i] for i in neighbours[0] if 0 <= i < len(chunks)
+    )
+
+    result = qa_pipeline(question=query, context=context)
+    return result['answer']
+# Streamlit UI
+# Script-local imports used only for storing the uploaded file.
+import hashlib
+import os
+import tempfile
+
+
+@st.cache_resource
+def _load_qa_pipeline():
+    """Build the question-answering pipeline once instead of on every rerun."""
+    return pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
+
+
+st.title("📄 PDF QA with RAG")
+uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])
+if uploaded_file:
+    # getvalue() returns the whole upload regardless of the read position.
+    pdf_bytes = uploaded_file.getvalue()
+    # Name the file after its content so that different documents (and
+    # different sessions) never overwrite each other or share cached text.
+    digest = hashlib.sha256(pdf_bytes).hexdigest()
+    pdf_path = os.path.join(tempfile.gettempdir(), f"uploaded_{digest}.pdf")
+    with open(pdf_path, "wb") as f:
+        f.write(pdf_bytes)
+    raw_text = load_pdf_text(pdf_path)
+    if not raw_text.strip():
+        # Nothing to index, e.g. a scanned PDF without a text layer.
+        st.error("No extractable text was found in this PDF.")
+    else:
+        chunks = chunk_text(raw_text)
+        embeddings, embedder = embed_chunks(chunks)
+        qa = _load_qa_pipeline()
+        query = st.text_input("Ask a question about the PDF:")
+        if query:
+            answer = answer_query(query, embeddings, chunks, embedder, qa)
+            st.success(f"Answer: {answer}")