rahideer committed on
Commit
1506f8e
·
verified ·
1 Parent(s): b87a5cd

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +63 -0
app.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from PyPDF2 import PdfReader
3
+ from sentence_transformers import SentenceTransformer
4
+ from transformers import pipeline
5
+ import faiss
6
+ import numpy as np
7
+
8
# Load PDF and extract text
@st.cache_data
def load_pdf_text(pdf_path):
    """Return the concatenated text of every page of the PDF at *pdf_path*.

    Args:
        pdf_path: Path (or other source accepted by ``PdfReader``) of the PDF.

    Returns:
        A single string with the extracted text of all pages, in page order.

    NOTE: the result is memoised by ``st.cache_data`` keyed on the argument
    value, so callers must pass a different path for a different document;
    re-using one path for new content returns the previously cached text.
    """
    reader = PdfReader(pdf_path)
    # A page with no extractable text (e.g. a scanned image) may yield a
    # falsy value; ``or ''`` keeps the concatenation from raising TypeError.
    # ``join`` also avoids quadratic string growth on long documents.
    return ''.join(page.extract_text() or '' for page in reader.pages)
16
+
17
# Split text into chunks
def chunk_text(text, max_len=500):
    """Group the sentences of *text* into chunks of roughly *max_len* chars.

    Sentences are delimited by ``'. '``. Sentences are appended to the current
    chunk while ``len(chunk) + len(sentence) <= max_len``; otherwise the chunk
    is closed and a new one is started. A single sentence longer than
    *max_len* is kept whole in its own chunk.

    Args:
        text: The text to split.
        max_len: Soft upper bound on the chunk length, in characters.

    Returns:
        A list of non-empty, whitespace-stripped chunk strings. Empty or
        whitespace-only input yields an empty list.
    """
    sentences = text.split('. ')
    last_index = len(sentences) - 1
    chunks = []
    current = ''

    for position, sentence in enumerate(sentences):
        # Blank fragments (empty input, trailing '. ') carry no content and
        # would otherwise turn into chunks like '.' or '. .'.
        if not sentence.strip():
            continue

        # Only close a chunk that actually holds text, so an over-long first
        # sentence no longer produces a leading empty chunk.
        if current and len(current) + len(sentence) > max_len:
            chunks.append(current.strip())
            current = ''

        # split() removed the '. ' after every sentence except the last one;
        # restore it. The last sentence only gets a period when it has no
        # terminal punctuation of its own, which avoids 'End..'.
        is_last = position == last_index
        if is_last and sentence.rstrip().endswith(('.', '!', '?')):
            current += sentence
        else:
            current += sentence + '. '

    if current.strip():
        chunks.append(current.strip())
    return chunks
29
+
30
# Embed text using SentenceTransformer
@st.cache_resource
def embed_chunks(chunks):
    """Encode *chunks* with the ``all-MiniLM-L6-v2`` sentence-transformer.

    Args:
        chunks: The text chunks to embed.

    Returns:
        A ``(embeddings, model)`` tuple: the result of ``model.encode(chunks)``
        and the ``SentenceTransformer`` instance that produced it, so callers
        can encode queries with the same model.
    """
    encoder = SentenceTransformer('all-MiniLM-L6-v2')
    vectors = encoder.encode(chunks)
    return vectors, encoder
36
+
37
# RAG-style QA using FAISS and Transformers
def answer_query(query, embeddings, chunks, model, qa_pipeline):
    """Answer *query* from the chunks most similar to it.

    The query is embedded with *model*, the nearest chunks (up to three) are
    retrieved from a FAISS L2 index built over *embeddings*, and their text is
    passed as context to *qa_pipeline*.

    Args:
        query: The user's question.
        embeddings: 2-D array of chunk embeddings, one row per entry of
            *chunks* (assumed to be in the same order).
        chunks: The text chunks the embeddings were computed from.
        model: Object whose ``encode([query])`` returns the query embedding.
        qa_pipeline: Callable accepting ``question=`` and ``context=`` and
            returning a mapping with an ``'answer'`` key.

    Returns:
        The ``'answer'`` value returned by *qa_pipeline*.

    Raises:
        ValueError: If *chunks* is empty, since there is nothing to search.
    """
    if not chunks:
        raise ValueError("No text chunks available to answer the query.")

    # FAISS expects C-contiguous float32 matrices.
    chunk_vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    query_vector = np.ascontiguousarray(model.encode([query]), dtype=np.float32)

    index = faiss.IndexFlatL2(chunk_vectors.shape[1])
    index.add(chunk_vectors)

    # Never ask for more neighbours than exist: FAISS pads missing results
    # with -1, which would silently index the *last* chunk via chunks[-1].
    top_k = min(3, len(chunks))
    _, neighbours = index.search(query_vector, k=top_k)

    context = "\n".join(chunks[i] for i in neighbours[0] if i >= 0)
    result = qa_pipeline(question=query, context=context)
    return result['answer']
46
+
47
# Streamlit UI
import hashlib
import os
import tempfile


@st.cache_resource
def _load_qa_pipeline():
    """Build the extractive QA pipeline once and reuse it across reruns."""
    return pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")


st.title("📄 PDF QA with RAG")
uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])

if uploaded_file:
    # getvalue() returns the whole upload regardless of the stream position,
    # so a rerun can never write an empty file.
    pdf_bytes = uploaded_file.getvalue()

    # load_pdf_text is cached on its path argument. Naming the file after a
    # hash of its content means a new upload gets a new cache key instead of
    # returning the previous document's text, and concurrent sessions no
    # longer overwrite one shared "document.pdf".
    digest = hashlib.sha256(pdf_bytes).hexdigest()
    pdf_path = os.path.join(tempfile.gettempdir(), f"document_{digest}.pdf")
    with open(pdf_path, "wb") as f:
        f.write(pdf_bytes)

    raw_text = load_pdf_text(pdf_path)
    chunks = chunk_text(raw_text)

    if not chunks:
        # Nothing to embed or search (e.g. a scanned, image-only PDF).
        st.warning("No extractable text was found in this PDF.")
    else:
        embeddings, embedder = embed_chunks(chunks)
        qa = _load_qa_pipeline()

        query = st.text_input("Ask a question about the PDF:")
        if query:
            answer = answer_query(query, embeddings, chunks, embedder, qa)
            st.success(f"Answer: {answer}")