bibeksah committed on
Commit
146b51b
·
1 Parent(s): 323e21c

changing field from PDF to raw data

Browse files
Files changed (1) hide show
  1. mylangv2.py +116 -99
mylangv2.py CHANGED
@@ -6,33 +6,9 @@ import os
6
  import logging
7
  import re
8
  import json
 
9
  from dotenv import load_dotenv
10
- from typing import Dict, List, Any, Tuple
11
-
12
- # Load environment variables
13
- dotenv_path = os.getenv("DOTENV_PATH")
14
- if dotenv_path:
15
- load_dotenv(dotenv_path)
16
- else:
17
- load_dotenv()
18
-
19
- # Validate required environment variables
20
- def check_env():
21
- required = [
22
- "AZURE_OPENAI_API_KEY",
23
- "AZURE_OPENAI_ENDPOINT",
24
- "AZURE_OPENAI_EMBEDDING_DEPLOYMENT",
25
- "AZURE_OPENAI_CHAT_DEPLOYMENT",
26
- "AZURE_OPENAI_API_VERSION"
27
- ]
28
- missing = [v for v in required if not os.getenv(v)]
29
- if missing:
30
- raise EnvironmentError(f"Missing required environment variables: {', '.join(missing)}")
31
-
32
- check_env()
33
-
34
- # Configure logging
35
- logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s:%(message)s")
36
 
37
  # LangChain and Azure OpenAI imports
38
  from langchain_text_splitters import RecursiveCharacterTextSplitter
@@ -52,103 +28,106 @@ except ImportError as e:
52
  "Install faiss-cpu/faiss-gpu or downgrade NumPy to <2.0 to enable FAISS." % e
53
  )
54
 
55
- # Fallback in-memory vectorstore with shape validation
56
- def _fallback_vectorstore(texts: List[str], embeddings_client) -> Any:
57
- """Creates a basic in-memory vectorstore with cosine similarity search and embedding shape checks."""
58
- import numpy as _np
59
-
60
- embs = embeddings_client.embed_documents(texts)
61
- dim = len(embs[0]) if embs else 0
62
-
63
- class Doc:
64
- def __init__(self, content: str):
65
- self.page_content = content
66
-
67
- class BasicVectorStore:
68
- def __init__(self, texts: List[str], embs: List[List[float]]):
69
- self.texts = texts
70
- self.embs = embs
71
-
72
- def similarity_search(self, query: str, k: int = 3) -> List[Doc]:
73
- q_emb = embeddings_client.embed_query(query)
74
- if len(q_emb) != dim:
75
- raise ValueError(f"Query embedding dimension {len(q_emb)} != stored dimension {dim}")
76
- sims = []
77
- for emb in self.embs:
78
- if len(emb) != dim:
79
- raise ValueError("Stored embedding has unexpected dimension")
80
- sims.append(_np.dot(q_emb, emb) / (_np.linalg.norm(q_emb) * _np.linalg.norm(emb)))
81
- idxs = sorted(range(len(sims)), key=lambda i: sims[i], reverse=True)[:k]
82
- return [Doc(self.texts[i]) for i in idxs]
83
-
84
- return BasicVectorStore(texts, embs)
85
-
86
  class DocumentProcessor:
87
- def __init__(self):
88
- self.embeddings = AzureOpenAIEmbeddings(
 
 
 
 
 
89
  azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
90
  api_key=os.getenv("AZURE_OPENAI_API_KEY"),
91
  api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
92
  model=os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT")
93
  )
94
- self.text_splitter = RecursiveCharacterTextSplitter(
95
  chunk_size=1000,
96
  chunk_overlap=200,
97
- length_function=len,
98
  separators=["\n\n", "\n", " ", ""]
99
  )
100
 
101
- def process_text(self, text: str, persist_directory: str = None) -> Tuple[Any, List[str]]:
102
- texts = self.text_splitter.split_text(text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  if FAISS:
104
  try:
105
- vs = FAISS.from_texts(texts=texts, embedding=self.embeddings)
 
106
  if persist_directory:
107
  vs.save_local(persist_directory)
108
- logging.info(f"Processed {len(texts)} chunks into FAISS vectorstore.")
109
- return vs, texts
 
110
  except Exception as e:
111
  logging.warning(f"FAISS.from_texts failed ({e}), using fallback vectorstore.")
112
- vs_fb = _fallback_vectorstore(texts, self.embeddings)
113
- logging.info(f"Processed {len(texts)} chunks into fallback vectorstore.")
114
- return vs_fb, texts
115
 
116
- def process_uploaded_document(self, pdf_path: str, persist_directory: str = None) -> Tuple[Any, List[str]]:
 
 
 
117
  loader = PyPDFLoader(pdf_path)
118
  pages = loader.load()
119
  docs = self.text_splitter.split_documents(pages)
 
 
120
  if FAISS:
121
  try:
122
  vs = FAISS.from_documents(documents=docs, embedding=self.embeddings)
 
123
  if persist_directory:
124
  vs.save_local(persist_directory)
125
- logging.info(f"Processed PDF with {len(docs)} chunks into FAISS vectorstore.")
126
- raw = [doc.page_content for doc in docs]
127
- return vs, raw
128
  except Exception as e:
129
  logging.warning(f"FAISS.from_documents failed ({e}), falling back.")
130
- texts = [doc.page_content for doc in docs]
131
- vs_fb = _fallback_vectorstore(texts, self.embeddings)
132
  logging.info(f"Processed PDF with {len(texts)} chunks into fallback vectorstore.")
133
- return vs_fb, texts
134
 
135
  class QuestionGenerator:
136
- def __init__(self):
137
- self.llm = AzureChatOpenAI(
138
- azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
139
- api_key=os.getenv("AZURE_OPENAI_API_KEY"),
140
- api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
141
- model=os.getenv("AZURE_OPENAI_CHAT_DEPLOYMENT"),
142
- temperature=0.3
143
- )
144
- self.chain = LLMChain(
145
- llm=self.llm,
146
- prompt=PromptTemplate(
147
- input_variables=[
148
- "context","num_questions","question_type","subject",
149
- "class_grade","topic","difficulty","bloom_level","instructions"
150
- ],
151
- template=(
152
  """
153
  Based on the following context:
154
  {context}
@@ -165,16 +144,43 @@ Additional Instructions: {instructions}
165
  Format as JSON:
166
  {"questions": [{"question":"","options":[],"correctAnswer":"","explanation":""}]}
167
  """
168
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
  )
170
  )
171
 
172
  def generate_questions(self, topic_data: Dict[str, Any], vectorstore: Any) -> Dict[str, Any]:
 
 
 
 
 
 
 
 
 
173
  context = ""
174
  if vectorstore:
175
- docs = vectorstore.similarity_search(f"{topic_data['subjectName']} {topic_data['sectionName']}", k=3)
 
 
176
  context = "\n".join(getattr(doc, 'page_content', '') for doc in docs)
177
  logging.info(f"Context length: {len(context)}")
 
178
  payload = {
179
  "context": context,
180
  "num_questions": topic_data['numQuestions'],
@@ -195,7 +201,7 @@ Format as JSON:
195
  result = json.loads(output)
196
  except json.JSONDecodeError:
197
  logging.error(f"JSON parsing failed. Raw output: {output}")
198
- raise ValueError(f"Failed to parse JSON from LLM output: {output}")
199
  if 'questions' not in result:
200
  raise ValueError(f"Missing 'questions' key in output JSON: {result}")
201
  return result
@@ -225,15 +231,26 @@ class QuestionEvaluator:
225
  logging.error(f"Evaluation error: {e}")
226
  raise
227
 
228
- # CLI test
 
 
 
 
 
 
 
 
229
  def main():
 
 
230
  dp = DocumentProcessor()
231
  sample = "This is a simple test. It splits into chunks and embeds."
232
- vs, chunks = dp.process_text(sample)
233
  print("Chunks:", chunks)
 
234
  if os.path.exists('sample.pdf'):
235
- vs2, raw = dp.process_uploaded_document('sample.pdf')
236
- print("PDF raw chunks count:", len(raw))
237
 
238
  if __name__ == "__main__":
239
  main()
 
6
  import logging
7
  import re
8
  import json
9
+ import numpy as np
10
  from dotenv import load_dotenv
11
+ from typing import Dict, List, Any, Tuple, Optional
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
  # LangChain and Azure OpenAI imports
14
  from langchain_text_splitters import RecursiveCharacterTextSplitter
 
28
  "Install faiss-cpu/faiss-gpu or downgrade NumPy to <2.0 to enable FAISS." % e
29
  )
30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  class DocumentProcessor:
32
+ def __init__(
33
+ self,
34
+ embeddings: Optional[AzureOpenAIEmbeddings] = None,
35
+ text_splitter: Optional[RecursiveCharacterTextSplitter] = None
36
+ ):
37
+ """Initialize DocumentProcessor with injectable embeddings and splitter."""
38
+ self.embeddings = embeddings or AzureOpenAIEmbeddings(
39
  azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
40
  api_key=os.getenv("AZURE_OPENAI_API_KEY"),
41
  api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
42
  model=os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT")
43
  )
44
+ self.text_splitter = text_splitter or RecursiveCharacterTextSplitter(
45
  chunk_size=1000,
46
  chunk_overlap=200,
 
47
  separators=["\n\n", "\n", " ", ""]
48
  )
49
 
50
+ def _create_fallback_vectorstore(self, texts: List[str]) -> Any:
51
+ """Creates a basic in-memory vectorstore with cosine similarity search and embedding shape checks."""
52
+ embs = self.embeddings.embed_documents(texts)
53
+ dim = len(embs[0]) if embs else 0
54
+
55
+ class Doc:
56
+ def __init__(self, content: str):
57
+ self.page_content = content
58
+
59
+ class BasicVectorStore:
60
+ def __init__(self, texts: List[str], embs: List[List[float]]):
61
+ self.texts = texts
62
+ self.embs = embs
63
+
64
+ def similarity_search(self, query: str, k: int = 3) -> List[Doc]:
65
+ q_emb = self.embeddings.embed_query(query)
66
+ if len(q_emb) != dim:
67
+ raise ValueError(f"Query embedding dimension {len(q_emb)} != stored dimension {dim}")
68
+ sims = []
69
+ for emb in self.embs:
70
+ if len(emb) != dim:
71
+ raise ValueError("Stored embedding has unexpected dimension")
72
+ sims.append(np.dot(q_emb, emb) / (np.linalg.norm(q_emb) * np.linalg.norm(emb)))
73
+ idxs = sorted(range(len(sims)), key=lambda i: sims[i], reverse=True)[:k]
74
+ return [Doc(self.texts[i]) for i in idxs]
75
+
76
+ # Bind embeddings for inner class
77
+ BasicVectorStore.embeddings = self.embeddings
78
+ return BasicVectorStore(texts, embs)
79
+
80
+ def process_text(self, text: str, persist_directory: str = None) -> Tuple[Any, List[str], Dict[str, str]]:
81
+ """Split raw text, build vectorstore (FAISS or fallback), return store, chunks, and metadata."""
82
+ chunks = self.text_splitter.split_text(text)
83
+ backend = 'fallback'
84
  if FAISS:
85
  try:
86
+ vs = FAISS.from_texts(texts=chunks, embedding=self.embeddings)
87
+ backend = 'faiss'
88
  if persist_directory:
89
  vs.save_local(persist_directory)
90
+ _log_vectorstore_size(persist_directory)
91
+ logging.info(f"Processed {len(chunks)} chunks into FAISS vectorstore.")
92
+ return vs, chunks, {'backend': backend}
93
  except Exception as e:
94
  logging.warning(f"FAISS.from_texts failed ({e}), using fallback vectorstore.")
95
+ vs_fb = self._create_fallback_vectorstore(chunks)
96
+ logging.info(f"Processed {len(chunks)} chunks into fallback vectorstore.")
97
+ return vs_fb, chunks, {'backend': backend}
98
 
99
+ def process_uploaded_document(
100
+ self, pdf_path: str, persist_directory: str = None
101
+ ) -> Tuple[Any, List[str], Dict[str, str]]:
102
+ """Load PDF, split, build vectorstore, and return store, raw texts, and metadata."""
103
  loader = PyPDFLoader(pdf_path)
104
  pages = loader.load()
105
  docs = self.text_splitter.split_documents(pages)
106
+ texts = [doc.page_content for doc in docs]
107
+ backend = 'fallback'
108
  if FAISS:
109
  try:
110
  vs = FAISS.from_documents(documents=docs, embedding=self.embeddings)
111
+ backend = 'faiss'
112
  if persist_directory:
113
  vs.save_local(persist_directory)
114
+ _log_vectorstore_size(persist_directory)
115
+ logging.info(f"Processed PDF with {len(texts)} chunks into FAISS vectorstore.")
116
+ return vs, texts, {'backend': backend}
117
  except Exception as e:
118
  logging.warning(f"FAISS.from_documents failed ({e}), falling back.")
119
+ vs_fb = self._create_fallback_vectorstore(texts)
 
120
  logging.info(f"Processed PDF with {len(texts)} chunks into fallback vectorstore.")
121
+ return vs_fb, texts, {'backend': backend}
122
 
123
  class QuestionGenerator:
124
+ def __init__(self, prompt_template_path: str = None):
125
+ # Load prompt template from file or default
126
+ if prompt_template_path and os.path.exists(prompt_template_path):
127
+ with open(prompt_template_path) as f:
128
+ template_str = f.read()
129
+ else:
130
+ template_str = (
 
 
 
 
 
 
 
 
 
131
  """
132
  Based on the following context:
133
  {context}
 
144
  Format as JSON:
145
  {"questions": [{"question":"","options":[],"correctAnswer":"","explanation":""}]}
146
  """
147
+ )
148
+ self.llm = AzureChatOpenAI(
149
+ azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
150
+ api_key=os.getenv("AZURE_OPENAI_API_KEY"),
151
+ api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
152
+ model=os.getenv("AZURE_OPENAI_CHAT_DEPLOYMENT"),
153
+ temperature=0.3
154
+ )
155
+ self.chain = LLMChain(
156
+ llm=self.llm,
157
+ prompt=PromptTemplate(
158
+ input_variables=[
159
+ "context","num_questions","question_type","subject",
160
+ "class_grade","topic","difficulty","bloom_level","instructions"
161
+ ],
162
+ template=template_str
163
  )
164
  )
165
 
166
  def generate_questions(self, topic_data: Dict[str, Any], vectorstore: Any) -> Dict[str, Any]:
167
+ # Validate topic_data keys
168
+ required_keys = [
169
+ 'subjectName','sectionName','numQuestions','questionType',
170
+ 'classGrade','difficulty','bloomLevel'
171
+ ]
172
+ missing = [k for k in required_keys if k not in topic_data]
173
+ if missing:
174
+ raise ValueError(f"Missing required topic_data keys: {', '.join(missing)}")
175
+
176
  context = ""
177
  if vectorstore:
178
+ docs = vectorstore.similarity_search(
179
+ f"{topic_data['subjectName']} {topic_data['sectionName']}", k=3
180
+ )
181
  context = "\n".join(getattr(doc, 'page_content', '') for doc in docs)
182
  logging.info(f"Context length: {len(context)}")
183
+
184
  payload = {
185
  "context": context,
186
  "num_questions": topic_data['numQuestions'],
 
201
  result = json.loads(output)
202
  except json.JSONDecodeError:
203
  logging.error(f"JSON parsing failed. Raw output: {output}")
204
+ raise
205
  if 'questions' not in result:
206
  raise ValueError(f"Missing 'questions' key in output JSON: {result}")
207
  return result
 
231
  logging.error(f"Evaluation error: {e}")
232
  raise
233
 
234
+ # Helper for logging vectorstore size
235
+ def _log_vectorstore_size(directory: str):
236
+ total = 0
237
+ for root, _, files in os.walk(directory):
238
+ for f in files:
239
+ total += os.path.getsize(os.path.join(root, f))
240
+ logging.info(f"Vectorstore on disk: {total/1024:.2f} KB")
241
+
242
+ # CLI test and env validation
243
  def main():
244
+ # Validate env only on script run
245
+ check_env()
246
  dp = DocumentProcessor()
247
  sample = "This is a simple test. It splits into chunks and embeds."
248
+ vs, chunks, meta = dp.process_text(sample)
249
  print("Chunks:", chunks)
250
+ print("Backend used:", meta['backend'])
251
  if os.path.exists('sample.pdf'):
252
+ vs2, raw, meta2 = dp.process_uploaded_document('sample.pdf')
253
+ print("PDF raw chunks count:", len(raw), "Backend:", meta2['backend'])
254
 
255
  if __name__ == "__main__":
256
  main()