bibeksah committed on
Commit
323e21c
·
1 Parent(s): 7ecbb59

changing field from pdf to raw data

Browse files
Files changed (1) hide show
  1. mylangv2.py +77 -81
mylangv2.py CHANGED
@@ -1,5 +1,5 @@
1
  """
2
- Mylangv2: Process raw text or uploaded PDF into vectorstore and generate questions via Azure OpenAI using LangChain.
3
  Includes a simple CLI test at the bottom to verify both `process_text` and `process_uploaded_document`.
4
  """
5
  import os
@@ -9,14 +9,14 @@ import json
9
  from dotenv import load_dotenv
10
  from typing import Dict, List, Any, Tuple
11
 
12
- # Load env vars
13
- dotenv_path = os.getenv('DOTENV_PATH')
14
  if dotenv_path:
15
  load_dotenv(dotenv_path)
16
  else:
17
  load_dotenv()
18
 
19
- # Validate env vars
20
  def check_env():
21
  required = [
22
  "AZURE_OPENAI_API_KEY",
@@ -31,55 +31,60 @@ def check_env():
31
 
32
  check_env()
33
 
34
- # Azure/OpenAI and LangChain imports
 
 
 
35
  from langchain_text_splitters import RecursiveCharacterTextSplitter
36
  from langchain_openai import AzureOpenAIEmbeddings, AzureChatOpenAI
37
  from langchain.document_loaders import PyPDFLoader
 
 
 
 
38
  # Vectorstore: FAISS
39
  try:
40
  from langchain_community.vectorstores import FAISS
41
  except ImportError as e:
42
  FAISS = None
43
  logging.warning(
44
- "FAISS import failed (%s). "
45
- "Install faiss-cpu/faiss-gpu compatible with NumPy 2.0+ "
46
- "or downgrade NumPy to <2.0 to use FAISS." % e
47
  )
48
- from langchain.chains import LLMChain
49
- from langchain_core.prompts import PromptTemplate
50
- from langchain.evaluation import load_evaluator
51
 
52
- # Configure logging
53
- logging.basicConfig(level=logging.INFO)
54
-
55
- # Fallback in-memory vectorstore if FAISS is unavailable at runtime
56
- def _fallback_vectorstore(texts, embeddings_client):
57
- """Creates a basic in-memory vectorstore with cosine similarity search."""
58
  import numpy as _np
59
- # get embeddings for texts
60
  embs = embeddings_client.embed_documents(texts)
 
 
61
  class Doc:
62
- def __init__(self, content): self.page_content = content
 
 
63
  class BasicVectorStore:
64
- def __init__(self, texts, embs):
65
  self.texts = texts
66
  self.embs = embs
67
- def similarity_search(self, query, k=3):
68
- # embed query
69
  q_emb = embeddings_client.embed_query(query)
70
- # cosine similarities
71
- sims = (_np.dot(q_emb, emb) / (_np.linalg.norm(q_emb) * _np.linalg.norm(emb)) for emb in self.embs)
72
- sims = list(sims)
73
- # get top k
 
 
 
74
  idxs = sorted(range(len(sims)), key=lambda i: sims[i], reverse=True)[:k]
75
  return [Doc(self.texts[i]) for i in idxs]
76
- return BasicVectorStore(texts, embs)
77
 
78
- logging.basicConfig(level=logging.INFO)
79
 
80
  class DocumentProcessor:
81
  def __init__(self):
82
- # Initialize Azure embeddings client
83
  self.embeddings = AzureOpenAIEmbeddings(
84
  azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
85
  api_key=os.getenv("AZURE_OPENAI_API_KEY"),
@@ -94,10 +99,8 @@ class DocumentProcessor:
94
  )
95
 
96
  def process_text(self, text: str, persist_directory: str = None) -> Tuple[Any, List[str]]:
97
- """Split raw text, build FAISS vectorstore (or fallback), return store and chunks."""
98
  texts = self.text_splitter.split_text(text)
99
- # Try FAISS, else fallback
100
- if FAISS is not None:
101
  try:
102
  vs = FAISS.from_texts(texts=texts, embedding=self.embeddings)
103
  if persist_directory:
@@ -105,40 +108,32 @@ class DocumentProcessor:
105
  logging.info(f"Processed {len(texts)} chunks into FAISS vectorstore.")
106
  return vs, texts
107
  except Exception as e:
108
- logging.warning(f"FAISS failed ({e}), using in-memory fallback.")
109
- # Fallback
110
  vs_fb = _fallback_vectorstore(texts, self.embeddings)
111
  logging.info(f"Processed {len(texts)} chunks into fallback vectorstore.")
112
  return vs_fb, texts
113
 
114
  def process_uploaded_document(self, pdf_path: str, persist_directory: str = None) -> Tuple[Any, List[str]]:
115
- """Load and split PDF, build FAISS vectorstore, return store and raw text chunks."""
116
- if FAISS is None:
117
- raise ImportError(
118
- "FAISS vectorstore is unavailable. "
119
- "Install faiss-cpu/faiss-gpu or adjust NumPy version."
120
- )
121
  loader = PyPDFLoader(pdf_path)
122
  pages = loader.load()
123
- texts = self.text_splitter.split_documents(pages)
124
- try:
125
- vectorstore = FAISS.from_documents(
126
- documents=texts,
127
- embedding=self.embeddings
128
- )
129
- except Exception as e:
130
- logging.error(f"FAISS.from_documents error: {e}")
131
- raise
132
- if persist_directory:
133
- vectorstore.save_local(persist_directory)
134
- logging.info(f"Processed PDF with {len(texts)} chunks into FAISS vectorstore.")
135
- # Return raw strings for reference
136
- raw_texts = [doc.page_content for doc in texts]
137
- return vectorstore, raw_texts
138
 
139
  class QuestionGenerator:
140
  def __init__(self):
141
- # Chat LLM
142
  self.llm = AzureChatOpenAI(
143
  azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
144
  api_key=os.getenv("AZURE_OPENAI_API_KEY"),
@@ -146,7 +141,14 @@ class QuestionGenerator:
146
  model=os.getenv("AZURE_OPENAI_CHAT_DEPLOYMENT"),
147
  temperature=0.3
148
  )
149
- template = (
 
 
 
 
 
 
 
150
  """
151
  Based on the following context:
152
  {context}
@@ -163,27 +165,17 @@ Additional Instructions: {instructions}
163
  Format as JSON:
164
  {"questions": [{"question":"","options":[],"correctAnswer":"","explanation":""}]}
165
  """
166
- )
167
- self.chain = LLMChain(
168
- llm=self.llm,
169
- prompt=PromptTemplate(
170
- input_variables=[
171
- "context","num_questions","question_type","subject",
172
- "class_grade","topic","difficulty","bloom_level","instructions"
173
- ],
174
- template=template
175
  )
176
  )
177
 
178
  def generate_questions(self, topic_data: Dict[str, Any], vectorstore: Any) -> Dict[str, Any]:
179
  context = ""
180
  if vectorstore:
181
- docs = vectorstore.similarity_search(
182
- f"{topic_data['subjectName']} {topic_data['sectionName']}", k=3
183
- )
184
- context = "\n".join(doc.page_content for doc in docs)
185
  logging.info(f"Context length: {len(context)}")
186
- response = self.chain.invoke({
187
  "context": context,
188
  "num_questions": topic_data['numQuestions'],
189
  "question_type": topic_data['questionType'],
@@ -193,18 +185,19 @@ Format as JSON:
193
  "difficulty": topic_data['difficulty'],
194
  "bloom_level": topic_data['bloomLevel'],
195
  "instructions": topic_data.get('additionalInstructions','')
196
- })
197
- text = response['text'] if isinstance(response, dict) and 'text' in response else response
 
198
  output = text.strip()
199
- if output.startswith('```'):
200
- output = re.sub(r'^```[a-zA-Z]*','', output)
201
- output = re.sub(r'```$','', output).strip()
202
  try:
203
  result = json.loads(output)
204
  except json.JSONDecodeError:
205
- raise ValueError(f"Failed to parse JSON: {output}")
 
206
  if 'questions' not in result:
207
- raise ValueError("Missing 'questions' in output JSON")
208
  return result
209
 
210
  class QuestionEvaluator:
@@ -221,6 +214,7 @@ class QuestionEvaluator:
221
  )
222
 
223
  def evaluate(self, question: str, answer: str, reference: str) -> Dict[str, Any]:
 
224
  try:
225
  return self.evaluator.evaluate_strings(
226
  input=question,
@@ -231,13 +225,15 @@ class QuestionEvaluator:
231
  logging.error(f"Evaluation error: {e}")
232
  raise
233
 
234
- # Simple CLI test
235
- if __name__ == "__main__":
236
  dp = DocumentProcessor()
237
  sample = "This is a simple test. It splits into chunks and embeds."
238
  vs, chunks = dp.process_text(sample)
239
  print("Chunks:", chunks)
240
- # optional PDF test (if sample.pdf exists)
241
  if os.path.exists('sample.pdf'):
242
  vs2, raw = dp.process_uploaded_document('sample.pdf')
243
  print("PDF raw chunks count:", len(raw))
 
 
 
 
1
  """
2
+ Mylangv2: Process raw text or uploaded document into vectorstore and generate questions via Azure OpenAI using LangChain.
3
  Includes a simple CLI test at the bottom to verify both `process_text` and `process_uploaded_document`.
4
  """
5
  import os
 
9
  from dotenv import load_dotenv
10
  from typing import Dict, List, Any, Tuple
11
 
12
+ # Load environment variables
13
+ dotenv_path = os.getenv("DOTENV_PATH")
14
  if dotenv_path:
15
  load_dotenv(dotenv_path)
16
  else:
17
  load_dotenv()
18
 
19
+ # Validate required environment variables
20
  def check_env():
21
  required = [
22
  "AZURE_OPENAI_API_KEY",
 
31
 
32
  check_env()
33
 
34
+ # Configure logging
35
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s:%(message)s")
36
+
37
+ # LangChain and Azure OpenAI imports
38
  from langchain_text_splitters import RecursiveCharacterTextSplitter
39
  from langchain_openai import AzureOpenAIEmbeddings, AzureChatOpenAI
40
  from langchain.document_loaders import PyPDFLoader
41
+ from langchain.chains import LLMChain
42
+ from langchain_core.prompts import PromptTemplate
43
+ from langchain.evaluation import load_evaluator
44
+
45
  # Vectorstore: FAISS
46
  try:
47
  from langchain_community.vectorstores import FAISS
48
  except ImportError as e:
49
  FAISS = None
50
  logging.warning(
51
+ "FAISS import failed (%s). Falling back to in-memory store. "
52
+ "Install faiss-cpu/faiss-gpu or downgrade NumPy to <2.0 to enable FAISS." % e
 
53
  )
 
 
 
54
 
55
+ # Fallback in-memory vectorstore with shape validation
56
+ def _fallback_vectorstore(texts: List[str], embeddings_client) -> Any:
57
+ """Creates a basic in-memory vectorstore with cosine similarity search and embedding shape checks."""
 
 
 
58
  import numpy as _np
59
+
60
  embs = embeddings_client.embed_documents(texts)
61
+ dim = len(embs[0]) if embs else 0
62
+
63
  class Doc:
64
+ def __init__(self, content: str):
65
+ self.page_content = content
66
+
67
  class BasicVectorStore:
68
+ def __init__(self, texts: List[str], embs: List[List[float]]):
69
  self.texts = texts
70
  self.embs = embs
71
+
72
+ def similarity_search(self, query: str, k: int = 3) -> List[Doc]:
73
  q_emb = embeddings_client.embed_query(query)
74
+ if len(q_emb) != dim:
75
+ raise ValueError(f"Query embedding dimension {len(q_emb)} != stored dimension {dim}")
76
+ sims = []
77
+ for emb in self.embs:
78
+ if len(emb) != dim:
79
+ raise ValueError("Stored embedding has unexpected dimension")
80
+ sims.append(_np.dot(q_emb, emb) / (_np.linalg.norm(q_emb) * _np.linalg.norm(emb)))
81
  idxs = sorted(range(len(sims)), key=lambda i: sims[i], reverse=True)[:k]
82
  return [Doc(self.texts[i]) for i in idxs]
 
83
 
84
+ return BasicVectorStore(texts, embs)
85
 
86
  class DocumentProcessor:
87
  def __init__(self):
 
88
  self.embeddings = AzureOpenAIEmbeddings(
89
  azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
90
  api_key=os.getenv("AZURE_OPENAI_API_KEY"),
 
99
  )
100
 
101
  def process_text(self, text: str, persist_directory: str = None) -> Tuple[Any, List[str]]:
 
102
  texts = self.text_splitter.split_text(text)
103
+ if FAISS:
 
104
  try:
105
  vs = FAISS.from_texts(texts=texts, embedding=self.embeddings)
106
  if persist_directory:
 
108
  logging.info(f"Processed {len(texts)} chunks into FAISS vectorstore.")
109
  return vs, texts
110
  except Exception as e:
111
+ logging.warning(f"FAISS.from_texts failed ({e}), using fallback vectorstore.")
 
112
  vs_fb = _fallback_vectorstore(texts, self.embeddings)
113
  logging.info(f"Processed {len(texts)} chunks into fallback vectorstore.")
114
  return vs_fb, texts
115
 
116
  def process_uploaded_document(self, pdf_path: str, persist_directory: str = None) -> Tuple[Any, List[str]]:
 
 
 
 
 
 
117
  loader = PyPDFLoader(pdf_path)
118
  pages = loader.load()
119
+ docs = self.text_splitter.split_documents(pages)
120
+ if FAISS:
121
+ try:
122
+ vs = FAISS.from_documents(documents=docs, embedding=self.embeddings)
123
+ if persist_directory:
124
+ vs.save_local(persist_directory)
125
+ logging.info(f"Processed PDF with {len(docs)} chunks into FAISS vectorstore.")
126
+ raw = [doc.page_content for doc in docs]
127
+ return vs, raw
128
+ except Exception as e:
129
+ logging.warning(f"FAISS.from_documents failed ({e}), falling back.")
130
+ texts = [doc.page_content for doc in docs]
131
+ vs_fb = _fallback_vectorstore(texts, self.embeddings)
132
+ logging.info(f"Processed PDF with {len(texts)} chunks into fallback vectorstore.")
133
+ return vs_fb, texts
134
 
135
  class QuestionGenerator:
136
  def __init__(self):
 
137
  self.llm = AzureChatOpenAI(
138
  azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
139
  api_key=os.getenv("AZURE_OPENAI_API_KEY"),
 
141
  model=os.getenv("AZURE_OPENAI_CHAT_DEPLOYMENT"),
142
  temperature=0.3
143
  )
144
+ self.chain = LLMChain(
145
+ llm=self.llm,
146
+ prompt=PromptTemplate(
147
+ input_variables=[
148
+ "context","num_questions","question_type","subject",
149
+ "class_grade","topic","difficulty","bloom_level","instructions"
150
+ ],
151
+ template=(
152
  """
153
  Based on the following context:
154
  {context}
 
165
  Format as JSON:
166
  {"questions": [{"question":"","options":[],"correctAnswer":"","explanation":""}]}
167
  """
168
+ )
 
 
 
 
 
 
 
 
169
  )
170
  )
171
 
172
  def generate_questions(self, topic_data: Dict[str, Any], vectorstore: Any) -> Dict[str, Any]:
173
  context = ""
174
  if vectorstore:
175
+ docs = vectorstore.similarity_search(f"{topic_data['subjectName']} {topic_data['sectionName']}", k=3)
176
+ context = "\n".join(getattr(doc, 'page_content', '') for doc in docs)
 
 
177
  logging.info(f"Context length: {len(context)}")
178
+ payload = {
179
  "context": context,
180
  "num_questions": topic_data['numQuestions'],
181
  "question_type": topic_data['questionType'],
 
185
  "difficulty": topic_data['difficulty'],
186
  "bloom_level": topic_data['bloomLevel'],
187
  "instructions": topic_data.get('additionalInstructions','')
188
+ }
189
+ response = self.chain.invoke(payload)
190
+ text = response.get('text', response) if isinstance(response, dict) else response
191
  output = text.strip()
192
+ if output.startswith('```') and output.endswith('```'):
193
+ output = re.sub(r'^```[a-zA-Z]*|```$', '', output).strip()
 
194
  try:
195
  result = json.loads(output)
196
  except json.JSONDecodeError:
197
+ logging.error(f"JSON parsing failed. Raw output: {output}")
198
+ raise ValueError(f"Failed to parse JSON from LLM output: {output}")
199
  if 'questions' not in result:
200
+ raise ValueError(f"Missing 'questions' key in output JSON: {result}")
201
  return result
202
 
203
  class QuestionEvaluator:
 
214
  )
215
 
216
  def evaluate(self, question: str, answer: str, reference: str) -> Dict[str, Any]:
217
+ """Evaluate question-answer pair against reference."""
218
  try:
219
  return self.evaluator.evaluate_strings(
220
  input=question,
 
225
  logging.error(f"Evaluation error: {e}")
226
  raise
227
 
228
+ # CLI test
229
def main():
    """Run a small manual smoke test of the document processor.

    Processes a fixed sample string and prints the resulting chunks. If a
    file named ``sample.pdf`` exists in the working directory, it is
    processed too and its chunk count is printed.
    """
    processor = DocumentProcessor()

    sample = "This is a simple test. It splits into chunks and embeds."
    _, chunks = processor.process_text(sample)
    print("Chunks:", chunks)

    # The PDF check is optional; without the file there is nothing more to do.
    if not os.path.exists('sample.pdf'):
        return
    _, raw = processor.process_uploaded_document('sample.pdf')
    print("PDF raw chunks count:", len(raw))
237
+
238
+ if __name__ == "__main__":
239
+ main()