bibeksah committed
Commit 08c1eb5
1 Parent(s): 1756424

changing field from pdf to raw data

Files changed (1)
mylangv2.py +72 -140
mylangv2.py CHANGED
@@ -7,15 +7,18 @@ from langchain_core.prompts import PromptTemplate
 from langchain.evaluation import load_evaluator
 import os
 from dotenv import load_dotenv
-from typing import Dict, List, Any, Optional, Tuple
+from typing import Dict, List, Any, Tuple
 import logging
 from datetime import datetime
 import re
 import json
 
-
 # Load environment variables
-load_dotenv()
+dotenv_path = os.getenv('DOTENV_PATH', None)
+if dotenv_path:
+    load_dotenv(dotenv_path)
+else:
+    load_dotenv()
 
 # Check for required environment variables at startup
 REQUIRED_ENV_VARS = [
@@ -31,16 +34,16 @@ if missing_vars:
 
 logging.info("Checking Azure environment variables...")
 for var in REQUIRED_ENV_VARS:
-    logging.info(f"{var}: {os.environ.get(var, 'Not Set')}")
+    logging.info(f"{var}: {os.environ.get(var)}")
 
 class DocumentProcessor:
     def __init__(self):
-        # Use Azure OpenAI for embeddings
+        # Use Azure OpenAI Embeddings with model parameter
         self.embeddings = AzureOpenAIEmbeddings(
            azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
            api_key=os.environ["AZURE_OPENAI_API_KEY"],
            api_version=os.environ["AZURE_OPENAI_API_VERSION"],
-            deployment_name="text-embedding-ada-002",
+            model=os.environ["AZURE_OPENAI_EMBEDDING_DEPLOYMENT"]
        )
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
@@ -49,45 +52,32 @@ class DocumentProcessor:
            separators=["\n\n", "\n", " ", ""]
        )
 
-    def process_uploaded_document(self, pdf_path, persist_directory=None) -> Tuple[Any, List[Any]]:
+    def process_uploaded_document(self, pdf_path: str, persist_directory: str = None) -> Tuple[Any, List[Any]]:
        """Process uploaded PDF document and create vector store"""
        try:
-            # Load PDF using LangChain
            loader = PyPDFLoader(pdf_path)
            pages = loader.load()
-
-            # Split text into chunks
            texts = self.text_splitter.split_documents(pages)
-
-            # Create vector store with FAISS
            vectorstore = FAISS.from_documents(
                documents=texts,
                embedding=self.embeddings
            )
-            # Save the vector store
-            if persist_directory:
-                vectorstore.save_local(persist_directory)
-            else:
-                vectorstore.save_local("./faiss_index")
-
+            dest = persist_directory or "./faiss_index"
+            vectorstore.save_local(dest)
            logging.info(f"Successfully processed PDF '{pdf_path}' into {len(texts)} chunks.")
            return vectorstore, texts
        except Exception as e:
            logging.error(f"Error processing document: {str(e)}")
            raise
 
-    def process_text(self, text: str) -> Tuple[Any, List[Any]]:
+    def process_text(self, text: str) -> Tuple[Any, List[str]]:
        """Process raw text and create vector store"""
        try:
-            # Split text into chunks
            texts = self.text_splitter.split_text(text)
-
-            # Create vector store with FAISS
            vectorstore = FAISS.from_texts(
                texts=texts,
                embedding=self.embeddings
            )
-
            logging.info(f"Successfully processed raw text into {len(texts)} chunks.")
            return vectorstore, texts
        except Exception as e:
@@ -96,47 +86,46 @@ class DocumentProcessor:
 
 class QuestionGenerator:
    def __init__(self):
-        # Use Azure OpenAI for chat
+        # Use Azure OpenAI for chat with model parameter
        self.llm = AzureChatOpenAI(
            azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
            api_key=os.environ["AZURE_OPENAI_API_KEY"],
            api_version=os.environ["AZURE_OPENAI_API_VERSION"],
-            deployment_name=os.environ.get("AZURE_OPENAI_CHAT_DEPLOYMENT", "your-chat-deployment"),
-            model="gpt-4",
+            model=os.environ.get("AZURE_OPENAI_CHAT_DEPLOYMENT"),
            temperature=0.3
        )
 
        self.question_template = """
-        Based on the following context from study materials:
-        {context}
-
-        Generate {num_questions} {question_type} questions for:
-        Subject: {subject}
-        Class: {class_grade}
-        Topic: {topic}
-        Difficulty: {difficulty}
-        Bloom's Level: {bloom_level}
-
-        Additional Instructions: {instructions}
-
-        Generate questions that:
-        1. Are directly related to the provided context
-        2. Test understanding at the specified Bloom's level
-        3. Match the difficulty level
-        4. Include detailed explanations
-
-        Format the response as a JSON object with the following structure:
-        {{
-        "questions": [
-        {{
-        "question": "question text",
-        "options": ["option1", "option2", "option3", "option4"],
-        "correctAnswer": "correct answer",
-        "explanation": "detailed explanation"
-        }}
-        ]
-        }}
-        """
+        Based on the following context from study materials:
+        {context}
+
+        Generate {num_questions} {question_type} questions for:
+        Subject: {subject}
+        Class: {class_grade}
+        Topic: {topic}
+        Difficulty: {difficulty}
+        Bloom's Level: {bloom_level}
+
+        Additional Instructions: {instructions}
+
+        Generate questions that:
+        1. Are directly related to the provided context
+        2. Test understanding at the specified Bloom's level
+        3. Match the difficulty level
+        4. Include detailed explanations
+
+        Format the response as a JSON object with the following structure:
+        {
+        "questions": [
+        {
+        "question": "question text",
+        "options": ["option1", "option2", "option3", "option4"],
+        "correctAnswer": "correct answer",
+        "explanation": "detailed explanation"
+        }
+        ]
+        }
+        """
 
        self.prompt = PromptTemplate(
            input_variables=[
@@ -150,21 +139,14 @@ class QuestionGenerator:
 
    def generate_questions(self, topic_data: Dict[str, Any], vectorstore: Any) -> Dict[str, Any]:
        try:
-            # Initialize context as empty string
            context = ""
-
-            # Only do summarization if vectorstore exists
            if vectorstore:
-                # Get relevant documents from vectorstore
                docs = vectorstore.similarity_search(
                    f"{topic_data['subjectName']} {topic_data['sectionName']}",
                    k=3
                )
-                # Use the raw context from documents
                context = "\n".join(doc.page_content for doc in docs)
                logging.info(f"Using context from vectorstore: {context[:100]}...")
-
-            # Generate questions using the main chain
            response = self.chain.invoke({
                "context": context,
                "num_questions": topic_data['numQuestions'],
@@ -176,67 +158,44 @@ class QuestionGenerator:
                "bloom_level": topic_data['bloomLevel'],
                "instructions": topic_data.get('additionalInstructions', '')
            })
-
-            # Clean and parse the response
            llm_output = response['text'] if isinstance(response, dict) and 'text' in response else response
-            logging.info(f"Raw LLM output before cleaning: {llm_output}")
-
-            # Remove code block markers and leading 'json'
            llm_output = llm_output.strip()
            if llm_output.startswith('```'):
                llm_output = re.sub(r'^```[a-zA-Z]*\s*', '', llm_output)
                llm_output = re.sub(r'```$', '', llm_output)
-                llm_output = llm_output.strip()
-
+            llm_output = llm_output.strip()
            try:
                result = json.loads(llm_output)
-            except Exception as e:
-                logging.error(f"Failed to parse LLM response as JSON: {e}\nRaw output: {llm_output}")
-                # Try to extract the first JSON object from the output
+            except Exception:
                match = re.search(r'\{[\s\S]*\}', llm_output)
                if match:
-                    json_str = match.group(0)
-                    try:
-                        result = json.loads(json_str)
-                        logging.info("Successfully parsed JSON after extracting from output.")
-                    except Exception as e2:
-                        logging.error(f"Still failed to parse extracted JSON: {e2}\nExtracted: {json_str}")
-                        raise
+                    result = json.loads(match.group(0))
                else:
                    raise
-
-            # Validate the result structure
            if not isinstance(result, dict) or 'questions' not in result:
                raise ValueError("Invalid response format: missing 'questions' key")
-
-            # Validate each question
            for i, q in enumerate(result['questions']):
-                missing_fields = [field for field in ['question', 'options'] if field not in q]
+                missing = [f for f in ['question', 'options'] if f not in q]
                if 'answer' not in q and 'correctAnswer' not in q:
-                    missing_fields.append('answer/correctAnswer')
-                if missing_fields:
-                    logging.error(f"Question {i} missing fields: {missing_fields}")
-                    logging.error(f"Question data: {json.dumps(q, indent=2)}")
-                    raise ValueError(f"Invalid question format: missing required fields {missing_fields}")
-
+                    missing.append('answer/correctAnswer')
+                if missing:
+                    raise ValueError(f"Invalid question format: missing {missing}")
            return result
-
        except Exception as e:
            logging.error(f"Error generating questions: {e}")
            raise
 
 class QuestionEvaluator:
    def __init__(self):
-        # Using the correct evaluator type with proper configuration
+        common_kwargs = {
+            "azure_endpoint": os.environ["AZURE_OPENAI_ENDPOINT"],
+            "api_key": os.environ["AZURE_OPENAI_API_KEY"],
+            "api_version": os.environ["AZURE_OPENAI_API_VERSION"],
+            "model": os.environ.get("AZURE_OPENAI_CHAT_DEPLOYMENT"),
+        }
        self.evaluator = load_evaluator(
            "qa",
-            llm=AzureChatOpenAI(
-                azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
-                api_key=os.environ["AZURE_OPENAI_API_KEY"],
-                api_version=os.environ["AZURE_OPENAI_API_VERSION"],
-                deployment_name=os.environ.get("AZURE_OPENAI_CHAT_DEPLOYMENT", "your-chat-deployment"),
-                temperature=0
-            ),
+            llm=AzureChatOpenAI(**common_kwargs, temperature=0),
            criteria={
                "relevance": "Is the answer relevant to the question?",
                "correctness": "Is the answer factually correct based on the context?",
@@ -244,69 +203,42 @@ class QuestionEvaluator:
            }
        )
        self.feedback_chain = LLMChain(
-            llm=AzureChatOpenAI(
-                azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
-                api_key=os.environ["AZURE_OPENAI_API_KEY"],
-                api_version=os.environ["AZURE_OPENAI_API_VERSION"],
-                deployment_name=os.environ.get("AZURE_OPENAI_CHAT_DEPLOYMENT", "your-chat-deployment"),
-                temperature=0.3
-            ),
+            llm=AzureChatOpenAI(**common_kwargs, temperature=0.3),
            prompt=PromptTemplate(
                input_variables=["question", "feedback"],
                template="""
-                Improve the following question based on the feedback:
-                Question: {question}
-                Feedback: {feedback}
-
-                Provide an improved version that addresses the feedback while maintaining
-                the original learning objectives and difficulty level.
-                """
+                Improve the following question based on the feedback:
+                Question: {question}
+                Feedback: {feedback}
+
+                Provide an improved version that addresses the feedback while maintaining
+                the original learning objectives and difficulty level.
+                """
            )
        )
 
    def evaluate_question(self, question: Dict[str, Any], context: str) -> Dict[str, Any]:
-        """Evaluate the quality of a generated question"""
        try:
            logging.info(f"Evaluating question: {question['question']}")
-            logging.info(f"Answer: {question['correctAnswer']}")
-            logging.info(f"Context length: {len(context)} characters")
-
-            # Using the correct evaluation method
            evaluation = self.evaluator.evaluate_strings(
-                prediction=question['correctAnswer'],
+                prediction=question.get('correctAnswer', ''),
                input=question['question'],
                reference=context
            )
-
-            logging.info(f"Evaluation results: {evaluation}")
            return evaluation
        except Exception as e:
-            logging.error(f"Error evaluating question: {str(e)}")
-            logging.error(f"Available evaluator methods: {dir(self.evaluator)}")
+            logging.error(f"Error evaluating question: {e}")
            raise
 
    def incorporate_feedback(self, question: Dict[str, Any], feedback: str) -> Dict[str, Any]:
-        """Incorporate feedback to improve a question"""
        try:
-            improved_question = self.feedback_chain.invoke({
-                "question": question,
-                "feedback": feedback
-            })
-            logging.info("Incorporated feedback into question.")
-            return improved_question
+            return self.feedback_chain.invoke({"question": question['question'], "feedback": feedback})
        except Exception as e:
-            logging.error(f"Error incorporating feedback: {str(e)}")
+            logging.error(f"Error incorporating feedback: {e}")
            raise
 
-# Initialize the components
+# Initialize components
 document_processor = DocumentProcessor()
 question_generator = QuestionGenerator()
 question_evaluator = QuestionEvaluator()
 
-# ---
-# Required environment variables for Azure OpenAI:
-# AZURE_OPENAI_API_KEY
-# AZURE_OPENAI_ENDPOINT
-# AZURE_OPENAI_EMBEDDING_DEPLOYMENT (embedding deployment name)
-# AZURE_OPENAI_CHAT_DEPLOYMENT (chat deployment name)
-# AZURE_OPENAI_API_VERSION (API version for both embeddings and chat)
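
For reference, a minimal sketch of how the raw-data path introduced by this commit might be exercised. It assumes the Azure environment variables checked at startup (AZURE_OPENAI_ENDPOINT, AZURE_OPENAI_API_KEY, AZURE_OPENAI_API_VERSION, AZURE_OPENAI_EMBEDDING_DEPLOYMENT, AZURE_OPENAI_CHAT_DEPLOYMENT, and optionally DOTENV_PATH) are set before import, and that the file is importable as mylangv2; the sample text and query strings are illustrative placeholders, not values from the repository.

# Sketch only: module import instantiates DocumentProcessor/QuestionGenerator,
# so the Azure env vars above must already be exported.
from mylangv2 import document_processor

raw_text = "Photosynthesis converts light energy into chemical energy ..."  # placeholder study material

# New raw-data path: split the text into chunks and build an in-memory FAISS store.
vectorstore, chunks = document_processor.process_text(raw_text)

# generate_questions (above) pulls context with a similarity search over
# "<subjectName> <sectionName>"; topic_data fields beyond those visible in this
# diff are not shown here and would need to be supplied for a real call.
docs = vectorstore.similarity_search("Biology Photosynthesis", k=3)
print(len(chunks), "chunks;", len(docs), "context passages retrieved")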