bibeksah committed
Commit 08c1eb5
1 Parent(s): 1756424

changing field from pdf to raw data

Files changed (1)
mylangv2.py +72 -140
mylangv2.py CHANGED
@@ -7,15 +7,18 @@ from langchain_core.prompts import PromptTemplate
 from langchain.evaluation import load_evaluator
 import os
 from dotenv import load_dotenv
-from typing import Dict, List, Any, Optional, Tuple
+from typing import Dict, List, Any, Tuple
 import logging
 from datetime import datetime
 import re
 import json
 
-
 # Load environment variables
-load_dotenv()
+dotenv_path = os.getenv('DOTENV_PATH', None)
+if dotenv_path:
+    load_dotenv(dotenv_path)
+else:
+    load_dotenv()
 
 # Check for required environment variables at startup
 REQUIRED_ENV_VARS = [
@@ -31,16 +34,16 @@ if missing_vars:
 
 logging.info("Checking Azure environment variables...")
 for var in REQUIRED_ENV_VARS:
-    logging.info(f"{var}: {os.environ.get(var, 'Not Set')}")
+    logging.info(f"{var}: {os.environ.get(var)}")
 
 class DocumentProcessor:
     def __init__(self):
-        # Use Azure OpenAI for embeddings
+        # Use Azure OpenAI Embeddings with model parameter
         self.embeddings = AzureOpenAIEmbeddings(
            azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
            api_key=os.environ["AZURE_OPENAI_API_KEY"],
            api_version=os.environ["AZURE_OPENAI_API_VERSION"],
-            deployment_name="text-embedding-ada-002",
+            model=os.environ["AZURE_OPENAI_EMBEDDING_DEPLOYMENT"]
        )
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
@@ -49,45 +52,32 @@ class DocumentProcessor:
            separators=["\n\n", "\n", " ", ""]
        )
 
-    def process_uploaded_document(self, pdf_path, persist_directory=None) -> Tuple[Any, List[Any]]:
+    def process_uploaded_document(self, pdf_path: str, persist_directory: str = None) -> Tuple[Any, List[Any]]:
        """Process uploaded PDF document and create vector store"""
        try:
-            # Load PDF using LangChain
            loader = PyPDFLoader(pdf_path)
            pages = loader.load()
-
-            # Split text into chunks
            texts = self.text_splitter.split_documents(pages)
-
-            # Create vector store with FAISS
            vectorstore = FAISS.from_documents(
                documents=texts,
                embedding=self.embeddings
            )
-            # Save the vector store
-            if persist_directory:
-                vectorstore.save_local(persist_directory)
-            else:
-                vectorstore.save_local("./faiss_index")
-
+            dest = persist_directory or "./faiss_index"
+            vectorstore.save_local(dest)
            logging.info(f"Successfully processed PDF '{pdf_path}' into {len(texts)} chunks.")
            return vectorstore, texts
        except Exception as e:
            logging.error(f"Error processing document: {str(e)}")
            raise
 
-    def process_text(self, text: str) -> Tuple[Any, List[Any]]:
+    def process_text(self, text: str) -> Tuple[Any, List[str]]:
        """Process raw text and create vector store"""
        try:
-            # Split text into chunks
            texts = self.text_splitter.split_text(text)
-
-            # Create vector store with FAISS
            vectorstore = FAISS.from_texts(
                texts=texts,
                embedding=self.embeddings
            )
-
            logging.info(f"Successfully processed raw text into {len(texts)} chunks.")
            return vectorstore, texts
        except Exception as e:
@@ -96,47 +86,46 @@ class DocumentProcessor:
 
 class QuestionGenerator:
    def __init__(self):
-        # Use Azure OpenAI for chat
+        # Use Azure OpenAI for chat with model parameter
        self.llm = AzureChatOpenAI(
            azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
            api_key=os.environ["AZURE_OPENAI_API_KEY"],
            api_version=os.environ["AZURE_OPENAI_API_VERSION"],
-            deployment_name=os.environ.get("AZURE_OPENAI_CHAT_DEPLOYMENT", "your-chat-deployment"),
-            model="gpt-4",
+            model=os.environ.get("AZURE_OPENAI_CHAT_DEPLOYMENT"),
            temperature=0.3
        )
 
        self.question_template = """
-        Based on the following context from study materials:
-        {context}
-
-        Generate {num_questions} {question_type} questions for:
-        Subject: {subject}
-        Class: {class_grade}
-        Topic: {topic}
-        Difficulty: {difficulty}
-        Bloom's Level: {bloom_level}
-
-        Additional Instructions: {instructions}
-
-        Generate questions that:
-        1. Are directly related to the provided context
-        2. Test understanding at the specified Bloom's level
-        3. Match the difficulty level
-        4. Include detailed explanations
-
-        Format the response as a JSON object with the following structure:
-        {{
-        "questions": [
-        {{
-        "question": "question text",
-        "options": ["option1", "option2", "option3", "option4"],
-        "correctAnswer": "correct answer",
-        "explanation": "detailed explanation"
-        }}
-        ]
-        }}
-        """
+        Based on the following context from study materials:
+        {context}
+
+        Generate {num_questions} {question_type} questions for:
+        Subject: {subject}
+        Class: {class_grade}
+        Topic: {topic}
+        Difficulty: {difficulty}
+        Bloom's Level: {bloom_level}
+
+        Additional Instructions: {instructions}
+
+        Generate questions that:
+        1. Are directly related to the provided context
+        2. Test understanding at the specified Bloom's level
+        3. Match the difficulty level
+        4. Include detailed explanations
+
+        Format the response as a JSON object with the following structure:
+        {
+        "questions": [
+        {
+        "question": "question text",
+        "options": ["option1", "option2", "option3", "option4"],
+        "correctAnswer": "correct answer",
+        "explanation": "detailed explanation"
+        }
+        ]
+        }
+        """
 
        self.prompt = PromptTemplate(
            input_variables=[
@@ -150,21 +139,14 @@ class QuestionGenerator:
 
    def generate_questions(self, topic_data: Dict[str, Any], vectorstore: Any) -> Dict[str, Any]:
        try:
-            # Initialize context as empty string
            context = ""
-
-            # Only do summarization if vectorstore exists
            if vectorstore:
-                # Get relevant documents from vectorstore
                docs = vectorstore.similarity_search(
                    f"{topic_data['subjectName']} {topic_data['sectionName']}",
                    k=3
                )
-                # Use the raw context from documents
                context = "\n".join(doc.page_content for doc in docs)
                logging.info(f"Using context from vectorstore: {context[:100]}...")
-
-            # Generate questions using the main chain
            response = self.chain.invoke({
                "context": context,
                "num_questions": topic_data['numQuestions'],
@@ -176,67 +158,44 @@ class QuestionGenerator:
                "bloom_level": topic_data['bloomLevel'],
                "instructions": topic_data.get('additionalInstructions', '')
            })
-
-            # Clean and parse the response
            llm_output = response['text'] if isinstance(response, dict) and 'text' in response else response
-            logging.info(f"Raw LLM output before cleaning: {llm_output}")
-
-            # Remove code block markers and leading 'json'
            llm_output = llm_output.strip()
            if llm_output.startswith('```'):
                llm_output = re.sub(r'^```[a-zA-Z]*\s*', '', llm_output)
                llm_output = re.sub(r'```$', '', llm_output)
-                llm_output = llm_output.strip()
-
+            llm_output = llm_output.strip()
            try:
                result = json.loads(llm_output)
-            except Exception as e:
-                logging.error(f"Failed to parse LLM response as JSON: {e}\nRaw output: {llm_output}")
-                # Try to extract the first JSON object from the output
+            except Exception:
                match = re.search(r'\{[\s\S]*\}', llm_output)
                if match:
-                    json_str = match.group(0)
-                    try:
-                        result = json.loads(json_str)
-                        logging.info("Successfully parsed JSON after extracting from output.")
-                    except Exception as e2:
-                        logging.error(f"Still failed to parse extracted JSON: {e2}\nExtracted: {json_str}")
-                        raise
+                    result = json.loads(match.group(0))
                else:
                    raise
-
-            # Validate the result structure
            if not isinstance(result, dict) or 'questions' not in result:
                raise ValueError("Invalid response format: missing 'questions' key")
-
-            # Validate each question
            for i, q in enumerate(result['questions']):
-                missing_fields = [field for field in ['question', 'options'] if field not in q]
+                missing = [f for f in ['question', 'options'] if f not in q]
                if 'answer' not in q and 'correctAnswer' not in q:
-                    missing_fields.append('answer/correctAnswer')
-                if missing_fields:
-                    logging.error(f"Question {i} missing fields: {missing_fields}")
-                    logging.error(f"Question data: {json.dumps(q, indent=2)}")
-                    raise ValueError(f"Invalid question format: missing required fields {missing_fields}")
-
+                    missing.append('answer/correctAnswer')
+                if missing:
+                    raise ValueError(f"Invalid question format: missing {missing}")
            return result
-
        except Exception as e:
            logging.error(f"Error generating questions: {e}")
            raise
 
 class QuestionEvaluator:
    def __init__(self):
-        # Using the correct evaluator type with proper configuration
+        common_kwargs = {
+            "azure_endpoint": os.environ["AZURE_OPENAI_ENDPOINT"],
+            "api_key": os.environ["AZURE_OPENAI_API_KEY"],
+            "api_version": os.environ["AZURE_OPENAI_API_VERSION"],
+            "model": os.environ.get("AZURE_OPENAI_CHAT_DEPLOYMENT"),
+        }
        self.evaluator = load_evaluator(
            "qa",
-            llm=AzureChatOpenAI(
-                azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
-                api_key=os.environ["AZURE_OPENAI_API_KEY"],
-                api_version=os.environ["AZURE_OPENAI_API_VERSION"],
-                deployment_name=os.environ.get("AZURE_OPENAI_CHAT_DEPLOYMENT", "your-chat-deployment"),
-                temperature=0
-            ),
+            llm=AzureChatOpenAI(**common_kwargs, temperature=0),
            criteria={
                "relevance": "Is the answer relevant to the question?",
                "correctness": "Is the answer factually correct based on the context?",
@@ -244,69 +203,42 @@ class QuestionEvaluator:
            }
        )
        self.feedback_chain = LLMChain(
-            llm=AzureChatOpenAI(
-                azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
-                api_key=os.environ["AZURE_OPENAI_API_KEY"],
-                api_version=os.environ["AZURE_OPENAI_API_VERSION"],
-                deployment_name=os.environ.get("AZURE_OPENAI_CHAT_DEPLOYMENT", "your-chat-deployment"),
-                temperature=0.3
-            ),
+            llm=AzureChatOpenAI(**common_kwargs, temperature=0.3),
            prompt=PromptTemplate(
                input_variables=["question", "feedback"],
                template="""
-                Improve the following question based on the feedback:
-                Question: {question}
-                Feedback: {feedback}
-
-                Provide an improved version that addresses the feedback while maintaining
-                the original learning objectives and difficulty level.
-                """
+                Improve the following question based on the feedback:
+                Question: {question}
+                Feedback: {feedback}
+
+                Provide an improved version that addresses the feedback while maintaining
+                the original learning objectives and difficulty level.
+                """
            )
        )
 
    def evaluate_question(self, question: Dict[str, Any], context: str) -> Dict[str, Any]:
-        """Evaluate the quality of a generated question"""
        try:
            logging.info(f"Evaluating question: {question['question']}")
-            logging.info(f"Answer: {question['correctAnswer']}")
-            logging.info(f"Context length: {len(context)} characters")
-
-            # Using the correct evaluation method
            evaluation = self.evaluator.evaluate_strings(
-                prediction=question['correctAnswer'],
+                prediction=question.get('correctAnswer', ''),
                input=question['question'],
                reference=context
            )
-
-            logging.info(f"Evaluation results: {evaluation}")
            return evaluation
        except Exception as e:
-            logging.error(f"Error evaluating question: {str(e)}")
-            logging.error(f"Available evaluator methods: {dir(self.evaluator)}")
+            logging.error(f"Error evaluating question: {e}")
            raise
 
    def incorporate_feedback(self, question: Dict[str, Any], feedback: str) -> Dict[str, Any]:
-        """Incorporate feedback to improve a question"""
        try:
-            improved_question = self.feedback_chain.invoke({
-                "question": question,
-                "feedback": feedback
-            })
-            logging.info("Incorporated feedback into question.")
-            return improved_question
+            return self.feedback_chain.invoke({"question": question['question'], "feedback": feedback})
        except Exception as e:
-            logging.error(f"Error incorporating feedback: {str(e)}")
+            logging.error(f"Error incorporating feedback: {e}")
            raise
 
-# Initialize the components
+# Initialize components
 document_processor = DocumentProcessor()
 question_generator = QuestionGenerator()
 question_evaluator = QuestionEvaluator()
 
-# ---
-# Required environment variables for Azure OpenAI:
-# AZURE_OPENAI_API_KEY
-# AZURE_OPENAI_ENDPOINT
-# AZURE_OPENAI_EMBEDDING_DEPLOYMENT (embedding deployment name)
-# AZURE_OPENAI_CHAT_DEPLOYMENT (chat deployment name)
-# AZURE_OPENAI_API_VERSION (API version for both embeddings and chat)
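
For reference, a minimal sketch of how the raw-data path introduced by this commit might be exercised. It assumes the Azure environment variables checked at startup (AZURE_OPENAI_ENDPOINT, AZURE_OPENAI_API_KEY, AZURE_OPENAI_API_VERSION, AZURE_OPENAI_EMBEDDING_DEPLOYMENT, AZURE_OPENAI_CHAT_DEPLOYMENT, and optionally DOTENV_PATH) are set before import, and that the file is importable as mylangv2; the sample text and query strings are illustrative placeholders, not values from the repository.

# Sketch only: module import instantiates DocumentProcessor/QuestionGenerator,
# so the Azure env vars above must already be exported.
from mylangv2 import document_processor

raw_text = "Photosynthesis converts light energy into chemical energy ..."  # placeholder study material

# New raw-data path: split the text into chunks and build an in-memory FAISS store.
vectorstore, chunks = document_processor.process_text(raw_text)

# generate_questions (above) pulls context with a similarity search over
# "<subjectName> <sectionName>"; topic_data fields beyond those visible in this
# diff are not shown here and would need to be supplied for a real call.
docs = vectorstore.similarity_search("Biology Photosynthesis", k=3)
print(len(chunks), "chunks;", len(docs), "context passages retrieved")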