Spaces:

bibeksah
/

Prashnotri

Sleeping

App Files Community

bibeksah committed on May 20

Commit

323e21c

1 Parent(s): 7ecbb59

changing field from pdf to raw data

Browse files

Files changed (1) hide show

mylangv2.py +77 -81

mylangv2.py CHANGED Viewed

@@ -1,5 +1,5 @@
 """
-Mylangv2: Process raw text or uploaded PDF into vectorstore and generate questions via Azure OpenAI using LangChain.
 Includes a simple CLI test at the bottom to verify both `process_text` and `process_uploaded_document`.
 """
 import os
@@ -9,14 +9,14 @@ import json
 from dotenv import load_dotenv
 from typing import Dict, List, Any, Tuple
-# Load env vars
-dotenv_path = os.getenv('DOTENV_PATH')
 if dotenv_path:
     load_dotenv(dotenv_path)
 else:
     load_dotenv()
-# Validate env vars
 def check_env():
     required = [
         "AZURE_OPENAI_API_KEY",
@@ -31,55 +31,60 @@ def check_env():
 check_env()
-# Azure/OpenAI and LangChain imports
 from langchain_text_splitters import RecursiveCharacterTextSplitter
 from langchain_openai import AzureOpenAIEmbeddings, AzureChatOpenAI
 from langchain.document_loaders import PyPDFLoader
 # Vectorstore: FAISS
 try:
     from langchain_community.vectorstores import FAISS
 except ImportError as e:
     FAISS = None
     logging.warning(
-        "FAISS import failed (%s). "
-        "Install faiss-cpu/faiss-gpu compatible with NumPy 2.0+ "
-        "or downgrade NumPy to <2.0 to use FAISS." % e
     )
-from langchain.chains import LLMChain
-from langchain_core.prompts import PromptTemplate
-from langchain.evaluation import load_evaluator
-# Configure logging
-logging.basicConfig(level=logging.INFO)
-# Fallback in-memory vectorstore if FAISS is unavailable at runtime
-def _fallback_vectorstore(texts, embeddings_client):
-    """Creates a basic in-memory vectorstore with cosine similarity search."""
     import numpy as _np
-    # get embeddings for texts
     embs = embeddings_client.embed_documents(texts)
     class Doc:
-        def __init__(self, content): self.page_content = content
     class BasicVectorStore:
-        def __init__(self, texts, embs):
             self.texts = texts
             self.embs = embs
-        def similarity_search(self, query, k=3):
-            # embed query
             q_emb = embeddings_client.embed_query(query)
-            # cosine similarities
-            sims = (_np.dot(q_emb, emb) / (_np.linalg.norm(q_emb) * _np.linalg.norm(emb)) for emb in self.embs)
-            sims = list(sims)
-            # get top k
             idxs = sorted(range(len(sims)), key=lambda i: sims[i], reverse=True)[:k]
             return [Doc(self.texts[i]) for i in idxs]
-    return BasicVectorStore(texts, embs)
-logging.basicConfig(level=logging.INFO)
 class DocumentProcessor:
     def __init__(self):
-        # Initialize Azure embeddings client
         self.embeddings = AzureOpenAIEmbeddings(
             azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
             api_key=os.getenv("AZURE_OPENAI_API_KEY"),
@@ -94,10 +99,8 @@ class DocumentProcessor:
         )
     def process_text(self, text: str, persist_directory: str = None) -> Tuple[Any, List[str]]:
-        """Split raw text, build FAISS vectorstore (or fallback), return store and chunks."""
         texts = self.text_splitter.split_text(text)
-        # Try FAISS, else fallback
-        if FAISS is not None:
             try:
                 vs = FAISS.from_texts(texts=texts, embedding=self.embeddings)
                 if persist_directory:
@@ -105,40 +108,32 @@ class DocumentProcessor:
                 logging.info(f"Processed {len(texts)} chunks into FAISS vectorstore.")
                 return vs, texts
             except Exception as e:
-                logging.warning(f"FAISS failed ({e}), using in-memory fallback.")
-        # Fallback
         vs_fb = _fallback_vectorstore(texts, self.embeddings)
         logging.info(f"Processed {len(texts)} chunks into fallback vectorstore.")
         return vs_fb, texts
     def process_uploaded_document(self, pdf_path: str, persist_directory: str = None) -> Tuple[Any, List[str]]:
-        """Load and split PDF, build FAISS vectorstore, return store and raw text chunks."""
-        if FAISS is None:
-            raise ImportError(
-                "FAISS vectorstore is unavailable. "
-                "Install faiss-cpu/faiss-gpu or adjust NumPy version."
-            )
         loader = PyPDFLoader(pdf_path)
         pages = loader.load()
-        texts = self.text_splitter.split_documents(pages)
-        try:
-            vectorstore = FAISS.from_documents(
-                documents=texts,
-                embedding=self.embeddings
-            )
-        except Exception as e:
-            logging.error(f"FAISS.from_documents error: {e}")
-            raise
-        if persist_directory:
-            vectorstore.save_local(persist_directory)
-        logging.info(f"Processed PDF with {len(texts)} chunks into FAISS vectorstore.")
-        # Return raw strings for reference
-        raw_texts = [doc.page_content for doc in texts]
-        return vectorstore, raw_texts
 class QuestionGenerator:
     def __init__(self):
-        # Chat LLM
         self.llm = AzureChatOpenAI(
             azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
             api_key=os.getenv("AZURE_OPENAI_API_KEY"),
@@ -146,7 +141,14 @@ class QuestionGenerator:
             model=os.getenv("AZURE_OPENAI_CHAT_DEPLOYMENT"),
             temperature=0.3
         )
-        template = (
 """
 Based on the following context:
 {context}
@@ -163,27 +165,17 @@ Additional Instructions: {instructions}
 Format as JSON:
 {"questions": [{"question":"","options":[],"correctAnswer":"","explanation":""}]}
 """
-        )
-        self.chain = LLMChain(
-            llm=self.llm,
-            prompt=PromptTemplate(
-                input_variables=[
-                    "context","num_questions","question_type","subject",
-                    "class_grade","topic","difficulty","bloom_level","instructions"
-                ],
-                template=template
             )
         )
     def generate_questions(self, topic_data: Dict[str, Any], vectorstore: Any) -> Dict[str, Any]:
         context = ""
         if vectorstore:
-            docs = vectorstore.similarity_search(
-                f"{topic_data['subjectName']} {topic_data['sectionName']}", k=3
-            )
-            context = "\n".join(doc.page_content for doc in docs)
             logging.info(f"Context length: {len(context)}")
-        response = self.chain.invoke({
             "context": context,
             "num_questions": topic_data['numQuestions'],
             "question_type": topic_data['questionType'],
@@ -193,18 +185,19 @@ Format as JSON:
             "difficulty": topic_data['difficulty'],
             "bloom_level": topic_data['bloomLevel'],
             "instructions": topic_data.get('additionalInstructions','')
-        })
-        text = response['text'] if isinstance(response, dict) and 'text' in response else response
         output = text.strip()
-        if output.startswith('```'):
-            output = re.sub(r'^```[a-zA-Z]*','', output)
-            output = re.sub(r'```$','', output).strip()
         try:
             result = json.loads(output)
         except json.JSONDecodeError:
-            raise ValueError(f"Failed to parse JSON: {output}")
         if 'questions' not in result:
-            raise ValueError("Missing 'questions' in output JSON")
         return result
 class QuestionEvaluator:
@@ -221,6 +214,7 @@ class QuestionEvaluator:
         )
     def evaluate(self, question: str, answer: str, reference: str) -> Dict[str, Any]:
         try:
             return self.evaluator.evaluate_strings(
                 input=question,
@@ -231,13 +225,15 @@ class QuestionEvaluator:
             logging.error(f"Evaluation error: {e}")
             raise
-# Simple CLI test
-if __name__ == "__main__":
     dp = DocumentProcessor()
     sample = "This is a simple test. It splits into chunks and embeds."
     vs, chunks = dp.process_text(sample)
     print("Chunks:", chunks)
-    # optional PDF test (if sample.pdf exists)
     if os.path.exists('sample.pdf'):
         vs2, raw = dp.process_uploaded_document('sample.pdf')
         print("PDF raw chunks count:", len(raw))

 """
+Mylangv2: Process raw text or uploaded document into vectorstore and generate questions via Azure OpenAI using LangChain.
 Includes a simple CLI test at the bottom to verify both `process_text` and `process_uploaded_document`.
 """
 import os
 from dotenv import load_dotenv
 from typing import Dict, List, Any, Tuple
+# Load environment variables
+dotenv_path = os.getenv("DOTENV_PATH")
 if dotenv_path:
     load_dotenv(dotenv_path)
 else:
     load_dotenv()
+# Validate required environment variables
 def check_env():
     required = [
         "AZURE_OPENAI_API_KEY",
 check_env()
+# Configure logging
+logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s:%(message)s")
+# LangChain and Azure OpenAI imports
 from langchain_text_splitters import RecursiveCharacterTextSplitter
 from langchain_openai import AzureOpenAIEmbeddings, AzureChatOpenAI
 from langchain.document_loaders import PyPDFLoader
+from langchain.chains import LLMChain
+from langchain_core.prompts import PromptTemplate
+from langchain.evaluation import load_evaluator
 # Vectorstore: FAISS
 try:
     from langchain_community.vectorstores import FAISS
 except ImportError as e:
     FAISS = None
     logging.warning(
+        "FAISS import failed (%s). Falling back to in-memory store. "
+        "Install faiss-cpu/faiss-gpu or downgrade NumPy to <2.0 to enable FAISS." % e
     )
+# Fallback in-memory vectorstore with shape validation
+def _fallback_vectorstore(texts: List[str], embeddings_client) -> Any:
+    """Creates a basic in-memory vectorstore with cosine similarity search and embedding shape checks."""
     import numpy as _np
     embs = embeddings_client.embed_documents(texts)
+    dim = len(embs[0]) if embs else 0
     class Doc:
+        def __init__(self, content: str):
+            self.page_content = content
     class BasicVectorStore:
+        def __init__(self, texts: List[str], embs: List[List[float]]):
             self.texts = texts
             self.embs = embs
+        def similarity_search(self, query: str, k: int = 3) -> List[Doc]:
             q_emb = embeddings_client.embed_query(query)
+            if len(q_emb) != dim:
+                raise ValueError(f"Query embedding dimension {len(q_emb)} != stored dimension {dim}")
+            sims = []
+            for emb in self.embs:
+                if len(emb) != dim:
+                    raise ValueError("Stored embedding has unexpected dimension")
+                sims.append(_np.dot(q_emb, emb) / (_np.linalg.norm(q_emb) * _np.linalg.norm(emb)))
             idxs = sorted(range(len(sims)), key=lambda i: sims[i], reverse=True)[:k]
             return [Doc(self.texts[i]) for i in idxs]
+    return BasicVectorStore(texts, embs)
 class DocumentProcessor:
     def __init__(self):
         self.embeddings = AzureOpenAIEmbeddings(
             azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
             api_key=os.getenv("AZURE_OPENAI_API_KEY"),
         )
     def process_text(self, text: str, persist_directory: str = None) -> Tuple[Any, List[str]]:
         texts = self.text_splitter.split_text(text)
+        if FAISS:
             try:
                 vs = FAISS.from_texts(texts=texts, embedding=self.embeddings)
                 if persist_directory:
                 logging.info(f"Processed {len(texts)} chunks into FAISS vectorstore.")
                 return vs, texts
             except Exception as e:
+                logging.warning(f"FAISS.from_texts failed ({e}), using fallback vectorstore.")
         vs_fb = _fallback_vectorstore(texts, self.embeddings)
         logging.info(f"Processed {len(texts)} chunks into fallback vectorstore.")
         return vs_fb, texts
     def process_uploaded_document(self, pdf_path: str, persist_directory: str = None) -> Tuple[Any, List[str]]:
         loader = PyPDFLoader(pdf_path)
         pages = loader.load()
+        docs = self.text_splitter.split_documents(pages)
+        if FAISS:
+            try:
+                vs = FAISS.from_documents(documents=docs, embedding=self.embeddings)
+                if persist_directory:
+                    vs.save_local(persist_directory)
+                logging.info(f"Processed PDF with {len(docs)} chunks into FAISS vectorstore.")
+                raw = [doc.page_content for doc in docs]
+                return vs, raw
+            except Exception as e:
+                logging.warning(f"FAISS.from_documents failed ({e}), falling back.")
+        texts = [doc.page_content for doc in docs]
+        vs_fb = _fallback_vectorstore(texts, self.embeddings)
+        logging.info(f"Processed PDF with {len(texts)} chunks into fallback vectorstore.")
+        return vs_fb, texts
 class QuestionGenerator:
     def __init__(self):
         self.llm = AzureChatOpenAI(
             azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
             api_key=os.getenv("AZURE_OPENAI_API_KEY"),
             model=os.getenv("AZURE_OPENAI_CHAT_DEPLOYMENT"),
             temperature=0.3
         )
+        self.chain = LLMChain(
+            llm=self.llm,
+            prompt=PromptTemplate(
+                input_variables=[
+                    "context","num_questions","question_type","subject",
+                    "class_grade","topic","difficulty","bloom_level","instructions"
+                ],
+                template=(
 """
 Based on the following context:
 {context}
 Format as JSON:
 {"questions": [{"question":"","options":[],"correctAnswer":"","explanation":""}]}
 """
+                )
             )
         )
     def generate_questions(self, topic_data: Dict[str, Any], vectorstore: Any) -> Dict[str, Any]:
         context = ""
         if vectorstore:
+            docs = vectorstore.similarity_search(f"{topic_data['subjectName']} {topic_data['sectionName']}", k=3)
+            context = "\n".join(getattr(doc, 'page_content', '') for doc in docs)
             logging.info(f"Context length: {len(context)}")
+        payload = {
             "context": context,
             "num_questions": topic_data['numQuestions'],
             "question_type": topic_data['questionType'],
             "difficulty": topic_data['difficulty'],
             "bloom_level": topic_data['bloomLevel'],
             "instructions": topic_data.get('additionalInstructions','')
+        }
+        response = self.chain.invoke(payload)
+        text = response.get('text', response) if isinstance(response, dict) else response
         output = text.strip()
+        if output.startswith('```') and output.endswith('```'):
+            output = re.sub(r'^```[a-zA-Z]*|```$', '', output).strip()
         try:
             result = json.loads(output)
         except json.JSONDecodeError:
+            logging.error(f"JSON parsing failed. Raw output: {output}")
+            raise ValueError(f"Failed to parse JSON from LLM output: {output}")
         if 'questions' not in result:
+            raise ValueError(f"Missing 'questions' key in output JSON: {result}")
         return result
 class QuestionEvaluator:
         )
     def evaluate(self, question: str, answer: str, reference: str) -> Dict[str, Any]:
+        """Evaluate question-answer pair against reference."""
         try:
             return self.evaluator.evaluate_strings(
                 input=question,
             logging.error(f"Evaluation error: {e}")
             raise
+# CLI test
+def main():
     dp = DocumentProcessor()
     sample = "This is a simple test. It splits into chunks and embeds."
     vs, chunks = dp.process_text(sample)
     print("Chunks:", chunks)
     if os.path.exists('sample.pdf'):
         vs2, raw = dp.process_uploaded_document('sample.pdf')
         print("PDF raw chunks count:", len(raw))
+if __name__ == "__main__":
+    main()