changing field from pdf to raw data
mylangv2.py +116 -99
mylangv2.py CHANGED
@@ -6,33 +6,9 @@ import os
import logging
import re
import json
+import numpy as np
from dotenv import load_dotenv
-from typing import Dict, List, Any, Tuple
-
-# Load environment variables
-dotenv_path = os.getenv("DOTENV_PATH")
-if dotenv_path:
-    load_dotenv(dotenv_path)
-else:
-    load_dotenv()
-
-# Validate required environment variables
-def check_env():
-    required = [
-        "AZURE_OPENAI_API_KEY",
-        "AZURE_OPENAI_ENDPOINT",
-        "AZURE_OPENAI_EMBEDDING_DEPLOYMENT",
-        "AZURE_OPENAI_CHAT_DEPLOYMENT",
-        "AZURE_OPENAI_API_VERSION"
-    ]
-    missing = [v for v in required if not os.getenv(v)]
-    if missing:
-        raise EnvironmentError(f"Missing required environment variables: {', '.join(missing)}")
-
-check_env()
-
-# Configure logging
-logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s:%(message)s")
+from typing import Dict, List, Any, Tuple, Optional

# LangChain and Azure OpenAI imports
from langchain_text_splitters import RecursiveCharacterTextSplitter
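This hunk makes importing the module side-effect free: the .env loading and the check_env() validation no longer run at import time; the new main() in the last hunk calls check_env() explicitly instead. A minimal sketch of what that enables, assuming the module is importable as mylangv2 and that check_env remains defined after this commit (its definition is deleted here, so it presumably survives elsewhere):

import mylangv2  # no EnvironmentError at import time anymore

if __name__ == "__main__":
    mylangv2.check_env()  # validation is now an explicit, testable step
    print("environment OK")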
@@ -52,103 +28,106 @@ except ImportError as e:
    "Install faiss-cpu/faiss-gpu or downgrade NumPy to <2.0 to enable FAISS." % e
)

-# Fallback in-memory vectorstore with shape validation
-def _fallback_vectorstore(texts: List[str], embeddings_client) -> Any:
-    """Creates a basic in-memory vectorstore with cosine similarity search and embedding shape checks."""
-    import numpy as _np
-
-    embs = embeddings_client.embed_documents(texts)
-    dim = len(embs[0]) if embs else 0
-
-    class Doc:
-        def __init__(self, content: str):
-            self.page_content = content
-
-    class BasicVectorStore:
-        def __init__(self, texts: List[str], embs: List[List[float]]):
-            self.texts = texts
-            self.embs = embs
-
-        def similarity_search(self, query: str, k: int = 3) -> List[Doc]:
-            q_emb = embeddings_client.embed_query(query)
-            if len(q_emb) != dim:
-                raise ValueError(f"Query embedding dimension {len(q_emb)} != stored dimension {dim}")
-            sims = []
-            for emb in self.embs:
-                if len(emb) != dim:
-                    raise ValueError("Stored embedding has unexpected dimension")
-                sims.append(_np.dot(q_emb, emb) / (_np.linalg.norm(q_emb) * _np.linalg.norm(emb)))
-            idxs = sorted(range(len(sims)), key=lambda i: sims[i], reverse=True)[:k]
-            return [Doc(self.texts[i]) for i in idxs]
-
-    return BasicVectorStore(texts, embs)
-
class DocumentProcessor:
-    def __init__(
-        self
+    def __init__(
+        self,
+        embeddings: Optional[AzureOpenAIEmbeddings] = None,
+        text_splitter: Optional[RecursiveCharacterTextSplitter] = None
+    ):
+        """Initialize DocumentProcessor with injectable embeddings and splitter."""
+        self.embeddings = embeddings or AzureOpenAIEmbeddings(
            azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
            api_key=os.getenv("AZURE_OPENAI_API_KEY"),
            api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
            model=os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT")
        )
-        self.text_splitter = RecursiveCharacterTextSplitter(
+        self.text_splitter = text_splitter or RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
-            length_function=len,
            separators=["\n\n", "\n", " ", ""]
        )

-    def
-
+    def _create_fallback_vectorstore(self, texts: List[str]) -> Any:
+        """Creates a basic in-memory vectorstore with cosine similarity search and embedding shape checks."""
+        embs = self.embeddings.embed_documents(texts)
+        dim = len(embs[0]) if embs else 0
+
+        class Doc:
+            def __init__(self, content: str):
+                self.page_content = content
+
+        class BasicVectorStore:
+            def __init__(self, texts: List[str], embs: List[List[float]]):
+                self.texts = texts
+                self.embs = embs
+
+            def similarity_search(self, query: str, k: int = 3) -> List[Doc]:
+                q_emb = self.embeddings.embed_query(query)
+                if len(q_emb) != dim:
+                    raise ValueError(f"Query embedding dimension {len(q_emb)} != stored dimension {dim}")
+                sims = []
+                for emb in self.embs:
+                    if len(emb) != dim:
+                        raise ValueError("Stored embedding has unexpected dimension")
+                    sims.append(np.dot(q_emb, emb) / (np.linalg.norm(q_emb) * np.linalg.norm(emb)))
+                idxs = sorted(range(len(sims)), key=lambda i: sims[i], reverse=True)[:k]
+                return [Doc(self.texts[i]) for i in idxs]
+
+        # Bind embeddings for inner class
+        BasicVectorStore.embeddings = self.embeddings
+        return BasicVectorStore(texts, embs)
+
+    def process_text(self, text: str, persist_directory: str = None) -> Tuple[Any, List[str], Dict[str, str]]:
+        """Split raw text, build vectorstore (FAISS or fallback), return store, chunks, and metadata."""
+        chunks = self.text_splitter.split_text(text)
+        backend = 'fallback'
        if FAISS:
            try:
-                vs = FAISS.from_texts(texts=
+                vs = FAISS.from_texts(texts=chunks, embedding=self.embeddings)
+                backend = 'faiss'
                if persist_directory:
                    vs.save_local(persist_directory)
-
-
+                    _log_vectorstore_size(persist_directory)
+                logging.info(f"Processed {len(chunks)} chunks into FAISS vectorstore.")
+                return vs, chunks, {'backend': backend}
            except Exception as e:
                logging.warning(f"FAISS.from_texts failed ({e}), using fallback vectorstore.")
-        vs_fb =
-        logging.info(f"Processed {len(
-        return vs_fb,
+        vs_fb = self._create_fallback_vectorstore(chunks)
+        logging.info(f"Processed {len(chunks)} chunks into fallback vectorstore.")
+        return vs_fb, chunks, {'backend': backend}

-    def process_uploaded_document(
+    def process_uploaded_document(
+        self, pdf_path: str, persist_directory: str = None
+    ) -> Tuple[Any, List[str], Dict[str, str]]:
+        """Load PDF, split, build vectorstore, and return store, raw texts, and metadata."""
        loader = PyPDFLoader(pdf_path)
        pages = loader.load()
        docs = self.text_splitter.split_documents(pages)
+        texts = [doc.page_content for doc in docs]
+        backend = 'fallback'
        if FAISS:
            try:
                vs = FAISS.from_documents(documents=docs, embedding=self.embeddings)
+                backend = 'faiss'
                if persist_directory:
                    vs.save_local(persist_directory)
-
-
-                return vs,
+                    _log_vectorstore_size(persist_directory)
+                logging.info(f"Processed PDF with {len(texts)} chunks into FAISS vectorstore.")
+                return vs, texts, {'backend': backend}
            except Exception as e:
                logging.warning(f"FAISS.from_documents failed ({e}), falling back.")
-
-        vs_fb = _fallback_vectorstore(texts, self.embeddings)
+        vs_fb = self._create_fallback_vectorstore(texts)
        logging.info(f"Processed PDF with {len(texts)} chunks into fallback vectorstore.")
-        return vs_fb, texts
+        return vs_fb, texts, {'backend': backend}

class QuestionGenerator:
-    def __init__(self):
-
-
-
-
-
-
-    )
-        self.chain = LLMChain(
-            llm=self.llm,
-            prompt=PromptTemplate(
-                input_variables=[
-                    "context","num_questions","question_type","subject",
-                    "class_grade","topic","difficulty","bloom_level","instructions"
-                ],
-                template=(
+    def __init__(self, prompt_template_path: str = None):
+        # Load prompt template from file or default
+        if prompt_template_path and os.path.exists(prompt_template_path):
+            with open(prompt_template_path) as f:
+                template_str = f.read()
+        else:
+            template_str = (
"""
Based on the following context:
{context}
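The refactor above turns the module-level _fallback_vectorstore helper into a DocumentProcessor method and makes both the embeddings client and the text splitter injectable, so the pipeline can run without live Azure credentials. A runnable sketch under those assumptions; FakeEmbeddings is illustrative (not part of the commit) and implements only the two methods the vectorstore paths call, embed_documents and embed_query:

from typing import List

from mylangv2 import DocumentProcessor  # assumed import path

class FakeEmbeddings:
    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        # Deterministic 3-dim vectors, enough to exercise cosine similarity.
        return [[float(len(t)), 1.0, 1.0] for t in texts]

    def embed_query(self, query: str) -> List[float]:
        return [float(len(query)), 1.0, 1.0]

dp = DocumentProcessor(embeddings=FakeEmbeddings())  # injection added in this commit
vs, chunks, meta = dp.process_text("Some raw text to index.")
print(meta["backend"], len(chunks))  # 'faiss' or 'fallback', then chunk count
for doc in vs.similarity_search("raw text", k=1):
    print(doc.page_content)

One design note: the diff binds the embeddings client onto BasicVectorStore as a class attribute after the class body. That works because attribute lookup happens at call time, but the client is shared by every instance; passing it through __init__ would be the more conventional choice.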
@@ -165,16 +144,43 @@ Additional Instructions: {instructions}
Format as JSON:
{"questions": [{"question":"","options":[],"correctAnswer":"","explanation":""}]}
"""
-
+        )
+        self.llm = AzureChatOpenAI(
+            azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
+            api_key=os.getenv("AZURE_OPENAI_API_KEY"),
+            api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
+            model=os.getenv("AZURE_OPENAI_CHAT_DEPLOYMENT"),
+            temperature=0.3
+        )
+        self.chain = LLMChain(
+            llm=self.llm,
+            prompt=PromptTemplate(
+                input_variables=[
+                    "context","num_questions","question_type","subject",
+                    "class_grade","topic","difficulty","bloom_level","instructions"
+                ],
+                template=template_str
            )
        )

    def generate_questions(self, topic_data: Dict[str, Any], vectorstore: Any) -> Dict[str, Any]:
+        # Validate topic_data keys
+        required_keys = [
+            'subjectName','sectionName','numQuestions','questionType',
+            'classGrade','difficulty','bloomLevel'
+        ]
+        missing = [k for k in required_keys if k not in topic_data]
+        if missing:
+            raise ValueError(f"Missing required topic_data keys: {', '.join(missing)}")
+
        context = ""
        if vectorstore:
-            docs = vectorstore.similarity_search(
+            docs = vectorstore.similarity_search(
+                f"{topic_data['subjectName']} {topic_data['sectionName']}", k=3
+            )
            context = "\n".join(getattr(doc, 'page_content', '') for doc in docs)
            logging.info(f"Context length: {len(context)}")
+
        payload = {
            "context": context,
            "num_questions": topic_data['numQuestions'],
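generate_questions now fails fast when topic_data is incomplete, so a call needs at least the seven validated keys; the prompt's input_variables suggest the payload also draws on fields like topic and instructions, though that mapping is only partially visible in this diff. A hedged sketch of a valid call (values illustrative, prompts/mcq.txt a hypothetical template path):

from mylangv2 import QuestionGenerator  # assumed import path

topic_data = {
    # the seven keys the new validation checks
    "subjectName": "Physics",
    "sectionName": "Kinematics",
    "numQuestions": 5,
    "questionType": "multiple_choice",
    "classGrade": "9",
    "difficulty": "medium",
    "bloomLevel": "Apply",
    # extra field the payload appears to consume (mapping not fully shown here)
    "instructions": "Focus on average speed problems."
}

qg = QuestionGenerator(prompt_template_path="prompts/mcq.txt")
result = qg.generate_questions(topic_data, vectorstore=None)  # context stays empty without a store
print(result["questions"])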
@@ -195,7 +201,7 @@ Format as JSON:
            result = json.loads(output)
        except json.JSONDecodeError:
            logging.error(f"JSON parsing failed. Raw output: {output}")
-            raise
+            raise
        if 'questions' not in result:
            raise ValueError(f"Missing 'questions' key in output JSON: {result}")
        return result
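The only change in this hunk is the re-raise after logging (whitespace-level in this capture), but it pins down the contract: on malformed model output the raw text is logged and json.JSONDecodeError propagates to the caller. A sketch of handling that at a call site (hedged; caller code is not part of this commit):

import json

try:
    result = qg.generate_questions(topic_data, vectorstore=vs)  # objects from the sketches above
except json.JSONDecodeError:
    result = {"questions": []}  # or retry / surface the error upstream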
@@ -225,15 +231,26 @@ class QuestionEvaluator:
            logging.error(f"Evaluation error: {e}")
            raise

-#
+# Helper for logging vectorstore size
+def _log_vectorstore_size(directory: str):
+    total = 0
+    for root, _, files in os.walk(directory):
+        for f in files:
+            total += os.path.getsize(os.path.join(root, f))
+    logging.info(f"Vectorstore on disk: {total/1024:.2f} KB")
+
+# CLI test and env validation
def main():
+    # Validate env only on script run
+    check_env()
    dp = DocumentProcessor()
    sample = "This is a simple test. It splits into chunks and embeds."
-    vs, chunks = dp.process_text(sample)
+    vs, chunks, meta = dp.process_text(sample)
    print("Chunks:", chunks)
+    print("Backend used:", meta['backend'])
    if os.path.exists('sample.pdf'):
-        vs2, raw = dp.process_uploaded_document('sample.pdf')
-        print("PDF raw chunks count:", len(raw))
+        vs2, raw, meta2 = dp.process_uploaded_document('sample.pdf')
+        print("PDF raw chunks count:", len(raw), "Backend:", meta2['backend'])

if __name__ == "__main__":
    main()
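Taken together, the CLI test now validates the environment first, reports which backend built the store, and _log_vectorstore_size records the on-disk footprint whenever persist_directory is supplied. Roughly the expected run (illustrative: the sample string is shorter than chunk_size=1000, so it stays a single chunk, and the backend depends on whether FAISS imported):

$ python mylangv2.py
Chunks: ['This is a simple test. It splits into chunks and embeds.']
Backend used: faiss   (or 'fallback' when FAISS is unavailable)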