elly99 committed
Commit ffcb401 · verified · 1 Parent(s): 12361d5

Create support_functions.py

Files changed (1)
  1. src/utils/support_functions.py +133 -0
src/utils/support_functions.py ADDED
@@ -0,0 +1,133 @@
+ # © 2025 Elena Marziali — Code released under Apache 2.0 license.
+ # See LICENSE in the repository for details.
+ # Removal of this copyright is prohibited.
+
+ # Standard-library and third-party imports used by the helpers below
+ import math
+ import os
+ import re
+
+ import fitz  # PyMuPDF, used for PDF image extraction
+ import pandas as pd
+ import pdfplumber
+ from docx import Document
+
+ # Validate the structure of the AI response returned by the LLM
+ def validate_ai_structure(response, expected_fields=("title", "abstract", "url")):
+     if not isinstance(response, list):
+         return []
+     valid_items = []
+     for item in response:
+         if isinstance(item, dict) and all(k in item for k in expected_fields):
+             valid_items.append(item)
+     return valid_items
+
+ # Compute the semantic score of the response
+ def sigmoid(x):
+     return 1 / (1 + math.exp(-x))
+
+ def evaluate_score(model_output):
+     try:
+         score = float(model_output[0])
+         return round(sigmoid(score), 3)
+     except (TypeError, ValueError, IndexError, KeyError):
+         return 0.0
+
+ # Extract text from the selected file
+ def extract_text(file_name, max_chars=5000):
+     """
+     Extract text from supported formats (.pdf, .docx, .tsv, .csv).
+     Returns only the first max_chars characters.
+     """
+     extension = file_name.lower().split(".")[-1]
+
+     try:
+         if extension == "pdf":
+             with pdfplumber.open(file_name) as pdf:
+                 text = "\n".join([p.extract_text() or "" for p in pdf.pages]).strip()
+
+         elif extension == "docx":
+             doc = Document(file_name)
+             text = "\n".join([p.text for p in doc.paragraphs]).strip()
+
+         elif extension in ["csv", "tsv"]:
+             sep = "," if extension == "csv" else "\t"
+             df = pd.read_csv(file_name, sep=sep)
+             text = df.to_string(index=False)
+
+         else:
+             raise ValueError(f"Unsupported format: .{extension}")
+
+         return text[:max_chars] if text else "No text extracted."
+
+     except Exception as e:
+         return f"Error during text extraction: {e}"
+
+ # Safely extract textual content from an AIMessage
+ def extract_text_from_ai(obj):
+     """Safely extract the textual content of an AIMessage object."""
+     return getattr(obj, "content", str(obj)).strip()
+
+ # Extract figure captions from text
+ def extract_captions_from_text(text):
+     # Non-capturing group so re.findall returns the full caption,
+     # not just the "Figure"/"Fig." prefix
+     pattern = r"(?:Figure|Fig\.?)\s*\d+[:\.\-–]?\s*[^\n]+"
+     return re.findall(pattern, text, re.IGNORECASE)
+
+ # Extract images and their captions from a file
+ def extract_images_with_captions(file_path, output_folder="extracted_figures"):
+     os.makedirs(output_folder, exist_ok=True)
+     extension = file_path.lower().split(".")[-1]
+     images = []
+     captions = []
+
+     try:
+         if extension == "pdf":
+             doc = fitz.open(file_path)
+             full_text = "\n".join([p.get_text("text") for p in doc])
+             extracted_captions = extract_captions_from_text(full_text)
+             count = 0
+
+             for i, page in enumerate(doc):
+                 for j, img in enumerate(page.get_images(full=True)):
+                     base = doc.extract_image(img[0])
+                     ext = base["ext"]
+                     path = f"{output_folder}/page{i+1}_img{j+1}.{ext}"
+                     with open(path, "wb") as f:
+                         f.write(base["image"])
+                     images.append(path)
+                     captions.append(extracted_captions[count] if count < len(extracted_captions) else f"Figure {i+1}.{j+1}")
+                     count += 1
+
+         elif extension == "docx":
+             doc = Document(file_path)
+             text = "\n".join([p.text for p in doc.paragraphs])
+             extracted_captions = extract_captions_from_text(text)
+             count = 0
+
+             for i, rel in enumerate(doc.part._rels):
+                 relation = doc.part._rels[rel]
+                 if "image" in relation.target_ref:
+                     img_data = relation.target_part.blob
+                     name = f"{output_folder}/docx_image_{i+1}.png"
+                     with open(name, "wb") as f:
+                         f.write(img_data)
+                     images.append(name)
+                     captions.append(extracted_captions[count] if count < len(extracted_captions) else f"Figure {i+1}")
+                     count += 1
+
+         else:
+             print(f"Unsupported extension: .{extension}")
+
+         print(f"{len(images)} image(s) extracted.")
+         return images, captions
+
+     except Exception as e:
+         print(f"Error extracting images: {e}")
+         return [], []
+
+ # Generate a semantic coherence note based on the score
+ def generate_note(score):
+     if score > 0.85:
+         return "High semantic coherence. The response is likely solid and relevant."
+     elif score > 0.6:
+         return "Moderate coherence. The response is understandable but may contain approximations."
+     else:
+         return "Low coherence. It may be helpful to rephrase the question or provide more context."
+
+ # Simulate LLM response generation
+ def generate_response(question, temperature=0.7):
+     if "Rephrase" in question:
+         return "How does enthalpy change during a phase transition?"
+     return f"[Simulated response at temperature {temperature} for: {question}]"