Transformers
Italian
English
semantic-search
explainable-ai
faiss
ai-ethics
responsible-ai
llm
prompt-engineering
multimodal-ai
ai-transparency
ethical-intelligence
explainable-llm
cognitive-ai
ethical-ai
scientific-retrieval
modular-ai
memory-augmented-llm
trustworthy-ai
reasoning-engine
ai-alignment
next-gen-llm
thinking-machines
open-source-ai
explainability
ai-research
semantic audit
cognitive agent
human-centered-ai
Create support_functions.py
Browse files- src/utils/support_functions.py +133 -0
src/utils/support_functions.py
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# © 2025 Elena Marziali — Code released under Apache 2.0 license.
|
| 2 |
+
# See LICENSE in the repository for details.
|
| 3 |
+
# Removal of this copyright is prohibited.
|
| 4 |
+
|
| 5 |
+
# Evaluate the structure of the AI response from the LLM
def validate_ai_structure(response, expected_fields=("title", "abstract", "url")):
    """Keep only the well-formed items of an LLM response.

    An item is well-formed when it is a dict containing every key in
    *expected_fields*. A non-list *response* yields an empty list.
    """
    if not isinstance(response, list):
        return []
    return [
        entry
        for entry in response
        if isinstance(entry, dict) and all(field in entry for field in expected_fields)
    ]
|
| 14 |
+
|
| 15 |
+
import math
import os
import re

import pandas as pd
|
| 16 |
+
|
| 17 |
+
# Compute semantic score of the response
def sigmoid(x):
    """Return the logistic function 1 / (1 + e^-x) of *x*.

    Numerically stable: the original single-branch form raises OverflowError
    for x below about -745 because math.exp(-x) overflows. Using the
    algebraically equivalent e^x / (1 + e^x) on the negative branch keeps
    every float input in range.
    """
    if x >= 0:
        return 1 / (1 + math.exp(-x))
    z = math.exp(x)
    return z / (1 + z)
|
| 20 |
+
|
| 21 |
+
def evaluate_score(model_output):
    """Map the first element of *model_output* through a sigmoid.

    Returns the score rounded to 3 decimals, or 0.0 when the output is
    empty, missing, or not convertible to float.
    """
    try:
        score = float(model_output[0])
    except (TypeError, ValueError, IndexError, KeyError):
        # Narrowed from a bare `except:` so that only the expected extraction
        # failures are swallowed; real bugs (NameError, KeyboardInterrupt,
        # errors inside sigmoid) still surface.
        return 0.0
    return round(sigmoid(score), 3)
|
| 27 |
+
|
| 28 |
+
# Extract text from selected file
def extract_text(file_name, max_chars=5000):
    """
    Extracts text from supported formats (.pdf, .docx, .tsv, .csv).
    Returns only the first max_chars characters.

    On any failure — including an unsupported extension — the error is
    reported as a returned message string rather than raised, so callers
    always receive text.

    NOTE(review): depends on `pdfplumber`, `Document` (python-docx) and `pd`
    (pandas), none of which are imported in this file — confirm the importing
    module provides them before calling.
    """
    # The format is inferred from the last dot-separated segment, lower-cased.
    extension = file_name.lower().split(".")[-1]

    try:
        if extension == "pdf":
            with pdfplumber.open(file_name) as pdf:
                # Pages with no extractable text yield None; substitute "".
                text = "\n".join([p.extract_text() or "" for p in pdf.pages]).strip()

        elif extension == "docx":
            doc = Document(file_name)
            text = "\n".join([p.text for p in doc.paragraphs]).strip()

        elif extension in ["csv", "tsv"]:
            sep = "," if extension == "csv" else "\t"
            df = pd.read_csv(file_name, sep=sep)
            # Whole table rendered as plain text, without the index column.
            text = df.to_string(index=False)

        else:
            raise ValueError(f"Unsupported format: .{extension}")

        return text[:max_chars] if text else "No text extracted."

    except Exception as e:
        # Deliberate best-effort: errors become a message string, not a raise.
        return f"Error during text extraction: {e}"
|
| 57 |
+
|
| 58 |
+
# Safely extract textual content from an AIMessage
def extract_text_from_ai(obj):
    """Safely extracts textual content from an AIMessage object.

    Falls back to ``str(obj)`` when the object has no ``content`` attribute;
    the result is returned with surrounding whitespace removed.
    """
    content = getattr(obj, "content", str(obj))
    return content.strip()
|
| 62 |
+
|
| 63 |
+
# Extract figure captions from text
def extract_captions_from_text(text):
    """Return every figure-caption line found in *text*.

    Matches lines such as "Figure 3: ..." or "Fig. 2 - ...", case-insensitively,
    and returns the full caption strings in order of appearance.
    """
    # Bug fix: the original pattern used a capturing group (Figure|Fig\.?),
    # so re.findall returned only the matched prefix ("Figure" / "Fig.")
    # instead of the whole caption. A non-capturing group (?:...) makes
    # findall yield the entire match.
    pattern = r"(?:Figure|Fig\.?)\s*\d+[:\.\-–]?\s*[^\n]+"
    return re.findall(pattern, text, re.IGNORECASE)
|
| 67 |
+
|
| 68 |
+
# Extract images and captions from a file
def extract_images_with_captions(file_path, output_folder="extracted_figures"):
    # Saves every embedded image of a .pdf or .docx to *output_folder* and
    # pairs each with a caption scraped from the document text, falling back
    # to a generated "Figure i.j" label once the scraped captions run out.
    # Returns (image_paths, captions); ([], []) on any error.
    # NOTE(review): relies on `fitz` (PyMuPDF), `Document` (python-docx) and
    # `os`, which are not imported in this file — confirm the importing
    # module provides them.
    os.makedirs(output_folder, exist_ok=True)
    extension = file_path.lower().split(".")[-1]
    images = []
    captions = []

    try:
        if extension == "pdf":
            doc = fitz.open(file_path)
            # Captions are harvested from the full document text and matched
            # to images purely by order of appearance.
            full_text = "\n".join([p.get_text("text") for p in doc])
            extracted_captions = extract_captions_from_text(full_text)
            count = 0

            for i, page in enumerate(doc):
                for j, img in enumerate(page.get_images(full=True)):
                    # img[0] is the image XREF; extract_image returns the raw
                    # bytes plus the original file extension.
                    base = doc.extract_image(img[0])
                    ext = base["ext"]
                    path = f"{output_folder}/page{i+1}_img{j+1}.{ext}"
                    with open(path, "wb") as f:
                        f.write(base["image"])
                    images.append(path)
                    captions.append(extracted_captions[count] if count < len(extracted_captions) else f"Figure {i+1}.{j+1}")
                    count += 1

        elif extension == "docx":
            doc = Document(file_path)
            text = "\n".join([p.text for p in doc.paragraphs])
            extracted_captions = extract_captions_from_text(text)
            count = 0

            # NOTE(review): doc.part._rels is a private python-docx API;
            # images are located by scanning relationship targets for "image".
            for i, rel in enumerate(doc.part._rels):
                relation = doc.part._rels[rel]
                if "image" in relation.target_ref:
                    img_data = relation.target_part.blob
                    # The saved extension is hard-coded to .png regardless of
                    # the actual blob format.
                    name = f"{output_folder}/docx_image_{i+1}.png"
                    with open(name, "wb") as f:
                        f.write(img_data)
                    images.append(name)
                    captions.append(extracted_captions[count] if count < len(extracted_captions) else f"Figure {i+1}")
                    count += 1

        else:
            print(f"Unsupported extension: .{extension}")

        print(f"{len(images)} image(s) extracted.")
        return images, captions

    except Exception as e:
        print(f"Error extracting images: {e}")
        return [], []
|
| 119 |
+
|
| 120 |
+
# Generate semantic coherence note based on score
def generate_note(score):
    """Return a human-readable note describing the coherence band of *score*.

    Bands: above 0.85 is high, above 0.6 is moderate, otherwise low.
    """
    high_note = "High semantic coherence. The response is likely solid and relevant."
    mid_note = "Moderate coherence. The response is understandable but may contain approximations."
    low_note = "Low coherence. It may be helpful to rephrase the question or provide more context."
    if score > 0.85:
        return high_note
    if score > 0.6:
        return mid_note
    return low_note
|
| 128 |
+
|
| 129 |
+
# Simulate LLM response generation
def generate_response(question, temperature=0.7):
    """Simulate an LLM reply.

    Questions containing the word "Rephrase" get a canned rephrasing;
    everything else gets a placeholder string echoing the temperature
    and the question.
    """
    canned_rephrasing = "How does enthalpy change during a phase transition?"
    if "Rephrase" in question:
        return canned_rephrasing
    return "[Simulated response at temperature {} for: {}]".format(temperature, question)
|