elly99 committed
Commit ffcb401 · verified · 1 Parent(s): 12361d5

Create support_functions.py

Files changed (1)
  1. src/utils/support_functions.py +133 -0
src/utils/support_functions.py ADDED
@@ -0,0 +1,133 @@
+ # © 2025 Elena Marziali — Code released under Apache 2.0 license.
+ # See LICENSE in the repository for details.
+ # Removal of this copyright is prohibited.
+
+ # Standard-library and third-party imports used by the helpers below
+ import math
+ import os
+ import re
+
+ import fitz  # PyMuPDF, used for PDF image extraction
+ import pandas as pd
+ import pdfplumber
+ from docx import Document
+
+ # Validate the structure of the AI response returned by the LLM
+ def validate_ai_structure(response, expected_fields=("title", "abstract", "url")):
+     if not isinstance(response, list):
+         return []
+     valid_items = []
+     for item in response:
+         if isinstance(item, dict) and all(k in item for k in expected_fields):
+             valid_items.append(item)
+     return valid_items
+
+ # Compute the semantic score of the response
+ def sigmoid(x):
+     return 1 / (1 + math.exp(-x))
+
+ def evaluate_score(model_output):
+     try:
+         score = float(model_output[0])
+         return round(sigmoid(score), 3)
+     except (TypeError, ValueError, IndexError, KeyError):
+         return 0.0
+
+ # Extract text from the selected file
+ def extract_text(file_name, max_chars=5000):
+     """
+     Extract text from supported formats (.pdf, .docx, .tsv, .csv).
+     Returns only the first max_chars characters.
+     """
+     extension = file_name.lower().split(".")[-1]
+
+     try:
+         if extension == "pdf":
+             with pdfplumber.open(file_name) as pdf:
+                 text = "\n".join([p.extract_text() or "" for p in pdf.pages]).strip()
+
+         elif extension == "docx":
+             doc = Document(file_name)
+             text = "\n".join([p.text for p in doc.paragraphs]).strip()
+
+         elif extension in ["csv", "tsv"]:
+             sep = "," if extension == "csv" else "\t"
+             df = pd.read_csv(file_name, sep=sep)
+             text = df.to_string(index=False)
+
+         else:
+             raise ValueError(f"Unsupported format: .{extension}")
+
+         return text[:max_chars] if text else "No text extracted."
+
+     except Exception as e:
+         return f"Error during text extraction: {e}"
+
+ # Safely extract textual content from an AIMessage
+ def extract_text_from_ai(obj):
+     """Safely extract the textual content of an AIMessage object."""
+     return getattr(obj, "content", str(obj)).strip()
+
+ # Extract figure captions from text
+ def extract_captions_from_text(text):
+     # Non-capturing group so re.findall returns the full caption,
+     # not just the "Figure"/"Fig." prefix
+     pattern = r"(?:Figure|Fig\.?)\s*\d+[:\.\-–]?\s*[^\n]+"
+     return re.findall(pattern, text, re.IGNORECASE)
+
+ # Extract images and their captions from a file
+ def extract_images_with_captions(file_path, output_folder="extracted_figures"):
+     os.makedirs(output_folder, exist_ok=True)
+     extension = file_path.lower().split(".")[-1]
+     images = []
+     captions = []
+
+     try:
+         if extension == "pdf":
+             doc = fitz.open(file_path)
+             full_text = "\n".join([p.get_text("text") for p in doc])
+             extracted_captions = extract_captions_from_text(full_text)
+             count = 0
+
+             for i, page in enumerate(doc):
+                 for j, img in enumerate(page.get_images(full=True)):
+                     base = doc.extract_image(img[0])
+                     ext = base["ext"]
+                     path = f"{output_folder}/page{i+1}_img{j+1}.{ext}"
+                     with open(path, "wb") as f:
+                         f.write(base["image"])
+                     images.append(path)
+                     captions.append(extracted_captions[count] if count < len(extracted_captions) else f"Figure {i+1}.{j+1}")
+                     count += 1
+
+         elif extension == "docx":
+             doc = Document(file_path)
+             text = "\n".join([p.text for p in doc.paragraphs])
+             extracted_captions = extract_captions_from_text(text)
+             count = 0
+
+             for i, rel in enumerate(doc.part._rels):
+                 relation = doc.part._rels[rel]
+                 if "image" in relation.target_ref:
+                     img_data = relation.target_part.blob
+                     name = f"{output_folder}/docx_image_{i+1}.png"
+                     with open(name, "wb") as f:
+                         f.write(img_data)
+                     images.append(name)
+                     captions.append(extracted_captions[count] if count < len(extracted_captions) else f"Figure {i+1}")
+                     count += 1
+
+         else:
+             print(f"Unsupported extension: .{extension}")
+
+         print(f"{len(images)} image(s) extracted.")
+         return images, captions
+
+     except Exception as e:
+         print(f"Error extracting images: {e}")
+         return [], []
+
+ # Generate a semantic coherence note based on the score
+ def generate_note(score):
+     if score > 0.85:
+         return "High semantic coherence. The response is likely solid and relevant."
+     elif score > 0.6:
+         return "Moderate coherence. The response is understandable but may contain approximations."
+     else:
+         return "Low coherence. It may be helpful to rephrase the question or provide more context."
+
+ # Simulate LLM response generation
+ def generate_response(question, temperature=0.7):
+     if "Rephrase" in question:
+         return "How does enthalpy change during a phase transition?"
+     return f"[Simulated response at temperature {temperature} for: {question}]"