yazoniak committed on
Commit
d04a9f2
·
verified ·
1 Parent(s): fac038c

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +120 -12
  2. requirements.txt +1 -0
app.py CHANGED
@@ -15,18 +15,20 @@ Environment Variables:
15
  Features:
16
  - Multi-label emotion and sentiment classification
17
  - Calibrated predictions with temperature scaling
18
- - Automatic prediction logging via HuggingFaceDatasetSaver
19
  - Persistent data storage across space restarts
20
  """
21
 
22
  import gradio as gr
23
- from gradio.flagging import HuggingFaceDatasetSaver
24
  from transformers import AutoModelForSequenceClassification, AutoTokenizer
25
  import torch
26
  import numpy as np
27
  import json
28
  import os
29
  import re
 
 
 
30
 
31
 
32
  # Model configuration
@@ -54,6 +56,105 @@ LABEL_EMOJIS = {
54
  }
55
 
56
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  def preprocess_text(text: str, anonymize_mentions: bool = True) -> str:
58
  """
59
  Preprocess input text by anonymizing mentions.
@@ -122,16 +223,20 @@ model, tokenizer, labels, calibration_artifacts = load_model()
122
  print(f"✓ Model loaded successfully with {len(labels)} labels")
123
  print(f" Labels: {', '.join(labels)}")
124
 
125
- # Initialize flagging callback for automatic prediction logging
126
- flagging_callback = None
127
  if HF_TOKEN:
128
  try:
129
- flagging_callback = HuggingFaceDatasetSaver(
130
- hf_token=HF_TOKEN,
131
  dataset_name=HF_DATASET_REPO,
 
132
  private=True,
133
  )
134
  print(f"✓ Auto-logging enabled - all predictions will be saved to: {HF_DATASET_REPO}")
 
 
 
 
135
  except Exception as e:
136
  print(f"⚠ Could not initialize auto-logging: {e}")
137
  print(" Predictions will not be logged")
@@ -312,13 +417,16 @@ def predict_emotions(
312
 
313
  all_scores_json = json.dumps(json_output, indent=2, ensure_ascii=False)
314
 
315
- # Automatically log all predictions if flagging is enabled
316
- if flagging_callback:
317
  try:
318
- flagging_callback.flag(
319
- flag_data=[text, mode, threshold, anonymize, result_text, all_scores_json],
320
- flag_option="auto_logged",
321
- username=None,
 
 
 
322
  )
323
  except Exception as e:
324
  print(f"⚠ Error logging prediction: {e}")
 
15
  Features:
16
  - Multi-label emotion and sentiment classification
17
  - Calibrated predictions with temperature scaling
18
+ - Automatic prediction logging to HuggingFace datasets
19
  - Persistent data storage across space restarts
20
  """
21
 
22
  import gradio as gr
 
23
  from transformers import AutoModelForSequenceClassification, AutoTokenizer
24
  import torch
25
  import numpy as np
26
  import json
27
  import os
28
  import re
29
+ from datetime import datetime
30
+ from datasets import Dataset, load_dataset
31
+ from huggingface_hub import HfApi
32
 
33
 
34
  # Model configuration
 
56
  }
57
 
58
 
59
+ class HFDatasetLogger:
60
+ """
61
+ Custom logger that saves predictions to a HuggingFace dataset.
62
+
63
+ This provides persistent storage across space restarts by storing data
64
+ directly to a HuggingFace dataset repository.
65
+ """
66
+
67
+ def __init__(self, dataset_name: str, hf_token: str, private: bool = True):
68
+ """
69
+ Initialize the HuggingFace dataset logger.
70
+
71
+ Args:
72
+ dataset_name: Name of the dataset (e.g., "username/dataset-name")
73
+ hf_token: HuggingFace authentication token
74
+ private: Whether to create a private dataset
75
+ """
76
+ self.dataset_name = dataset_name
77
+ self.hf_token = hf_token
78
+ self.private = private
79
+ self.api = HfApi()
80
+ self.dataset_exists = False
81
+
82
+ # Check if dataset exists
83
+ try:
84
+ load_dataset(dataset_name, split="train", token=hf_token, streaming=True)
85
+ self.dataset_exists = True
86
+ except Exception:
87
+ self.dataset_exists = False
88
+
89
+ def log(
90
+ self,
91
+ text: str,
92
+ mode: str,
93
+ threshold: float,
94
+ anonymize: bool,
95
+ predictions: str,
96
+ json_output: str,
97
+ ) -> None:
98
+ """
99
+ Log a prediction to the HuggingFace dataset.
100
+
101
+ Args:
102
+ text: Input text
103
+ mode: Prediction mode
104
+ threshold: Threshold value
105
+ anonymize: Anonymization setting
106
+ predictions: Prediction output (markdown)
107
+ json_output: JSON output with scores
108
+ """
109
+ try:
110
+ # Prepare data entry
111
+ data_entry = {
112
+ "timestamp": datetime.utcnow().isoformat(),
113
+ "text": text,
114
+ "mode": mode,
115
+ "threshold": float(threshold),
116
+ "anonymize": bool(anonymize),
117
+ "predictions": predictions,
118
+ "json_output": json_output,
119
+ }
120
+
121
+ # Create dataset from single entry
122
+ new_data = Dataset.from_dict({k: [v] for k, v in data_entry.items()})
123
+
124
+ if self.dataset_exists:
125
+ # Append to existing dataset
126
+ try:
127
+ existing_dataset = load_dataset(
128
+ self.dataset_name, split="train", token=self.hf_token
129
+ )
130
+ from datasets import concatenate_datasets
131
+
132
+ combined_dataset = concatenate_datasets([existing_dataset, new_data])
133
+ combined_dataset.push_to_hub(
134
+ self.dataset_name,
135
+ token=self.hf_token,
136
+ private=self.private,
137
+ )
138
+ except Exception as e:
139
+ print(f"⚠ Error appending to dataset: {e}")
140
+ # Fall back to creating new dataset if append fails
141
+ new_data.push_to_hub(
142
+ self.dataset_name,
143
+ token=self.hf_token,
144
+ private=self.private,
145
+ )
146
+ self.dataset_exists = True
147
+ else:
148
+ # Create new dataset
149
+ new_data.push_to_hub(
150
+ self.dataset_name, token=self.hf_token, private=self.private
151
+ )
152
+ self.dataset_exists = True
153
+
154
+ except Exception as e:
155
+ print(f"⚠ Error logging to HuggingFace dataset: {e}")
156
+
157
+
158
  def preprocess_text(text: str, anonymize_mentions: bool = True) -> str:
159
  """
160
  Preprocess input text by anonymizing mentions.
 
223
  print(f"✓ Model loaded successfully with {len(labels)} labels")
224
  print(f" Labels: {', '.join(labels)}")
225
 
226
+ # Initialize custom HuggingFace dataset logger for automatic prediction logging
227
+ hf_logger = None
228
  if HF_TOKEN:
229
  try:
230
+ hf_logger = HFDatasetLogger(
 
231
  dataset_name=HF_DATASET_REPO,
232
+ hf_token=HF_TOKEN,
233
  private=True,
234
  )
235
  print(f"✓ Auto-logging enabled - all predictions will be saved to: {HF_DATASET_REPO}")
236
+ if hf_logger.dataset_exists:
237
+ print(" Dataset found - will append new predictions")
238
+ else:
239
+ print(" Dataset will be created on first prediction")
240
  except Exception as e:
241
  print(f"⚠ Could not initialize auto-logging: {e}")
242
  print(" Predictions will not be logged")
 
417
 
418
  all_scores_json = json.dumps(json_output, indent=2, ensure_ascii=False)
419
 
420
+ # Automatically log all predictions if logging is enabled
421
+ if hf_logger:
422
  try:
423
+ hf_logger.log(
424
+ text=text,
425
+ mode=mode,
426
+ threshold=threshold,
427
+ anonymize=anonymize,
428
+ predictions=result_text,
429
+ json_output=all_scores_json,
430
  )
431
  except Exception as e:
432
  print(f"⚠ Error logging prediction: {e}")
requirements.txt CHANGED
@@ -3,4 +3,5 @@ transformers>=4.30.0
3
  torch>=2.0.0
4
  numpy>=1.24.0
5
  huggingface_hub>=0.16.0
 
6
 
 
3
  torch>=2.0.0
4
  numpy>=1.24.0
5
  huggingface_hub>=0.16.0
6
+ datasets>=2.14.0
7