Spaces:

yazoniak
/

twitteremo-pl-classifier

Running

App Files Files Community

yazoniak committed 19 days ago

Commit

d04a9f2

verified ·

1 Parent(s): fac038c

Upload 2 files

Browse files

Files changed (2) hide show

app.py +120 -12
requirements.txt +1 -0

app.py CHANGED Viewed

@@ -15,18 +15,20 @@ Environment Variables:
 Features:
     - Multi-label emotion and sentiment classification
     - Calibrated predictions with temperature scaling
-    - Automatic prediction logging via HuggingFaceDatasetSaver
     - Persistent data storage across space restarts
 """
 import gradio as gr
-from gradio.flagging import HuggingFaceDatasetSaver
 from transformers import AutoModelForSequenceClassification, AutoTokenizer
 import torch
 import numpy as np
 import json
 import os
 import re
 # Model configuration
@@ -54,6 +56,105 @@ LABEL_EMOJIS = {
 }
 def preprocess_text(text: str, anonymize_mentions: bool = True) -> str:
     """
     Preprocess input text by anonymizing mentions.
@@ -122,16 +223,20 @@ model, tokenizer, labels, calibration_artifacts = load_model()
 print(f"✓ Model loaded successfully with {len(labels)} labels")
 print(f"  Labels: {', '.join(labels)}")
-# Initialize flagging callback for automatic prediction logging
-flagging_callback = None
 if HF_TOKEN:
     try:
-        flagging_callback = HuggingFaceDatasetSaver(
-            hf_token=HF_TOKEN,
             dataset_name=HF_DATASET_REPO,
             private=True,
         )
         print(f"✓ Auto-logging enabled - all predictions will be saved to: {HF_DATASET_REPO}")
     except Exception as e:
         print(f"⚠ Could not initialize auto-logging: {e}")
         print("  Predictions will not be logged")
@@ -312,13 +417,16 @@ def predict_emotions(
     all_scores_json = json.dumps(json_output, indent=2, ensure_ascii=False)
-    # Automatically log all predictions if flagging is enabled
-    if flagging_callback:
         try:
-            flagging_callback.flag(
-                flag_data=[text, mode, threshold, anonymize, result_text, all_scores_json],
-                flag_option="auto_logged",
-                username=None,
             )
         except Exception as e:
             print(f"⚠ Error logging prediction: {e}")

 Features:
     - Multi-label emotion and sentiment classification
     - Calibrated predictions with temperature scaling
+    - Automatic prediction logging to HuggingFace datasets
     - Persistent data storage across space restarts
 """
 import gradio as gr
 from transformers import AutoModelForSequenceClassification, AutoTokenizer
 import torch
 import numpy as np
 import json
 import os
 import re
+from datetime import datetime
+from datasets import Dataset, load_dataset
+from huggingface_hub import HfApi
 # Model configuration
 }
class HFDatasetLogger:
    """
    Custom logger that saves predictions to a HuggingFace dataset.

    Each call to ``log`` adds one row to the ``train`` split of a dataset
    repository, which gives persistent storage across space restarts.

    Logging is best-effort: every failure is printed and swallowed so that a
    logging problem can never break a prediction request.

    NOTE(review): appending downloads the whole split and re-uploads it, so
    the cost of one ``log`` call grows with the dataset. There is also no
    locking in this class, so overlapping calls may drop each other's rows -
    confirm how the caller serializes predictions.
    """

    def __init__(self, dataset_name: str, hf_token: str, private: bool = True):
        """
        Initialize the HuggingFace dataset logger.

        Args:
            dataset_name: Name of the dataset (e.g., "username/dataset-name")
            hf_token: HuggingFace authentication token
            private: Whether to create a private dataset
        """
        self.dataset_name = dataset_name
        self.hf_token = hf_token
        self.private = private
        self.api = HfApi()
        # True once the "train" split has been seen (or created) on the Hub.
        self.dataset_exists = self._train_split_is_loadable()

    def _train_split_is_loadable(self) -> bool:
        """Return True if the ``train`` split of the dataset can be opened."""
        try:
            # Streaming avoids downloading the data just to test for existence.
            load_dataset(
                self.dataset_name,
                split="train",
                token=self.hf_token,
                streaming=True,
            )
        except Exception:
            # Any failure (missing repo, missing split, auth or network error)
            # is reported as "not loadable"; callers re-probe before creating.
            return False
        return True

    @staticmethod
    def _build_entry(
        text: str,
        mode: str,
        threshold: float,
        anonymize: bool,
        predictions: str,
        json_output: str,
    ) -> dict:
        """Build the single-row record that is stored for one prediction."""
        from datetime import timezone

        # datetime.utcnow() is deprecated; this yields the same naive-UTC ISO
        # string so new rows stay consistent with rows already logged.
        timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
        return {
            "timestamp": timestamp,
            "text": text,
            "mode": mode,
            "threshold": float(threshold),
            "anonymize": bool(anonymize),
            "predictions": predictions,
            "json_output": json_output,
        }

    def _push(self, dataset) -> None:
        """Upload ``dataset`` as the new content of the dataset repository."""
        dataset.push_to_hub(
            self.dataset_name,
            token=self.hf_token,
            private=self.private,
        )

    def _append(self, new_data) -> None:
        """
        Append ``new_data`` to the rows already stored on the Hub.

        If the existing rows cannot be loaded, merged or uploaded, the new row
        is dropped and the stored dataset is left untouched. Pushing only the
        new row here would replace everything that was logged before.
        """
        try:
            from datasets import concatenate_datasets

            existing_dataset = load_dataset(
                self.dataset_name, split="train", token=self.hf_token
            )
            combined_dataset = concatenate_datasets([existing_dataset, new_data])
            self._push(combined_dataset)
        except Exception as e:
            print(f"⚠ Error appending to dataset: {e}")
            print("  Prediction was not logged; existing data left untouched")

    def log(
        self,
        text: str,
        mode: str,
        threshold: float,
        anonymize: bool,
        predictions: str,
        json_output: str,
    ) -> None:
        """
        Log a prediction to the HuggingFace dataset.

        Never raises: any error is printed and the prediction is skipped.

        Args:
            text: Input text
            mode: Prediction mode
            threshold: Threshold value
            anonymize: Anonymization setting
            predictions: Prediction output (markdown)
            json_output: JSON output with scores
        """
        try:
            data_entry = self._build_entry(
                text, mode, threshold, anonymize, predictions, json_output
            )
            # Create dataset from single entry
            new_data = Dataset.from_dict({k: [v] for k, v in data_entry.items()})

            if not self.dataset_exists:
                # The startup probe may have failed for a transient reason, so
                # check again before creating: creating pushes only this row.
                self.dataset_exists = self._train_split_is_loadable()

            if self.dataset_exists:
                self._append(new_data)
            else:
                # Create new dataset
                self._push(new_data)
                self.dataset_exists = True
        except Exception as e:
            print(f"⚠ Error logging to HuggingFace dataset: {e}")
 def preprocess_text(text: str, anonymize_mentions: bool = True) -> str:
     """
     Preprocess input text by anonymizing mentions.
 print(f"✓ Model loaded successfully with {len(labels)} labels")
 print(f"  Labels: {', '.join(labels)}")
+# Initialize custom HuggingFace dataset logger for automatic prediction logging
+hf_logger = None
 if HF_TOKEN:
     try:
+        hf_logger = HFDatasetLogger(
             dataset_name=HF_DATASET_REPO,
+            hf_token=HF_TOKEN,
             private=True,
         )
         print(f"✓ Auto-logging enabled - all predictions will be saved to: {HF_DATASET_REPO}")
+        if hf_logger.dataset_exists:
+            print("  Dataset found - will append new predictions")
+        else:
+            print("  Dataset will be created on first prediction")
     except Exception as e:
         print(f"⚠ Could not initialize auto-logging: {e}")
         print("  Predictions will not be logged")
     all_scores_json = json.dumps(json_output, indent=2, ensure_ascii=False)
+    # Automatically log all predictions if logging is enabled
+    if hf_logger:
         try:
+            hf_logger.log(
+                text=text,
+                mode=mode,
+                threshold=threshold,
+                anonymize=anonymize,
+                predictions=result_text,
+                json_output=all_scores_json,
             )
         except Exception as e:
             print(f"⚠ Error logging prediction: {e}")

requirements.txt CHANGED Viewed

@@ -3,4 +3,5 @@ transformers>=4.30.0
 torch>=2.0.0
 numpy>=1.24.0
 huggingface_hub>=0.16.0

 torch>=2.0.0
 numpy>=1.24.0
 huggingface_hub>=0.16.0
+datasets>=2.14.0