Spaces:

sambodhan
/

prepare_dataset

Paused

App Files Files Community

mr-kush committed 29 days ago

Commit

58bdb4f

1 Parent(s): 1412806

implement dataset preparation pipeline with environment variable validation and database connection handling

Browse files

Files changed (1) hide show

prepare_dataset_pipeline.py +111 -0

prepare_dataset_pipeline.py ADDED Viewed

	@@ -0,0 +1,111 @@

+import os
+from preprocess_and_prepare_dataset import preprocess_and_push_dataset
+from prepare_pd_df import fetch_misclassified_dataframe
+from dotenv import load_dotenv
+from sqlalchemy import create_engine
+import time
+from sqlalchemy.exc import SQLAlchemyError
+from huggingface_hub import HfApi
+load_dotenv()
+def prepare_datasets():
+    """
+    Fetch misclassified data and preprocess & push datasets for department and urgency.
+    Uses environment variables:
+        - HF_TOKEN
+        - DEPARTMENT_DATASET
+        - URGENCY_DATASET
+        - DB_URL
+        - PREPARE_DATASET_SPACE_ID
+    """
+    # Load configuration from environment variable
+    hf_token = os.getenv("HF_TOKEN", None)
+    dept_dataset_dir = os.getenv("DEPARTMENT_DATASET", None)
+    urgency_dataset_dir = os.getenv("URGENCY_DATASET", None)
+    DB_URL = os.getenv("POSTGRES_URL", None)
+    PREPARE_DATASET_SPACE_ID = os.getenv('PREPARE_DATASET_SPACE_ID', None)
+    # chekcing the envrionment var's
+    if not DB_URL:
+        raise EnvironmentError(f"Environment variable POSTGRES_URL must be set: {DB_URL} ")
+    if not hf_token:
+        raise ValueError(f"HF_TOKEN environment variable is not set:{hf_token}")
+    if not dept_dataset_dir:
+        raise ValueError(f"DEPARTMENT_DATASET environment variable is not set: {dept_dataset_dir}")
+    if not urgency_dataset_dir:
+        raise ValueError(f"URGENCY_DATASET environment variable is not set: {urgency_dataset_dir}")
+    try:
+        # create engine with a pre-ping to avoid stale connections
+        engine = create_engine(DB_URL, pool_pre_ping=True)
+        # validate connection with simple query and a small retry/backoff strategy
+        max_attempts = 3
+        for attempt in range(1, max_attempts + 1):
+            try:
+                with engine.connect() as conn:
+                    conn.exec_driver_sql("SELECT 1")
+                break
+            except SQLAlchemyError as e:
+                if attempt >= max_attempts:
+                    raise RuntimeError(f"Unable to connect to DB after {max_attempts} attempts: {e}")
+                wait = 2 ** attempt
+                print(f"DB connection attempt {attempt} failed: {e}. Retrying in {wait}s...")
+                time.sleep(wait)
+    except Exception as e:
+        raise RuntimeError(f"Error setting up SQLAlchemy engine: {e}")
+    # Mapping label -> dataset directory
+    dataset_mapping = {
+        "department": dept_dataset_dir,
+        "urgency": urgency_dataset_dir
+    }
+    for label, dataset_dir in dataset_mapping.items():
+        try:
+            print(f"Fetching misclassified data for '{label}'...")
+            df = fetch_misclassified_dataframe(label_column=label,
+                                               engine=engine,
+                                               correct_ratio=0.5
+                                               )
+            print(f"Preprocessing and pushing dataset for '{label}' to '{dataset_dir}'...", flush=True)
+            preprocess_and_push_dataset(
+                df=df,
+                hf_token=hf_token,
+                hf_dataset_dir=dataset_dir,
+                label_column=label,
+            )
+            print(f"Successfully processe and pushed '{label}' dataset.\n")
+        except Exception as e:
+            raise RuntimeError(f" Error processing '{label}' dataset: {e}")
+    # pause the space if it was run in the hf_space
+    if PREPARE_DATASET_SPACE_ID:
+        try:
+            print(f"[{time.strftime('%H:%M:%S')}] Attempting to pause Hugging Face Space...", flush=True)
+            api = HfApi()
+            api.pause_space(repo_id=PREPARE_DATASET_SPACE_ID,
+                            token=hf_token)
+            print(f"[{time.strftime('%H:%M:%S')}] Pause command executed.", flush=True)
+        except Exception as e:
+            print(f"[{time.strftime('%Y-%m-%d %H:%M:%S')}] WARNING: Failed to pause HF Space: {e}", flush=True)
+# Script entry point: run the pipeline only when this file is executed
+# directly, not when it is imported as a module.
+if __name__ == "__main__":
+    prepare_datasets()