from typing import List

import pandas as pd
from transformers import AutoTokenizer

# Column names accepted as the text field, checked in order.
SUPPORTED_TEXT_COLUMNS = ["text", "content", "body", "essay", "prompt"]

# Try common label column names; values are mapped to 0 (human) / 1 (ai).
LABEL_MAPPINGS = {
    "label": None,   # already 0/1 or a string
    "target": None,
    "class": None,
    "is_ai": None,
}
def _normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Rename the detected text column to 'text' and map labels to 0/1."""
    # Find the text column.
    text_col = None
    for c in SUPPORTED_TEXT_COLUMNS:
        if c in df.columns:
            text_col = c
            break
    if text_col is None:
        raise ValueError(f"Could not find a text column among: {SUPPORTED_TEXT_COLUMNS}")
    df = df.rename(columns={text_col: "text"})

    # Find the label column.
    label_col = None
    for c in LABEL_MAPPINGS.keys():
        if c in df.columns:
            label_col = c
            break
    if label_col is None:
        # Heuristic fallback: columns named like 'ai' / 'human' / 'source'.
        for c in df.columns:
            if str(c).lower() in ("ai", "human", "source"):
                label_col = c
                break
    if label_col is None:
        raise ValueError("Could not find a label column. Expected one of: "
                         f"{list(LABEL_MAPPINGS.keys())} or something like ['ai', 'human', 'source'].")

    # Normalize labels (0 = human, 1 = ai).
    def to01(v):
        if isinstance(v, str):
            v_low = v.strip().lower()
            if v_low in ("ai", "machine", "generated", "gpt", "llm", "chatgpt"):
                return 1
            if v_low in ("human", "person", "authored", "real"):
                return 0
        try:
            iv = int(v)
            if iv in (0, 1):
                return iv
        except (TypeError, ValueError):
            pass
        # Fallback: treat anything unrecognized as AI-generated.
        return 1

    df["label"] = df[label_col].apply(to01)
    df = df[["text", "label"]].dropna()
    df = df[df["text"].astype(str).str.strip() != ""]
    return df
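
# Illustrative sanity check for _normalize_columns (assumed example values,
# not taken from any real dataset): a frame with columns ('content', 'source')
#   content="Hello world", source="gpt"   ->  text="Hello world", label=1
#   content="Dear diary",  source="human" ->  text="Dear diary",  label=0
# Frames with no recognizable text or label column raise ValueError.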
class DatasetLoader:
    """Loads a CSV/JSON(L) dataset and tokenizes text for a transformer model."""

    def __init__(self, model_name: str = "roberta-base", max_length: int = 256):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
        self.max_length = max_length

    def load(self, path) -> pd.DataFrame:
        # Dispatch on file extension; JSONL is read line-by-line.
        if str(path).endswith(".csv"):
            df = pd.read_csv(path)
        elif str(path).endswith(".jsonl") or str(path).endswith(".json"):
            df = pd.read_json(path, lines=str(path).endswith(".jsonl"))
        else:
            raise ValueError(f"Unsupported file format: {path}")
        return _normalize_columns(df)

    def tokenize(self, texts: List[str]):
        # Pad/truncate every example to max_length and return PyTorch tensors.
        return self.tokenizer(
            texts,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
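
# Usage sketch. The file path "data/train.csv" and its contents are
# assumptions for illustration only, not files shipped with this module.
if __name__ == "__main__":
    loader = DatasetLoader(model_name="roberta-base", max_length=256)
    df = loader.load("data/train.csv")            # normalized to text/label
    print(df["label"].value_counts())             # class balance check
    batch = loader.tokenize(df["text"].tolist()[:8])
    print(batch["input_ids"].shape)               # -> torch.Size([8, 256])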