from typing import List
import pandas as pd
from transformers import AutoTokenizer
SUPPORTED_TEXT_COLUMNS = ["text", "content", "body", "essay", "prompt"]
# Try common label column names; map to 0 (human), 1 (ai)
LABEL_MAPPINGS = {
    "label": None,   # already 0/1 or a string label
    "target": None,
    "class": None,
    "is_ai": None,
}
def _normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Rename the text column to "text" and map labels to 0 (human) / 1 (ai)."""
    # Find the text column among the supported names.
    text_col = None
    for c in SUPPORTED_TEXT_COLUMNS:
        if c in df.columns:
            text_col = c
            break
    if text_col is None:
        raise ValueError(f"Could not find a text column among: {SUPPORTED_TEXT_COLUMNS}")
    df = df.rename(columns={text_col: "text"})

    # Find the label column among the known names.
    label_col = None
    for c in LABEL_MAPPINGS.keys():
        if c in df.columns:
            label_col = c
            break
    if label_col is None:
        # Heuristic fallback: columns named like 'ai'/'human'/'source'.
        for c in df.columns:
            if str(c).lower() in ("ai", "human", "source"):
                label_col = c
                break
    if label_col is None:
        raise ValueError("Could not find a label column. Expected one of: "
                         f"{list(LABEL_MAPPINGS.keys())} or something like ['ai','human','source'].")

    # Normalize labels (0 = human, 1 = ai).
    def to01(v):
        if isinstance(v, str):
            v_low = v.strip().lower()
            if v_low in ("ai", "machine", "generated", "gpt", "llm", "chatgpt"):
                return 1
            if v_low in ("human", "person", "authored", "real"):
                return 0
        try:
            iv = int(v)
            if iv in (0, 1):
                return iv
        except (TypeError, ValueError):
            pass
        # Fallback: treat anything unrecognized as AI-generated.
        return 1

    df["label"] = df[label_col].apply(to01)
    df = df[["text", "label"]].dropna()
    df = df[df["text"].astype(str).str.strip() != ""]
    return df
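
# Example (hypothetical data): a frame with columns ["essay", "source"], where
# "source" holds "human" / "gpt" strings, comes back with exactly two columns:
#   text  (renamed from "essay")
#   label (0 for "human", 1 for "gpt", via to01)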
class DatasetLoader:
    """Loads a CSV/JSON(L) dataset, normalizes its columns, and tokenizes text."""

    def __init__(self, model_name: str = "roberta-base", max_length: int = 256):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
        self.max_length = max_length

    def load(self, path) -> pd.DataFrame:
        path = str(path)
        if path.endswith(".csv"):
            df = pd.read_csv(path)
        elif path.endswith(".jsonl") or path.endswith(".json"):
            # JSONL is one record per line; plain JSON is a single array/object.
            df = pd.read_json(path, lines=path.endswith(".jsonl"))
        else:
            raise ValueError(f"Unsupported file format: {path}")
        return _normalize_columns(df)

    def tokenize(self, texts: List[str]):
        return self.tokenizer(
            texts,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
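

if __name__ == "__main__":
    # Minimal usage sketch. The file path below is hypothetical; any CSV with a
    # supported text column and a recognized label column should work.
    loader = DatasetLoader(model_name="roberta-base", max_length=256)
    df = loader.load("data/train.csv")
    batch = loader.tokenize(df["text"].head(8).tolist())
    print(df["label"].value_counts())                 # class balance after normalization
    print({k: v.shape for k, v in batch.items()})     # e.g. input_ids: (8, 256)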