import os
import sys

# Disable tokenizer parallelism (avoids fork warnings); MPS problems on macOS
# are worked around below by pinning the model to CPU.
if os.getenv("TOKENIZERS_PARALLELISM") is None:
    os.environ["TOKENIZERS_PARALLELISM"] = "false"

import torch
import torch.nn as nn
from transformers import (
    AutoConfig,
    AutoModel,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    PreTrainedModel,
)
class DesklibAIDetectionModel(PreTrainedModel):
    """Desklib AI Detection Model - pre-trained model for AI text detection."""

    config_class = AutoConfig

    def __init__(self, config):
        super().__init__(config)
        # Initialize the base transformer encoder
        self.model = AutoModel.from_config(config)
        # Define a single-logit classifier head
        self.classifier = nn.Linear(config.hidden_size, 1)
        # Initialize weights
        self.init_weights()
    def forward(self, input_ids, attention_mask=None, labels=None):
        # Default to attending over every token if no mask is given
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)

        # Forward pass through the transformer
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs[0]

        # Masked mean pooling over the sequence dimension
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, dim=1)
        sum_mask = torch.clamp(input_mask_expanded.sum(dim=1), min=1e-9)
        pooled_output = sum_embeddings / sum_mask

        # Classifier
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            loss_fct = nn.BCEWithLogitsLoss()
            loss = loss_fct(logits.view(-1), labels.float())

        output = {"logits": logits}
        if loss is not None:
            output["loss"] = loss
        return output
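
# Note: DesklibAIDetectionModel emits a single logit per input; callers convert
# it to P(AI-generated) with a sigmoid, as DetectorModel.predict() does below.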

class DetectorModel:
    def __init__(self, model_name="desklib/ai-text-detector-v1.01", use_desklib=True):
        """
        Initialize the detector model.

        Args:
            model_name: Model name or path. Defaults to the Desklib pre-trained model.
            use_desklib: If True, use the Desklib model architecture.
                If False, use a standard sequence-classification head.
        """
        self.model_name = model_name
        self.use_desklib = use_desklib
        if use_desklib and "desklib" in model_name:
            # Try to load the Desklib model, with a fallback if MPS issues occur
            if sys.platform == "darwin":
                # On macOS: try multiple loading strategies
                try:
                    # Strategy 1: load pinned to CPU with low_cpu_mem_usage
                    print("Attempting to load Desklib model...")
                    self.tokenizer = AutoTokenizer.from_pretrained(model_name)
                    config = AutoConfig.from_pretrained(model_name)
                    try:
                        # Load the base encoder first
                        base_model = AutoModel.from_pretrained(
                            model_name,
                            torch_dtype=torch.float32,
                            low_cpu_mem_usage=True,
                            device_map="cpu",
                        )
                        # Wrap it in the Desklib architecture
                        self.model = DesklibAIDetectionModel(config)
                        self.model.model = base_model
                        self.model = self.model.to("cpu")
                        # Load the classifier-head weights from the checkpoint
                        from transformers.utils import cached_file
                        try:
                            classifier_path = cached_file(model_name, "pytorch_model.bin")
                            state_dict = torch.load(classifier_path, map_location="cpu")
                            # Keep only the classifier weights
                            classifier_dict = {k: v for k, v in state_dict.items() if "classifier" in k}
                            if classifier_dict:
                                self.model.load_state_dict(classifier_dict, strict=False)
                        except Exception:
                            pass  # Fall back to the freshly initialized classifier
                        self.model.eval()
                        print("✅ Desklib model loaded successfully!")
                    except Exception as e:
                        print(f"⚠️ Desklib model loading failed: {e}")
                        print("Falling back to DistilBERT model...")
                        raise
                except Exception:
                    # Fall back to a smaller, simpler model
                    print("Using DistilBERT as fallback (smaller, more compatible)")
                    self.use_desklib = False
                    self.model = AutoModelForSequenceClassification.from_pretrained(
                        "distilbert-base-uncased",
                        num_labels=2,
                    )
                    self.tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
                    self.model = self.model.to("cpu")
            else:
                # Non-macOS: standard loading
                self.tokenizer = AutoTokenizer.from_pretrained(model_name)
                self.model = DesklibAIDetectionModel.from_pretrained(model_name)
        else:
            # Fall back to a standard sequence-classification model
            self.model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
            self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
            self.use_desklib = False
    def predict(self, text, max_length=768, threshold=0.5):
        """
        Predict whether a text is AI-generated.

        Args:
            text: Input text to classify.
            max_length: Maximum sequence length.
            threshold: Probability threshold for classification.

        Returns:
            tuple: (probability, label), where label is 1 for AI-generated and 0 for human.
        """
        # Tokenize
        encoded = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=max_length,
            return_tensors='pt',
        )
        input_ids = encoded['input_ids']
        attention_mask = encoded['attention_mask']
        # Move inputs to the model's device
        device = next(self.model.parameters()).device
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

        # Predict
        self.model.eval()
        with torch.no_grad():
            if self.use_desklib:
                # Desklib head: one logit -> sigmoid gives P(AI-generated)
                outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
                logits = outputs["logits"]
                probability = torch.sigmoid(logits).item()
            else:
                # Standard two-class head: softmax, where probs[0] = human, probs[1] = AI
                outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
                probs = torch.softmax(outputs.logits, dim=1)
                probability = probs[0][1].item()

        label = 1 if probability >= threshold else 0
        return probability, label
    def save(self, path: str):
        self.model.save_pretrained(path)
        self.tokenizer.save_pretrained(path)
    @classmethod
    def load(cls, path: str):
        # Try to detect whether the checkpoint uses the Desklib architecture
        try:
            config = AutoConfig.from_pretrained(path)
            # The Desklib detector is DeBERTa-based, so a DeBERTa config implies it
            if hasattr(config, 'model_type') and 'deberta' in config.model_type.lower():
                model = DesklibAIDetectionModel.from_pretrained(path)
                tokenizer = AutoTokenizer.from_pretrained(path)
                obj = cls.__new__(cls)
                obj.model_name = path
                obj.model = model
                obj.tokenizer = tokenizer
                obj.use_desklib = True
                return obj
        except Exception:
            pass
        # Fall back to a standard sequence-classification model
        model = AutoModelForSequenceClassification.from_pretrained(path)
        tokenizer = AutoTokenizer.from_pretrained(path, use_fast=True)
        obj = cls.__new__(cls)
        obj.model_name = path
        obj.model = model
        obj.tokenizer = tokenizer
        obj.use_desklib = False
        return obj
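
# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only): the sample string and the
# "./detector_checkpoint" path are placeholders, not part of the model API.
# It instantiates the detector, scores one text, and round-trips save()/load().
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    detector = DetectorModel()  # downloads desklib/ai-text-detector-v1.01 on first use

    sample = "The quick brown fox jumps over the lazy dog."
    probability, label = detector.predict(sample)
    print(f"P(AI-generated) = {probability:.3f} -> {'AI-generated' if label == 1 else 'human'}")

    # Persist the detector and reload it
    detector.save("./detector_checkpoint")
    restored = DetectorModel.load("./detector_checkpoint")
    print(restored.predict(sample))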