|
|
from typing import Union, List, Annotated, Dict |
|
|
from pydantic import BaseModel, Field, field_validator, model_validator |
|
|
import re |
|
|
|
|
|
|
|
|
|
|
|
# Compiled once at import time; clean_text may be invoked once per list item.
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
# DOTALL lets a tag whose attributes wrap onto a following line still match.
_HTML_TAG_RE = re.compile(r'<.*?>', re.DOTALL)
_WHITESPACE_RE = re.compile(r'\s+')


def clean_text(text: str) -> str:
    """Clean grievance text by removing HTML tags, URLs and extra whitespace.

    Args:
        text: Raw input text.

    Returns:
        The text with ``<...>`` tags and ``http(s)://`` / ``www.`` URLs
        removed, every run of whitespace (newlines included) collapsed to a
        single space, and leading/trailing whitespace stripped. The result may
        be an empty string when the input contained nothing else.
    """
    # Tags are stripped before URLs: the URL pattern consumes every
    # non-whitespace character, so running it first on
    # '<a href="http://x.com">link</a>' would swallow the closing '">' and
    # leave an unterminated '<a href="' fragment behind.
    text = _HTML_TAG_RE.sub('', text)
    text = _URL_RE.sub('', text)
    # '\s+' already covers '\n', so no separate newline replacement is needed.
    return _WHITESPACE_RE.sub(' ', text).strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TextInput(BaseModel):
    """Input payload holding a single text or a list of texts."""

    text: Annotated[
        Union[str, List[str]],
        Field(
            ...,
            title="Input text(s)",
            description="A single string or a list of non-empty strings representing user input.",
        ),
    ]

    @field_validator("text")
    @classmethod
    def validate_text(cls, value):
        """Reject blank strings, empty lists and lists containing blank items.

        A string value is returned with surrounding whitespace stripped; a
        list is returned unchanged once every item has been checked.

        Raises:
            ValueError: If the string is blank, the list is empty, or any list
                item is not a non-blank string.
            TypeError: If the value is neither a string nor a list.
        """
        if isinstance(value, str):
            value = value.strip()
            if not value:
                raise ValueError("String input cannot be empty.")
        elif isinstance(value, list):
            if not value:
                raise ValueError("List input cannot be empty.")
            for i, v in enumerate(value):
                if not isinstance(v, str) or not v.strip():
                    raise ValueError(f"Item {i} in list is not a valid non-empty string.")
        else:
            # Defensive branch: the field's own type annotation should already
            # have limited the value to str or list by the time this runs.
            raise TypeError("Input must be a string or a list of strings.")
        return value

    @model_validator(mode="after")
    def clean_texts(self):
        """Clean the text(s) with ``clean_text`` and reject empty results.

        Cleaning can reduce an input that passed the blank check (for example
        one consisting only of a URL or an HTML tag) to an empty string, so
        emptiness is checked again here to keep the "non-empty" guarantee.

        Raises:
            ValueError: If the string, or any list item, is empty after
                cleaning.
        """
        if isinstance(self.text, str):
            cleaned = clean_text(self.text)
            if not cleaned:
                raise ValueError("String input cannot be empty after cleaning.")
        else:
            cleaned = [clean_text(t) for t in self.text]
            for i, item in enumerate(cleaned):
                if not item:
                    raise ValueError(f"Item {i} in list is empty after cleaning.")
        self.text = cleaned
        return self

    # Example payloads attached to the generated JSON schema.
    model_config = {
        "json_schema_extra": {
            "examples": [
                {
                    "text": "Where can I get a new water connection?"
                },
                {
                    "text": [
                        "Where can I get a new water connection?",
                        "My streetlight is broken."
                    ]
                }
            ]
        }
    }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class ClassificationOutput(BaseModel):
    # Classification result: the top label, its confidence, and the score of
    # every label. All three fields are required (none declares a default).
    # Documented with a comment rather than a docstring so the generated JSON
    # schema stays exactly as before.

    # Top predicted label.
    label: Annotated[str, Field(description="Top predicted label")]

    # Confidence for `label`, constrained to the closed interval [0, 1].
    confidence: Annotated[
        float,
        Field(ge=0, le=1, description="Confidence score"),
    ]

    # Mapping of label name -> confidence score.
    scores: Annotated[
        Dict[str, float],
        Field(description="All label confidence scores"),
    ]
|
|
|
|
|
|