import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel

# Analytic writing dimensions scored per essay; train.csv is expected to have
# one column for each of these, plus a 'full_text' column with the essay text.
DIMENSIONS = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']

class EssayDataset(Dataset):
    """Wraps essay texts and their per-dimension scores for BERT fine-tuning."""

    def __init__(self, texts, targets, tokenizer, max_len):
        self.texts = texts
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = self.texts[item]
        target = self.targets[item]
        # Tokenize to a fixed length so examples batch cleanly.
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.float)
        }

class EssayScoreRegressor(nn.Module):
    """BERT encoder with a dropout-regularized linear head for multi-output regression."""

    def __init__(self, n_outputs):
        super(EssayScoreRegressor, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_outputs)

    def forward(self, input_ids, attention_mask):
        # Use BERT's pooled [CLS] representation as the whole-essay embedding.
        pooled_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )['pooler_output']
        output = self.drop(pooled_output)
        return self.out(output)

def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler):
    model = model.train()
    losses = []
    for d in data_loader:
        input_ids = d['input_ids'].to(device)
        attention_mask = d['attention_mask'].to(device)
        targets = d['targets'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs, targets)
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
    return np.mean(losses)
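
# Validation counterpart to train_epoch: a minimal sketch assuming the same
# MSE criterion as training; gradients are disabled since no weights update.
def eval_epoch(model, data_loader, loss_fn, device):
    model = model.eval()
    losses = []
    with torch.no_grad():
        for d in data_loader:
            input_ids = d['input_ids'].to(device)
            attention_mask = d['attention_mask'].to(device)
            targets = d['targets'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            losses.append(loss_fn(outputs, targets).item())
    return np.mean(losses)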

def train_model(train_data, val_data, tokenizer, model, optimizer, scheduler, device, epochs, batch_size, max_len):
    train_dataset = EssayDataset(
        texts=train_data['full_text'].to_numpy(),
        targets=train_data[DIMENSIONS].to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
    )
    val_dataset = EssayDataset(
        texts=val_data['full_text'].to_numpy(),
        targets=val_data[DIMENSIONS].to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
    )
    train_data_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True
    )
    val_data_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=False
    )
    loss_fn = nn.MSELoss().to(device)
    for epoch in range(epochs):
        print(f'Epoch {epoch + 1}/{epochs}')
        print('-' * 10)
        train_loss = train_epoch(
            model,
            train_data_loader,
            loss_fn,
            optimizer,
            device,
            scheduler
        )
        print(f'Train loss {train_loss}')
        # Track generalization on the held-out split after each epoch.
        val_loss = eval_epoch(model, val_data_loader, loss_fn, device)
        print(f'Val loss {val_loss}')
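    # Optionally persist the fine-tuned weights once training ends; the
    # checkpoint filename is an assumption, not part of the original script.
    torch.save(model.state_dict(), 'essay_score_regressor.pt')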

if __name__ == "__main__":
    df = pd.read_csv('train.csv')
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = EssayScoreRegressor(n_outputs=len(DIMENSIONS))
    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=2e-5)
    # Roughly the total number of training batches over all epochs; note it is
    # computed from the full dataframe, not just the 80% training split.
    total_steps = len(df) // 16 * 5
    # With step_size equal to total_steps, the 0.1 decay fires only at the very
    # end of training, so the learning rate stays effectively constant.
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=total_steps, gamma=0.1)
    train_data = df.sample(frac=0.8, random_state=42)
    val_data = df.drop(train_data.index)
    train_model(train_data, val_data, tokenizer, model, optimizer, scheduler, device, epochs=5, batch_size=16, max_len=160)
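
    # Sketch of single-essay inference with the fine-tuned model; the sample
    # text is made up, and max_len mirrors the training setting above.
    model.eval()
    sample_text = "Technology has changed the way students learn and communicate."
    enc = tokenizer.encode_plus(
        sample_text,
        add_special_tokens=True,
        max_length=160,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
    with torch.no_grad():
        preds = model(
            input_ids=enc['input_ids'].to(device),
            attention_mask=enc['attention_mask'].to(device)
        )
    # One predicted score per rubric dimension.
    print(dict(zip(DIMENSIONS, preds.squeeze(0).tolist())))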