from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)

# Load the Arabic WikiHow summarization dataset
dataset = load_dataset("Abdelkareem/wikihow-arabic-summarization")

# Load the AraT5 model and tokenizer
model_name = "UBC-NLP/AraT5v2-base-1024"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Preprocessing function: tokenize articles (inputs) and summaries (targets)
def preprocess_function(examples):
    inputs = examples["article"]
    targets = examples["summarize"]  # adjust if the dataset's summary column has a different name, e.g. "summary"
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)
    # Tokenize targets via text_target so the tokenizer prepares them as labels
    labels = tokenizer(text_target=targets, max_length=150, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply preprocessing to every split of the dataset
tokenized_datasets = dataset.map(preprocess_function, batched=True)

# Data collator that dynamically pads inputs and labels per batch
# (labels are padded with -100 so padding is ignored by the loss)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Define training arguments for sequence-to-sequence fine-tuning
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
)

# Initialize the Seq2SeqTrainer; the collator is required so that
# variable-length tokenized examples can be batched together
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Start the training process
trainer.train()
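
# --- Inference sketch (not part of the original script) ---
# A minimal example of summarizing one validation article with the fine-tuned
# model, assuming `model`, `tokenizer`, and `dataset` from above are still in
# memory. The generation settings (max_length, num_beams) are illustrative
# assumptions, not values prescribed by the training script.
sample_article = dataset["validation"][0]["article"]
inputs = tokenizer(
    sample_article, max_length=1024, truncation=True, return_tensors="pt"
).to(model.device)
summary_ids = model.generate(**inputs, max_length=150, num_beams=4)
print(tokenizer.decode(summary_ids[0], skip_special_tokens=True))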