""" |
|
|
Fine-tune a quantized Qwen2.5:7b model using SFT + LoRA on expanded preventative health prompts. |
|
|
""" |
|
|
|
|
|
|
|
|
import json
import time

import torch
from datasets import Dataset
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training
from transformers import (AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig,
                          DataCollatorForSeq2Seq, Trainer, TrainingArguments)
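
# Configuration: data file, base model, output path, and SFT/LoRA hyperparameters.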
SEED_DATA_FILE = "expanded_templates.json"
MODEL_NAME = "Qwen/Qwen2.5-7B"
OUTPUT_DIR = "./qwen_lora_adapter"
BATCH_SIZE = 4
EPOCHS = 3
LEARNING_RATE = 2e-4
MAX_LENGTH = 512
LORA_RANK = 16
LORA_ALPHA = 32
LORA_DROPOUT = 0.05

print("🚀 Starting fine-tuning pipeline for Qwen2.5 with LoRA...")
start_time = time.time()
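
# Step 1: load the expanded prompt/response records from disk.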
print("π Loading dataset from:", SEED_DATA_FILE) |
|
|
with open(SEED_DATA_FILE, "r", encoding="utf-8") as f: |
|
|
data = json.load(f) |
|
|
print(f"β
Loaded {len(data)} samples.") |
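
# Step 2: wrap the records in a Hugging Face Dataset. Missing "topic" fields
# default to "general" so every example carries the same three keys.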
dataset = Dataset.from_list([{
    "prompt": entry["prompt"],
    "response": entry["response"],
    "topic": entry.get("topic", "general")
} for entry in data])
print("✅ Converted to Hugging Face Dataset.")
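
# Step 3: build the SFT text for each example (Alpaca-style Instruction/Response
# blocks prefixed with the topic) and tokenize to a fixed MAX_LENGTH. Labels are
# a straight copy of input_ids, so loss is computed over the whole sequence,
# padding included; masking pad positions to -100 is a common refinement that
# this script does not apply.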
def tokenize_function(examples, tokenizer):
    texts = [
        f"### Topic: {t}\n### Instruction:\n{p}\n\n### Response:\n{r}"
        for p, r, t in zip(examples["prompt"], examples["response"], examples["topic"])
    ]
    tokenized = tokenizer(
        texts,
        max_length=MAX_LENGTH,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )
    tokenized["labels"] = tokenized["input_ids"].clone()
    return tokenized
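
# Step 4: load the tokenizer and the base model. The model is loaded in 8-bit
# via bitsandbytes (BitsAndBytesConfig), so the 7B weights take roughly half
# the memory of fp16; non-quantized modules stay in bfloat16.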
print("π§ Loading tokenizer and 8-bit quantized model:", MODEL_NAME) |
|
|
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True) |
|
|
model = AutoModelForCausalLM.from_pretrained( |
|
|
MODEL_NAME, |
|
|
device_map="auto", |
|
|
torch_dtype=torch.bfloat16 |
|
|
) |
|
|
print("β
Model and tokenizer loaded successfully.") |
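
# Step 5: attach LoRA adapters. Only the rank-16 A/B matrices on the q_proj and
# v_proj attention projections are trainable; the quantized base weights stay
# frozen. prepare_model_for_kbit_training() makes the 8-bit model safe to
# fine-tune (e.g. it enables input gradients for the adapter layers).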
print("βοΈ Configuring LoRA adapters...") |
|
|
lora_config = LoraConfig( |
|
|
task_type=TaskType.CAUSAL_LM, |
|
|
r=LORA_RANK, |
|
|
lora_alpha=LORA_ALPHA, |
|
|
lora_dropout=LORA_DROPOUT, |
|
|
target_modules=["q_proj", "v_proj"] |
|
|
) |
|
|
|
|
|
model = get_peft_model(model, lora_config) |
|
|
model.print_trainable_parameters() |
|
|
print("β
LoRA configuration complete.") |
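
# Step 6: tokenize the dataset in batches and set up the collator. Every example
# is already padded to MAX_LENGTH, so the collator mostly just stacks features
# into tensors (pad_to_multiple_of=8 is a no-op at a fixed length of 512).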
print("π§© Tokenizing dataset... (this might take a while)") |
|
|
tokenized_dataset = dataset.map(lambda x: tokenize_function(x, tokenizer), batched=True) |
|
|
data_collator = DataCollatorForSeq2Seq(tokenizer, pad_to_multiple_of=8, return_tensors="pt") |
|
|
print("β
Dataset tokenized and ready for training.") |
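
# Step 7: training arguments. With a per-device batch size of 4 and gradient
# accumulation of 2, the effective batch size is 8 sequences per optimizer step.
# bf16 mixed precision matches the bfloat16 dtype used when loading the model.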
print("π Setting up training arguments...") |
|
|
training_args = TrainingArguments( |
|
|
output_dir=OUTPUT_DIR, |
|
|
per_device_train_batch_size=BATCH_SIZE, |
|
|
gradient_accumulation_steps=2, |
|
|
learning_rate=LEARNING_RATE, |
|
|
num_train_epochs=EPOCHS, |
|
|
logging_steps=10, |
|
|
save_strategy="epoch", |
|
|
fp16=True, |
|
|
save_total_limit=3, |
|
|
report_to="none", |
|
|
) |
|
|
print("β
Training arguments configured.") |
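
# Step 8: train. The Trainer handles batching, mixed precision, logging every
# 10 steps, and per-epoch checkpoints under OUTPUT_DIR.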
print("π Starting training...") |
|
|
trainer = Trainer( |
|
|
model=model, |
|
|
args=training_args, |
|
|
train_dataset=tokenized_dataset, |
|
|
tokenizer=tokenizer, |
|
|
data_collator=data_collator |
|
|
) |
|
|
|
|
|
train_start = time.time() |
|
|
trainer.train() |
|
|
print(f"β
Training completed in {(time.time() - train_start)/60:.2f} minutes.") |
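
# Step 9: save only the LoRA adapter (save_pretrained on a PEFT model writes the
# adapter weights and config, not the full 7B base model).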
print("πΎ Saving LoRA adapter...") |
|
|
model.save_pretrained(OUTPUT_DIR) |
|
|
print(f"β
LoRA adapter saved at: {OUTPUT_DIR}") |
print(f"π All done! Total pipeline time: {(time.time() - start_time)/60:.2f} minutes.") |