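"""Gradio Space for full fine-tuning of Salesforce/CoDA-v0-Instruct on the
baseten-admin/gpt-oss120b-generated-perfectblend dataset, with checkpoint
resume support and upload to the Hugging Face Hub."""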
import gradio as gr
import torch
from transformers import (
    AutoModel,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    TrainerCallback,
    DataCollatorForLanguageModeling,
)
from datasets import load_dataset
from huggingface_hub import HfApi, login, whoami
import os
import pickle
from datetime import datetime
from pathlib import Path


class CoDATrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """
        Custom loss computation for the CoDA diffusion model.
        CoDA returns a dict with a 'loss' key instead of a scalar.
        """
        outputs = model(**inputs)

        if isinstance(outputs, dict) and 'loss' in outputs:
            loss = outputs['loss']
        elif hasattr(outputs, 'loss'):
            loss = outputs.loss
        else:
            # Fallback: compute a plain cross-entropy loss over the logits.
            labels = inputs.get('labels')
            logits = outputs.get('logits') if isinstance(outputs, dict) else outputs[0]
            loss_fct = torch.nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))

        # Some implementations return a per-token loss tensor; reduce to a scalar.
        if loss.dim() > 0:
            loss = loss.mean()

        return (loss, outputs) if return_outputs else loss


def preprocess_conversations(examples, tokenizer):
    """Convert ChatML-style conversations to plain text for training."""
    texts = []
    for conv in examples['conversations']:
        if not isinstance(conv, list):
            raise ValueError(f"Expected conversation to be a list, got {type(conv)}")

        text = ""
        for message in conv:
            if not isinstance(message, dict):
                raise ValueError(f"Expected message to be a dict, got {type(message)}")

            role = message.get('role', '')
            content = message.get('content', '')
            if role == 'user':
                text += f"<|user|>\n{content}\n"
            elif role == 'assistant':
                text += f"<|assistant|>\n{content}\n"
        texts.append(text)

    return tokenizer(texts, truncation=True, max_length=2048, padding=False)


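# Illustrative example of the record shape preprocess_conversations expects
# (assumed from the code above, not verified against the actual dataset):
#   {"conversations": [[{"role": "user", "content": "Hi"},
#                       {"role": "assistant", "content": "Hello!"}]]}
# which renders as "<|user|>\nHi\n<|assistant|>\nHello!\n" before tokenization.
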
# Prefer the persistent /data volume (Docker mount or Spaces persistent storage) when available.
CHECKPOINT_DIR = Path("/data/checkpoints") if Path("/data").exists() else Path("./checkpoints")
MODEL_DIR = Path("/data/models") if Path("/data").exists() else Path("./models")
CHECKPOINT_DIR.mkdir(parents=True, exist_ok=True)
MODEL_DIR.mkdir(parents=True, exist_ok=True)

STATE_FILE = CHECKPOINT_DIR / "training_state.pkl"

def save_training_state(state):
    """Save training state to persistent storage."""
    with open(STATE_FILE, 'wb') as f:
        pickle.dump(state, f)


def load_training_state():
    """Load training state from persistent storage."""
    if STATE_FILE.exists():
        with open(STATE_FILE, 'rb') as f:
            return pickle.load(f)
    return None

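# The state dict written by StateSavingCallback below looks like, e.g.:
#   {"step": 500, "epoch": 1.25, "best_metric": 2.31}
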
def train_model(epochs, batch_size, learning_rate, resume=False, progress=gr.Progress()):
    try:
        # If resuming, surface the last saved step in the progress bar.
        if resume:
            saved_state = load_training_state()
            if saved_state:
                progress(0, desc=f"Resuming from step {saved_state.get('step', 0)}...")

        progress(0, desc="Initializing training...")

        device = "cuda" if torch.cuda.is_available() else "cpu"
        if device == "cpu":
            return "⚠️ Warning: Training on CPU will be very slow. Please upgrade the Space to GPU."

        progress(0.1, desc="Loading model and tokenizer...")

        model_name = "Salesforce/CoDA-v0-Instruct"
        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
        model = AutoModel.from_pretrained(
            model_name,
            trust_remote_code=True,
            torch_dtype=torch.float16 if device == "cuda" else torch.float32,
        )

        if device == "cuda":
            model = model.to(device)

        # The tokenizer may not define a pad token; fall back to EOS.
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
            model.config.pad_token_id = tokenizer.eos_token_id

        progress(0.2, desc="Loading dataset...")

        dataset = load_dataset("baseten-admin/gpt-oss120b-generated-perfectblend", split="train")

        if 'conversations' not in dataset.column_names:
            return f"❌ Error: Dataset does not have a 'conversations' column. Found columns: {dataset.column_names}"

        progress(0.3, desc="Preprocessing dataset...")

        tokenized_dataset = dataset.map(
            lambda x: preprocess_conversations(x, tokenizer),
            batched=True,
            remove_columns=dataset.column_names,
        )

        # Hold out 10% of the data for evaluation.
        train_test_split = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
        train_dataset = train_test_split['train']
        eval_dataset = train_test_split['test']

        progress(0.4, desc="Setting up training configuration...")

        output_dir = str(MODEL_DIR / "coda-finetuned")
        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=epochs,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            learning_rate=learning_rate,
            warmup_steps=100,
            logging_steps=5,
            logging_first_step=True,
            eval_strategy="steps",
            eval_steps=100,
            save_strategy="steps",
            save_steps=500,
            save_total_limit=2,
            fp16=(device == "cuda"),
            gradient_accumulation_steps=4,
            gradient_checkpointing=False,
            optim="adamw_torch",
            report_to="none",
            load_best_model_at_end=True,
            metric_for_best_model="loss",
            greater_is_better=False,
        )

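        # With gradient_accumulation_steps=4, the effective batch size is
        # 4 * per_device_train_batch_size per device (e.g. batch_size=2 -> 8).
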
        # mlm=False tells the collator to build causal-LM style batches, where
        # labels are a padded copy of input_ids.
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer,
            mlm=False,
        )

        trainer = CoDATrainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            data_collator=data_collator,
        )

        progress(0.5, desc=f"Training for {epochs} epochs...")

        class GradioProgressCallback(TrainerCallback):
            """Report step, loss, and learning rate to the Gradio progress bar."""

            def __init__(self, progress_fn):
                self.progress_fn = progress_fn

            def on_log(self, args, state, control, logs=None, **kwargs):
                if logs and state.max_steps > 0:
                    log_str = f"Step {state.global_step}/{state.max_steps}: "
                    if 'loss' in logs:
                        log_str += f"loss={logs['loss']:.4f} "
                    if 'learning_rate' in logs:
                        log_str += f"lr={logs['learning_rate']:.2e}"
                    # Map training progress onto the 0.5-0.9 slice of the bar.
                    progress_value = 0.5 + (0.4 * state.global_step / state.max_steps)
                    self.progress_fn(progress_value, desc=log_str)

        class StateSavingCallback(TrainerCallback):
            """Persist lightweight resume metadata alongside each checkpoint."""

            def on_save(self, args, state, control, **kwargs):
                save_training_state({
                    'step': state.global_step,
                    'epoch': state.epoch,
                    'best_metric': state.best_metric,
                })

        trainer.add_callback(GradioProgressCallback(progress))
        trainer.add_callback(StateSavingCallback())

        # Checkpoints are written as {output_dir}/checkpoint-<step>; resume from the latest one.
        resume_from_checkpoint = None
        if resume:
            checkpoints = list(Path(output_dir).glob("checkpoint-*"))
            if checkpoints:
                latest_checkpoint = max(checkpoints, key=lambda x: int(x.name.split("-")[1]))
                resume_from_checkpoint = str(latest_checkpoint)
                progress(0, desc=f"Resuming from {latest_checkpoint.name}...")

        trainer.train(resume_from_checkpoint=resume_from_checkpoint)

        progress(0.9, desc="Saving model...")

        trainer.save_model(output_dir)
        tokenizer.save_pretrained(output_dir)

        progress(1.0, desc="Training complete!")

        final_loss = trainer.state.log_history[-1].get('loss', 'N/A') if trainer.state.log_history else 'N/A'
        return (
            f"✅ Training completed successfully!\n"
            f"Model saved to: {output_dir}\n\n"
            f"Final training loss: {final_loss}"
        )

    except Exception as e:
        return f"❌ Error during training: {str(e)}"


def upload_to_hub(repo_name, oauth_token: gr.OAuthToken | None, progress=gr.Progress()):
    try:
        if oauth_token is None:
            return "❌ Please log in first to upload the model!"

        progress(0, desc="Authenticating...")

        login(token=oauth_token.token)
        user_info = whoami(oauth_token.token)
        username = user_info['name']

        progress(0.2, desc="Preparing model for upload...")

        # Auto-generate a repository name if none was provided.
        if not repo_name:
            timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
            repo_name = f"coda-finetuned-{timestamp}"

        repo_id = f"{username}/{repo_name}"

        progress(0.3, desc=f"Creating repository {repo_id}...")

        api = HfApi()
        api.create_repo(repo_id=repo_id, exist_ok=True, token=oauth_token.token, repo_type="model")

        progress(0.5, desc="Uploading model files...")

        # Upload from the same directory train_model saved to (under MODEL_DIR, not the CWD).
        model_dir = str(MODEL_DIR / "coda-finetuned")
        if not os.path.exists(model_dir):
            return "❌ No trained model found! Please train a model first."

        api.upload_folder(
            folder_path=model_dir,
            repo_id=repo_id,
            repo_type="model",
            token=oauth_token.token,
        )

        progress(1.0, desc="Upload complete!")

        return f"✅ Model successfully uploaded to: https://huggingface.co/{repo_id}"

    except Exception as e:
        return f"❌ Error during upload: {str(e)}"


with gr.Blocks(title="CoDA Fine-tuning Space") as demo:
    gr.Markdown("""
    # 🚀 CoDA Model Fine-tuning Space

    This Space fine-tunes the **Salesforce/CoDA-v0-Instruct** diffusion model on the **baseten-admin/gpt-oss120b-generated-perfectblend** dataset.

    ### Steps:
    1. **Login** with your Hugging Face account (required for upload)
    2. **Configure** training parameters
    3. **Train** the model (requires GPU - upgrade the Space if needed)
    4. **Upload** the trained model to your account

    ⚠️ **Note**:
    - Full fine-tuning requires significant GPU resources. Training may take several hours.
    - **Checkpoints are saved every 500 steps**, so you can resume if training is interrupted.
    - For Docker: mount a `/data` volume for full persistence across container restarts.
    - On Spaces: checkpoints persist within a session, and across rebuilds with persistent storage.
    """)

    with gr.Row():
        login_button = gr.LoginButton()

    gr.Markdown("## Training Configuration")

    with gr.Row():
        with gr.Column():
            epochs = gr.Slider(minimum=1, maximum=10, value=3, step=1, label="Epochs")
            batch_size = gr.Slider(minimum=1, maximum=8, value=2, step=1, label="Batch Size per Device")
            learning_rate = gr.Slider(minimum=1e-6, maximum=1e-4, value=2e-5, step=1e-6, label="Learning Rate", info="Default: 2e-5")
            resume_training = gr.Checkbox(label="Resume from last checkpoint", value=False, info="Check if training was interrupted")

    with gr.Row():
        train_button = gr.Button("🎯 Start Training", variant="primary", size="lg")

    training_output = gr.Textbox(label="Training Status", lines=5)

    gr.Markdown("## Upload Trained Model")

    with gr.Row():
        repo_name = gr.Textbox(label="Model Repository Name", placeholder="coda-finetuned-v1", info="Leave empty for an auto-generated name")

    with gr.Row():
        upload_button = gr.Button("🤗 Upload to Hugging Face Hub", variant="secondary", size="lg")

    upload_output = gr.Textbox(label="Upload Status", lines=3)

    gr.Markdown("""
    ---
    ### About

    **CoDA** is a 1.7B-parameter bidirectional diffusion model for text generation.
    This Space performs full fine-tuning on conversational data in ChatML format.

    **Dataset**: Training uses the `conversations` column from the dataset, which contains question-answer pairs.

    **Hardware**: A GPU (T4 or better) is strongly recommended; CPU training will be extremely slow.
    """)

    train_button.click(
        fn=train_model,
        inputs=[epochs, batch_size, learning_rate, resume_training],
        outputs=training_output,
    )

    upload_button.click(
        fn=upload_to_hub,
        # The gr.OAuthToken parameter is injected automatically by Gradio after
        # login, so only repo_name is wired up as an input here.
        inputs=[repo_name],
        outputs=upload_output,
    )

if __name__ == "__main__":
    demo.launch()
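# A minimal sketch of reloading the fine-tuned weights afterwards (the path
# assumes the default output_dir used above; without /data it is ./models):
#   from transformers import AutoModel, AutoTokenizer
#   model = AutoModel.from_pretrained("/data/models/coda-finetuned", trust_remote_code=True)
#   tokenizer = AutoTokenizer.from_pretrained("/data/models/coda-finetuned")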