# train3/app.py
import gradio as gr
import torch
from transformers import (
    AutoModel,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    TrainerCallback,
    DataCollatorForLanguageModeling
)
from datasets import load_dataset
from huggingface_hub import HfApi, login, whoami
import os
from datetime import datetime
import json
import pickle
from pathlib import Path
# Custom Trainer for CoDA model
class CoDATrainer(Trainer):
def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
"""
Custom loss computation for CoDA diffusion model.
CoDA returns a dict with 'loss' key instead of a scalar.
"""
outputs = model(**inputs)
# CoDA model returns a dict with 'loss' key
if isinstance(outputs, dict) and 'loss' in outputs:
loss = outputs['loss']
elif hasattr(outputs, 'loss'):
loss = outputs.loss
else:
# Fallback: compute standard LM loss
labels = inputs.get('labels')
logits = outputs.get('logits') if isinstance(outputs, dict) else outputs[0]
loss_fct = torch.nn.CrossEntropyLoss()
loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
# Ensure loss is a scalar
if loss.dim() > 0:
loss = loss.mean()
return (loss, outputs) if return_outputs else loss
def preprocess_conversations(examples, tokenizer):
"""Convert ChatML-style conversations to text for training"""
texts = []
for conv in examples['conversations']:
# Format: [{"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]
if not isinstance(conv, list):
raise ValueError(f"Expected conversation to be a list, got {type(conv)}")
text = ""
for message in conv:
if not isinstance(message, dict):
raise ValueError(f"Expected message to be a dict, got {type(message)}")
role = message.get('role', '')
content = message.get('content', '')
if role == 'user':
text += f"<|user|>\n{content}\n"
elif role == 'assistant':
text += f"<|assistant|>\n{content}\n"
texts.append(text)
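    # Tokenize without padding; DataCollatorForLanguageModeling(mlm=False) pads each batch and copies input_ids into labels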
return tokenizer(texts, truncation=True, max_length=2048, padding=False)
# Persistent storage paths
CHECKPOINT_DIR = Path("/data/checkpoints") if Path("/data").exists() else Path("./checkpoints")
MODEL_DIR = Path("/data/models") if Path("/data").exists() else Path("./models")
CHECKPOINT_DIR.mkdir(parents=True, exist_ok=True)
MODEL_DIR.mkdir(parents=True, exist_ok=True)
STATE_FILE = CHECKPOINT_DIR / "training_state.pkl"
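# The state file holds lightweight metadata (global step, epoch, best metric) used to report resume progress in the UI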
def save_training_state(state):
"""Save training state to persistent storage"""
with open(STATE_FILE, 'wb') as f:
pickle.dump(state, f)
def load_training_state():
"""Load training state from persistent storage"""
if STATE_FILE.exists():
with open(STATE_FILE, 'rb') as f:
return pickle.load(f)
return None
def train_model(epochs, batch_size, learning_rate, resume=False, progress=gr.Progress()):
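    """Fine-tune Salesforce/CoDA-v0-Instruct on the conversational dataset and save the result under MODEL_DIR."""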
try:
# Check for existing training state
if resume:
saved_state = load_training_state()
if saved_state:
progress(0, desc=f"Resuming from step {saved_state.get('step', 0)}...")
progress(0, desc="Initializing training...")
# Check for GPU
device = "cuda" if torch.cuda.is_available() else "cpu"
if device == "cpu":
return "⚠️ Warning: Training on CPU will be very slow. Please upgrade Space to GPU."
progress(0.1, desc="Loading model and tokenizer...")
# Load model and tokenizer
# Note: Using Instruct version which is better for fine-tuning
model_name = "Salesforce/CoDA-v0-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModel.from_pretrained(
model_name,
trust_remote_code=True,
torch_dtype=torch.float16 if device == "cuda" else torch.float32
)
# Move model to device (CoDA doesn't support device_map='auto')
if device == "cuda":
model = model.to(device)
# Set pad token if not exists
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id
progress(0.2, desc="Loading dataset...")
# Load dataset
dataset = load_dataset("baseten-admin/gpt-oss120b-generated-perfectblend", split="train")
# Verify dataset has conversations column
if 'conversations' not in dataset.column_names:
return f"❌ Error: Dataset does not have 'conversations' column. Found columns: {dataset.column_names}"
# Preprocess dataset
progress(0.3, desc="Preprocessing dataset...")
tokenized_dataset = dataset.map(
lambda x: preprocess_conversations(x, tokenizer),
batched=True,
remove_columns=dataset.column_names
)
# Split into train/eval
train_test_split = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']
progress(0.4, desc="Setting up training configuration...")
# Training arguments - use persistent storage
output_dir = str(MODEL_DIR / "coda-finetuned")
training_args = TrainingArguments(
output_dir=output_dir,
            num_train_epochs=int(epochs),              # Gradio sliders deliver floats; cast for safety
            per_device_train_batch_size=int(batch_size),
            per_device_eval_batch_size=int(batch_size),
learning_rate=learning_rate,
warmup_steps=100,
logging_steps=5, # More frequent logging
logging_first_step=True,
eval_strategy="steps",
eval_steps=100,
save_strategy="steps",
save_steps=500,
save_total_limit=2,
fp16=True if device == "cuda" else False,
gradient_accumulation_steps=4,
gradient_checkpointing=False, # CoDA doesn't support gradient checkpointing
optim="adamw_torch",
report_to="none",
load_best_model_at_end=True,
metric_for_best_model="loss",
greater_is_better=False,
)
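        # Checkpoints land in output_dir every 500 steps (persistent when /data is mounted), so interrupted runs can resume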
# Use standard data collator for causal language modeling
# This properly handles CoDA's internal sequence modifications
data_collator = DataCollatorForLanguageModeling(
tokenizer=tokenizer,
mlm=False # Causal LM, not masked LM
)
# Initialize trainer with custom loss
trainer = CoDATrainer(
model=model,
args=training_args,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
data_collator=data_collator,
)
progress(0.5, desc=f"Training for {epochs} epochs...")
        # Live progress reporting: a TrainerCallback surfaces loss / learning rate in the Gradio progress bar
class GradioProgressCallback(TrainerCallback):
def __init__(self, progress_fn):
self.progress_fn = progress_fn
def on_log(self, args, state, control, logs=None, **kwargs):
if logs and state.max_steps > 0:
log_str = f"Step {state.global_step}/{state.max_steps}: "
if 'loss' in logs:
log_str += f"loss={logs['loss']:.4f} "
if 'learning_rate' in logs:
log_str += f"lr={logs['learning_rate']:.2e}"
progress = 0.5 + (0.4 * state.global_step / state.max_steps)
self.progress_fn(progress, desc=log_str)
# Add state saving callback
class StateSavingCallback(TrainerCallback):
def on_save(self, args, state, control, **kwargs):
save_training_state({
'step': state.global_step,
'epoch': state.epoch,
'best_metric': state.best_metric
})
trainer.add_callback(GradioProgressCallback(progress))
trainer.add_callback(StateSavingCallback())
# Resume from checkpoint if exists
resume_from_checkpoint = None
if resume:
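            # HF Trainer names checkpoints "checkpoint-<global_step>"; pick the most recent one to resume from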
checkpoints = list(Path(output_dir).glob("checkpoint-*"))
if checkpoints:
latest_checkpoint = max(checkpoints, key=lambda x: int(x.name.split("-")[1]))
resume_from_checkpoint = str(latest_checkpoint)
progress(0, desc=f"Resuming from {latest_checkpoint.name}...")
trainer.train(resume_from_checkpoint=resume_from_checkpoint)
progress(0.9, desc="Saving model...")
# Save final model
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)
progress(1.0, desc="Training complete!")
return f"βœ… Training completed successfully!\nModel saved to: {output_dir}\n\nFinal training loss: {trainer.state.log_history[-1].get('loss', 'N/A')}"
except Exception as e:
return f"❌ Error during training: {str(e)}"
def upload_to_hub(repo_name, oauth_token: gr.OAuthToken | None, progress=gr.Progress()):
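    """Upload the fine-tuned model folder (saved under MODEL_DIR) to the Hugging Face Hub as the logged-in user."""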
try:
if oauth_token is None:
return "❌ Please login first to upload the model!"
progress(0, desc="Authenticating...")
# Login with OAuth token
login(token=oauth_token.token)
user_info = whoami(oauth_token.token)
username = user_info['name']
progress(0.2, desc="Preparing model for upload...")
# Full repo ID
if not repo_name:
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
repo_name = f"coda-finetuned-{timestamp}"
repo_id = f"{username}/{repo_name}"
progress(0.3, desc=f"Creating repository {repo_id}...")
# Create repo
api = HfApi()
api.create_repo(repo_id=repo_id, exist_ok=True, token=oauth_token.token, repo_type="model")
progress(0.5, desc="Uploading model files...")
# Upload folder
        model_dir = str(MODEL_DIR / "coda-finetuned")  # same location train_model saved to
if not os.path.exists(model_dir):
return "❌ No trained model found! Please train a model first."
api.upload_folder(
folder_path=model_dir,
repo_id=repo_id,
repo_type="model",
token=oauth_token.token
)
progress(1.0, desc="Upload complete!")
return f"βœ… Model successfully uploaded to: https://huggingface.co/{repo_id}"
except Exception as e:
return f"❌ Error during upload: {str(e)}"
# Gradio UI
with gr.Blocks(title="CoDA Fine-tuning Space") as demo:
gr.Markdown("""
    # 🚀 CoDA Model Fine-tuning Space
This Space fine-tunes the **Salesforce/CoDA-v0-Instruct** diffusion model on the **baseten-admin/gpt-oss120b-generated-perfectblend** dataset.
### Steps:
1. **Login** with your Hugging Face account (required for upload)
2. **Configure** training parameters
3. **Train** the model (requires GPU - upgrade Space if needed)
4. **Upload** the trained model to your account
⚠️ **Note**:
- Full fine-tuning requires significant GPU resources. Training may take several hours.
- **Checkpoints are saved every 500 steps** - you can resume if interrupted.
- For Docker: Mount `/data` volume for full persistence across container restarts.
- On Spaces: Checkpoints persist in the same session and across rebuilds with persistent storage.
""")
with gr.Row():
login_button = gr.LoginButton()
gr.Markdown("## Training Configuration")
with gr.Row():
with gr.Column():
epochs = gr.Slider(minimum=1, maximum=10, value=3, step=1, label="Epochs")
batch_size = gr.Slider(minimum=1, maximum=8, value=2, step=1, label="Batch Size per Device")
learning_rate = gr.Slider(minimum=1e-6, maximum=1e-4, value=2e-5, step=1e-6, label="Learning Rate", info="Default: 2e-5")
resume_training = gr.Checkbox(label="Resume from last checkpoint", value=False, info="Check if training was interrupted")
with gr.Row():
        train_button = gr.Button("🎯 Start Training", variant="primary", size="lg")
training_output = gr.Textbox(label="Training Status", lines=5)
gr.Markdown("## Upload Trained Model")
with gr.Row():
repo_name = gr.Textbox(label="Model Repository Name", placeholder="coda-finetuned-v1", info="Leave empty for auto-generated name")
with gr.Row():
        upload_button = gr.Button("📤 Upload to Hugging Face Hub", variant="secondary", size="lg")
upload_output = gr.Textbox(label="Upload Status", lines=3)
gr.Markdown("""
---
### About
**CoDA (Code Diffusion with Autoregressive)** is a 1.7B parameter bidirectional diffusion model for text generation.
This Space performs full fine-tuning on conversational data in ChatML format.
**Dataset**: The training uses the `conversations` column from the dataset, which contains question-answer pairs.
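    For reference, each `conversations` entry is assumed to be a list of role/content messages (this is what
    `preprocess_conversations` expects); the Space flattens it into plain text before tokenization. The row
    below is an illustrative sketch, not an actual sample from the dataset:
    ```python
    # Hypothetical dataset row (illustration only)
    conversation = [
        {"role": "user", "content": "What is a diffusion language model?"},
        {"role": "assistant", "content": "One that generates text by iteratively denoising masked tokens."},
    ]
    # preprocess_conversations flattens it to:
    # <|user|>
    # What is a diffusion language model?
    # <|assistant|>
    # One that generates text by iteratively denoising masked tokens.
    ```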
**Hardware**: GPU (T4 or better) is strongly recommended. CPU training will be extremely slow.
""")
# Event handlers
train_button.click(
fn=train_model,
inputs=[epochs, batch_size, learning_rate, resume_training],
outputs=training_output
)
    upload_button.click(
        fn=upload_to_hub,
        # gr.OAuthToken is injected automatically from the login session via the type hint,
        # so only the repo name component is passed as an input
        inputs=[repo_name],
        outputs=upload_output
    )
if __name__ == "__main__":
demo.launch()