import gradio as gr
import torch
from transformers import (
    AutoModel,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    TrainerCallback,
    DataCollatorForLanguageModeling,
)
from datasets import load_dataset
from huggingface_hub import HfApi, login, whoami
import os
from datetime import datetime
import pickle
from pathlib import Path


# Custom Trainer for the CoDA model
class CoDATrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """
        Custom loss computation for the CoDA diffusion model.
        CoDA returns a dict with a 'loss' key instead of a standard ModelOutput.
        """
        outputs = model(**inputs)
        # CoDA returns a dict with a 'loss' key
        if isinstance(outputs, dict) and 'loss' in outputs:
            loss = outputs['loss']
        elif hasattr(outputs, 'loss'):
            loss = outputs.loss
        else:
            # Fallback: compute a standard LM loss from the logits
            labels = inputs.get('labels')
            logits = outputs.get('logits') if isinstance(outputs, dict) else outputs[0]
            loss_fct = torch.nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))

        # Ensure the loss is a scalar
        if loss.dim() > 0:
            loss = loss.mean()

        return (loss, outputs) if return_outputs else loss


def preprocess_conversations(examples, tokenizer):
    """Convert ChatML-style conversations to plain text for training."""
    texts = []
    for conv in examples['conversations']:
        # Expected format: [{"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]
        if not isinstance(conv, list):
            raise ValueError(f"Expected conversation to be a list, got {type(conv)}")
        text = ""
        for message in conv:
            if not isinstance(message, dict):
                raise ValueError(f"Expected message to be a dict, got {type(message)}")
            role = message.get('role', '')
            content = message.get('content', '')
            if role == 'user':
                text += f"<|user|>\n{content}\n"
            elif role == 'assistant':
                text += f"<|assistant|>\n{content}\n"
        texts.append(text)
    return tokenizer(texts, truncation=True, max_length=2048, padding=False)
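
# Illustrative example (not executed): a single conversation such as
#   [{"role": "user", "content": "Hi"}, {"role": "assistant", "content": "Hello!"}]
# is flattened by preprocess_conversations into the text
#   <|user|>
#   Hi
#   <|assistant|>
#   Hello!
# before tokenization. Messages with roles other than 'user'/'assistant'
# are silently skipped by the loop above.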

# Persistent storage paths
CHECKPOINT_DIR = Path("/data/checkpoints") if Path("/data").exists() else Path("./checkpoints")
MODEL_DIR = Path("/data/models") if Path("/data").exists() else Path("./models")
CHECKPOINT_DIR.mkdir(parents=True, exist_ok=True)
MODEL_DIR.mkdir(parents=True, exist_ok=True)
STATE_FILE = CHECKPOINT_DIR / "training_state.pkl"


def save_training_state(state):
    """Save training state to persistent storage."""
    with open(STATE_FILE, 'wb') as f:
        pickle.dump(state, f)


def load_training_state():
    """Load training state from persistent storage."""
    if STATE_FILE.exists():
        with open(STATE_FILE, 'rb') as f:
            return pickle.load(f)
    return None


def train_model(epochs, batch_size, learning_rate, resume=False, progress=gr.Progress()):
    try:
        # Check for an existing training state
        if resume:
            saved_state = load_training_state()
            if saved_state:
                progress(0, desc=f"Resuming from step {saved_state.get('step', 0)}...")

        progress(0, desc="Initializing training...")

        # Check for a GPU
        device = "cuda" if torch.cuda.is_available() else "cpu"
        if device == "cpu":
            return "⚠️ Warning: Training on CPU will be very slow. Please upgrade the Space to GPU."

        progress(0.1, desc="Loading model and tokenizer...")

        # Load model and tokenizer.
        # Note: using the Instruct version, which is better suited for fine-tuning.
        model_name = "Salesforce/CoDA-v0-Instruct"
        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
        model = AutoModel.from_pretrained(
            model_name,
            trust_remote_code=True,
            torch_dtype=torch.float16 if device == "cuda" else torch.float32,
        )

        # Move the model to the device (CoDA doesn't support device_map='auto')
        if device == "cuda":
            model = model.to(device)

        # Set a pad token if one doesn't exist
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
            model.config.pad_token_id = tokenizer.eos_token_id

        progress(0.2, desc="Loading dataset...")

        # Load the dataset
        dataset = load_dataset("baseten-admin/gpt-oss120b-generated-perfectblend", split="train")

        # Verify the dataset has a 'conversations' column
        if 'conversations' not in dataset.column_names:
            return f"❌ Error: Dataset does not have a 'conversations' column. Found columns: {dataset.column_names}"

        # Preprocess the dataset
        progress(0.3, desc="Preprocessing dataset...")
        tokenized_dataset = dataset.map(
            lambda x: preprocess_conversations(x, tokenizer),
            batched=True,
            remove_columns=dataset.column_names,
        )

        # Split into train/eval
        train_test_split = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
        train_dataset = train_test_split['train']
        eval_dataset = train_test_split['test']

        progress(0.4, desc="Setting up training configuration...")

        # Training arguments - write to persistent storage
        output_dir = str(MODEL_DIR / "coda-finetuned")
        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=epochs,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            learning_rate=learning_rate,
            warmup_steps=100,
            logging_steps=5,  # frequent logging for live progress updates
            logging_first_step=True,
            eval_strategy="steps",
            eval_steps=100,
            save_strategy="steps",
            save_steps=500,
            save_total_limit=2,
            fp16=(device == "cuda"),
            gradient_accumulation_steps=4,
            gradient_checkpointing=False,  # CoDA doesn't support gradient checkpointing
            optim="adamw_torch",
            report_to="none",
            load_best_model_at_end=True,
            metric_for_best_model="loss",
            greater_is_better=False,
        )

        # Use the standard data collator for causal language modeling;
        # this properly handles CoDA's internal sequence modifications.
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer,
            mlm=False,  # causal LM, not masked LM
        )

        # Initialize the trainer with the custom loss
        trainer = CoDATrainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            data_collator=data_collator,
        )

        progress(0.5, desc=f"Training for {epochs} epochs...")

        # Stream training logs to the Gradio progress bar
        class GradioProgressCallback(TrainerCallback):
            def __init__(self, progress_fn):
                self.progress_fn = progress_fn

            def on_log(self, args, state, control, logs=None, **kwargs):
                if logs and state.max_steps > 0:
                    log_str = f"Step {state.global_step}/{state.max_steps}: "
                    if 'loss' in logs:
                        log_str += f"loss={logs['loss']:.4f} "
                    if 'learning_rate' in logs:
                        log_str += f"lr={logs['learning_rate']:.2e}"
                    fraction = 0.5 + (0.4 * state.global_step / state.max_steps)
                    self.progress_fn(fraction, desc=log_str)

        # Persist a small resume-state dict alongside each checkpoint
        class StateSavingCallback(TrainerCallback):
            def on_save(self, args, state, control, **kwargs):
                save_training_state({
                    'step': state.global_step,
                    'epoch': state.epoch,
                    'best_metric': state.best_metric,
                })

        trainer.add_callback(GradioProgressCallback(progress))
        trainer.add_callback(StateSavingCallback())

        # Resume from the latest checkpoint if one exists
        resume_from_checkpoint = None
        if resume:
            checkpoints = list(Path(output_dir).glob("checkpoint-*"))
            if checkpoints:
                latest_checkpoint = max(checkpoints, key=lambda x: int(x.name.split("-")[1]))
                resume_from_checkpoint = str(latest_checkpoint)
                progress(0, desc=f"Resuming from {latest_checkpoint.name}...")

        trainer.train(resume_from_checkpoint=resume_from_checkpoint)

        progress(0.9, desc="Saving model...")

        # Save the final model
        trainer.save_model(output_dir)
        tokenizer.save_pretrained(output_dir)

        progress(1.0, desc="Training complete!")
        return (
            f"✅ Training completed successfully!\nModel saved to: {output_dir}\n\n"
            f"Final training loss: {trainer.state.log_history[-1].get('loss', 'N/A')}"
        )

    except Exception as e:
        return f"❌ Error during training: {str(e)}"
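
# Note on resuming: with save_steps=500, the Trainer writes checkpoint-<step>
# directories (e.g. checkpoint-500, checkpoint-1000) under output_dir, and
# StateSavingCallback mirrors a small {'step', 'epoch', 'best_metric'} dict
# into training_state.pkl so the UI can report where a resumed run left off.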
f"lr={logs['learning_rate']:.2e}" progress = 0.5 + (0.4 * state.global_step / state.max_steps) self.progress_fn(progress, desc=log_str) # Add state saving callback class StateSavingCallback(TrainerCallback): def on_save(self, args, state, control, **kwargs): save_training_state({ 'step': state.global_step, 'epoch': state.epoch, 'best_metric': state.best_metric }) trainer.add_callback(GradioProgressCallback(progress)) trainer.add_callback(StateSavingCallback()) # Resume from checkpoint if exists resume_from_checkpoint = None if resume: checkpoints = list(Path(output_dir).glob("checkpoint-*")) if checkpoints: latest_checkpoint = max(checkpoints, key=lambda x: int(x.name.split("-")[1])) resume_from_checkpoint = str(latest_checkpoint) progress(0, desc=f"Resuming from {latest_checkpoint.name}...") trainer.train(resume_from_checkpoint=resume_from_checkpoint) progress(0.9, desc="Saving model...") # Save final model trainer.save_model(output_dir) tokenizer.save_pretrained(output_dir) progress(1.0, desc="Training complete!") return f"✅ Training completed successfully!\nModel saved to: {output_dir}\n\nFinal training loss: {trainer.state.log_history[-1].get('loss', 'N/A')}" except Exception as e: return f"❌ Error during training: {str(e)}" def upload_to_hub(repo_name, oauth_token: gr.OAuthToken | None, progress=gr.Progress()): try: if oauth_token is None: return "❌ Please login first to upload the model!" progress(0, desc="Authenticating...") # Login with OAuth token login(token=oauth_token.token) user_info = whoami(oauth_token.token) username = user_info['name'] progress(0.2, desc="Preparing model for upload...") # Full repo ID if not repo_name: timestamp = datetime.now().strftime("%Y%m%d-%H%M%S") repo_name = f"coda-finetuned-{timestamp}" repo_id = f"{username}/{repo_name}" progress(0.3, desc=f"Creating repository {repo_id}...") # Create repo api = HfApi() api.create_repo(repo_id=repo_id, exist_ok=True, token=oauth_token.token, repo_type="model") progress(0.5, desc="Uploading model files...") # Upload folder model_dir = "./coda-finetuned" if not os.path.exists(model_dir): return "❌ No trained model found! Please train a model first." api.upload_folder( folder_path=model_dir, repo_id=repo_id, repo_type="model", token=oauth_token.token ) progress(1.0, desc="Upload complete!") return f"✅ Model successfully uploaded to: https://huggingface.co/{repo_id}" except Exception as e: return f"❌ Error during upload: {str(e)}" # Gradio UI with gr.Blocks(title="CoDA Fine-tuning Space") as demo: gr.Markdown(""" # 🚀 CoDA Model Fine-tuning Space This Space fine-tunes the **Salesforce/CoDA-v0-Instruct** diffusion model on the **baseten-admin/gpt-oss120b-generated-perfectblend** dataset. ### Steps: 1. **Login** with your Hugging Face account (required for upload) 2. **Configure** training parameters 3. **Train** the model (requires GPU - upgrade Space if needed) 4. **Upload** the trained model to your account ⚠️ **Note**: - Full fine-tuning requires significant GPU resources. Training may take several hours. - **Checkpoints are saved every 500 steps** - you can resume if interrupted. - For Docker: Mount `/data` volume for full persistence across container restarts. - On Spaces: Checkpoints persist in the same session and across rebuilds with persistent storage. 
""") with gr.Row(): login_button = gr.LoginButton() gr.Markdown("## Training Configuration") with gr.Row(): with gr.Column(): epochs = gr.Slider(minimum=1, maximum=10, value=3, step=1, label="Epochs") batch_size = gr.Slider(minimum=1, maximum=8, value=2, step=1, label="Batch Size per Device") learning_rate = gr.Slider(minimum=1e-6, maximum=1e-4, value=2e-5, step=1e-6, label="Learning Rate", info="Default: 2e-5") resume_training = gr.Checkbox(label="Resume from last checkpoint", value=False, info="Check if training was interrupted") with gr.Row(): train_button = gr.Button("🎯 Start Training", variant="primary", size="lg") training_output = gr.Textbox(label="Training Status", lines=5) gr.Markdown("## Upload Trained Model") with gr.Row(): repo_name = gr.Textbox(label="Model Repository Name", placeholder="coda-finetuned-v1", info="Leave empty for auto-generated name") with gr.Row(): upload_button = gr.Button("📤 Upload to Hugging Face Hub", variant="secondary", size="lg") upload_output = gr.Textbox(label="Upload Status", lines=3) gr.Markdown(""" --- ### About **CoDA (Code Diffusion with Autoregressive)** is a 1.7B parameter bidirectional diffusion model for text generation. This Space performs full fine-tuning on conversational data in ChatML format. **Dataset**: The training uses the `conversations` column from the dataset, which contains question-answer pairs. **Hardware**: GPU (T4 or better) is strongly recommended. CPU training will be extremely slow. """) # Event handlers train_button.click( fn=train_model, inputs=[epochs, batch_size, learning_rate, resume_training], outputs=training_output ) upload_button.click( fn=upload_to_hub, inputs=[repo_name, login_button], outputs=upload_output ) if __name__ == "__main__": demo.launch()