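"""Gradio Space for full fine-tuning of Salesforce/CoDA-v0-Instruct on the
baseten-admin/gpt-oss120b-generated-perfectblend dataset, with checkpoint
resume support and upload to the Hugging Face Hub."""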
import gradio as gr
import torch
from transformers import (
    AutoModel,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    TrainerCallback,
    DataCollatorForLanguageModeling,
)
from datasets import load_dataset
from huggingface_hub import HfApi, login, whoami
import os
import pickle
from datetime import datetime
from pathlib import Path


class CoDATrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """
        Custom loss computation for the CoDA diffusion model.
        CoDA returns a dict with a 'loss' key instead of a scalar.
        """
        outputs = model(**inputs)

        if isinstance(outputs, dict) and 'loss' in outputs:
            loss = outputs['loss']
        elif hasattr(outputs, 'loss'):
            loss = outputs.loss
        else:
            # Fallback: compute a plain cross-entropy loss over the logits.
            labels = inputs.get('labels')
            logits = outputs.get('logits') if isinstance(outputs, dict) else outputs[0]
            loss_fct = torch.nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))

        # Some implementations return a per-token loss tensor; reduce to a scalar.
        if loss.dim() > 0:
            loss = loss.mean()

        return (loss, outputs) if return_outputs else loss


def preprocess_conversations(examples, tokenizer):
    """Convert ChatML-style conversations to plain text for training."""
    texts = []
    for conv in examples['conversations']:
        if not isinstance(conv, list):
            raise ValueError(f"Expected conversation to be a list, got {type(conv)}")

        text = ""
        for message in conv:
            if not isinstance(message, dict):
                raise ValueError(f"Expected message to be a dict, got {type(message)}")

            role = message.get('role', '')
            content = message.get('content', '')
            if role == 'user':
                text += f"<|user|>\n{content}\n"
            elif role == 'assistant':
                text += f"<|assistant|>\n{content}\n"
        texts.append(text)

    return tokenizer(texts, truncation=True, max_length=2048, padding=False)


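# Illustrative example of the record shape preprocess_conversations expects
# (assumed from the code above, not verified against the actual dataset):
#   {"conversations": [[{"role": "user", "content": "Hi"},
#                       {"role": "assistant", "content": "Hello!"}]]}
# which renders as "<|user|>\nHi\n<|assistant|>\nHello!\n" before tokenization.
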
# Prefer the persistent /data volume (Docker mount or Spaces persistent storage) when available.
CHECKPOINT_DIR = Path("/data/checkpoints") if Path("/data").exists() else Path("./checkpoints")
MODEL_DIR = Path("/data/models") if Path("/data").exists() else Path("./models")
CHECKPOINT_DIR.mkdir(parents=True, exist_ok=True)
MODEL_DIR.mkdir(parents=True, exist_ok=True)

STATE_FILE = CHECKPOINT_DIR / "training_state.pkl"

def save_training_state(state):
    """Save training state to persistent storage."""
    with open(STATE_FILE, 'wb') as f:
        pickle.dump(state, f)


def load_training_state():
    """Load training state from persistent storage."""
    if STATE_FILE.exists():
        with open(STATE_FILE, 'rb') as f:
            return pickle.load(f)
    return None

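# The state dict written by StateSavingCallback below looks like, e.g.:
#   {"step": 500, "epoch": 1.25, "best_metric": 2.31}
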
def train_model(epochs, batch_size, learning_rate, resume=False, progress=gr.Progress()):
    try:
        # If resuming, surface the last saved step in the progress bar.
        if resume:
            saved_state = load_training_state()
            if saved_state:
                progress(0, desc=f"Resuming from step {saved_state.get('step', 0)}...")

        progress(0, desc="Initializing training...")

        device = "cuda" if torch.cuda.is_available() else "cpu"
        if device == "cpu":
            return "⚠️ Warning: Training on CPU will be very slow. Please upgrade the Space to GPU."

        progress(0.1, desc="Loading model and tokenizer...")

        model_name = "Salesforce/CoDA-v0-Instruct"
        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
        model = AutoModel.from_pretrained(
            model_name,
            trust_remote_code=True,
            torch_dtype=torch.float16 if device == "cuda" else torch.float32,
        )

        if device == "cuda":
            model = model.to(device)

        # The tokenizer may not define a pad token; fall back to EOS.
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
            model.config.pad_token_id = tokenizer.eos_token_id

        progress(0.2, desc="Loading dataset...")

        dataset = load_dataset("baseten-admin/gpt-oss120b-generated-perfectblend", split="train")

        if 'conversations' not in dataset.column_names:
            return f"❌ Error: Dataset does not have a 'conversations' column. Found columns: {dataset.column_names}"

        progress(0.3, desc="Preprocessing dataset...")

        tokenized_dataset = dataset.map(
            lambda x: preprocess_conversations(x, tokenizer),
            batched=True,
            remove_columns=dataset.column_names,
        )

        # Hold out 10% of the data for evaluation.
        train_test_split = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
        train_dataset = train_test_split['train']
        eval_dataset = train_test_split['test']

        progress(0.4, desc="Setting up training configuration...")

        output_dir = str(MODEL_DIR / "coda-finetuned")
        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=epochs,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            learning_rate=learning_rate,
            warmup_steps=100,
            logging_steps=5,
            logging_first_step=True,
            eval_strategy="steps",
            eval_steps=100,
            save_strategy="steps",
            save_steps=500,
            save_total_limit=2,
            fp16=(device == "cuda"),
            gradient_accumulation_steps=4,
            gradient_checkpointing=False,
            optim="adamw_torch",
            report_to="none",
            load_best_model_at_end=True,
            metric_for_best_model="loss",
            greater_is_better=False,
        )

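        # With gradient_accumulation_steps=4, the effective batch size is
        # 4 * per_device_train_batch_size per device (e.g. batch_size=2 -> 8).
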
        # mlm=False tells the collator to build causal-LM style batches, where
        # labels are a padded copy of input_ids.
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer,
            mlm=False,
        )

        trainer = CoDATrainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            data_collator=data_collator,
        )

        progress(0.5, desc=f"Training for {epochs} epochs...")

        class GradioProgressCallback(TrainerCallback):
            """Report step, loss, and learning rate to the Gradio progress bar."""

            def __init__(self, progress_fn):
                self.progress_fn = progress_fn

            def on_log(self, args, state, control, logs=None, **kwargs):
                if logs and state.max_steps > 0:
                    log_str = f"Step {state.global_step}/{state.max_steps}: "
                    if 'loss' in logs:
                        log_str += f"loss={logs['loss']:.4f} "
                    if 'learning_rate' in logs:
                        log_str += f"lr={logs['learning_rate']:.2e}"
                    # Map training progress onto the 0.5-0.9 slice of the bar.
                    progress_value = 0.5 + (0.4 * state.global_step / state.max_steps)
                    self.progress_fn(progress_value, desc=log_str)

        class StateSavingCallback(TrainerCallback):
            """Persist lightweight resume metadata alongside each checkpoint."""

            def on_save(self, args, state, control, **kwargs):
                save_training_state({
                    'step': state.global_step,
                    'epoch': state.epoch,
                    'best_metric': state.best_metric,
                })

        trainer.add_callback(GradioProgressCallback(progress))
        trainer.add_callback(StateSavingCallback())

        # Checkpoints are written as {output_dir}/checkpoint-<step>; resume from the latest one.
        resume_from_checkpoint = None
        if resume:
            checkpoints = list(Path(output_dir).glob("checkpoint-*"))
            if checkpoints:
                latest_checkpoint = max(checkpoints, key=lambda x: int(x.name.split("-")[1]))
                resume_from_checkpoint = str(latest_checkpoint)
                progress(0, desc=f"Resuming from {latest_checkpoint.name}...")

        trainer.train(resume_from_checkpoint=resume_from_checkpoint)

        progress(0.9, desc="Saving model...")

        trainer.save_model(output_dir)
        tokenizer.save_pretrained(output_dir)

        progress(1.0, desc="Training complete!")

        final_loss = trainer.state.log_history[-1].get('loss', 'N/A') if trainer.state.log_history else 'N/A'
        return (
            f"✅ Training completed successfully!\n"
            f"Model saved to: {output_dir}\n\n"
            f"Final training loss: {final_loss}"
        )

    except Exception as e:
        return f"❌ Error during training: {str(e)}"


def upload_to_hub(repo_name, oauth_token: gr.OAuthToken | None, progress=gr.Progress()):
    try:
        if oauth_token is None:
            return "❌ Please log in first to upload the model!"

        progress(0, desc="Authenticating...")

        login(token=oauth_token.token)
        user_info = whoami(oauth_token.token)
        username = user_info['name']

        progress(0.2, desc="Preparing model for upload...")

        # Auto-generate a repository name if none was provided.
        if not repo_name:
            timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
            repo_name = f"coda-finetuned-{timestamp}"

        repo_id = f"{username}/{repo_name}"

        progress(0.3, desc=f"Creating repository {repo_id}...")

        api = HfApi()
        api.create_repo(repo_id=repo_id, exist_ok=True, token=oauth_token.token, repo_type="model")

        progress(0.5, desc="Uploading model files...")

        # Upload from the same directory train_model saved to (under MODEL_DIR, not the CWD).
        model_dir = str(MODEL_DIR / "coda-finetuned")
        if not os.path.exists(model_dir):
            return "❌ No trained model found! Please train a model first."

        api.upload_folder(
            folder_path=model_dir,
            repo_id=repo_id,
            repo_type="model",
            token=oauth_token.token,
        )

        progress(1.0, desc="Upload complete!")

        return f"✅ Model successfully uploaded to: https://huggingface.co/{repo_id}"

    except Exception as e:
        return f"❌ Error during upload: {str(e)}"


with gr.Blocks(title="CoDA Fine-tuning Space") as demo:
    gr.Markdown("""
    # 🚀 CoDA Model Fine-tuning Space

    This Space fine-tunes the **Salesforce/CoDA-v0-Instruct** diffusion model on the **baseten-admin/gpt-oss120b-generated-perfectblend** dataset.

    ### Steps:
    1. **Login** with your Hugging Face account (required for upload)
    2. **Configure** training parameters
    3. **Train** the model (requires GPU - upgrade the Space if needed)
    4. **Upload** the trained model to your account

    ⚠️ **Note**:
    - Full fine-tuning requires significant GPU resources. Training may take several hours.
    - **Checkpoints are saved every 500 steps**, so you can resume if training is interrupted.
    - For Docker: mount a `/data` volume for full persistence across container restarts.
    - On Spaces: checkpoints persist within a session, and across rebuilds with persistent storage.
    """)

    with gr.Row():
        login_button = gr.LoginButton()

    gr.Markdown("## Training Configuration")

    with gr.Row():
        with gr.Column():
            epochs = gr.Slider(minimum=1, maximum=10, value=3, step=1, label="Epochs")
            batch_size = gr.Slider(minimum=1, maximum=8, value=2, step=1, label="Batch Size per Device")
            learning_rate = gr.Slider(minimum=1e-6, maximum=1e-4, value=2e-5, step=1e-6, label="Learning Rate", info="Default: 2e-5")
            resume_training = gr.Checkbox(label="Resume from last checkpoint", value=False, info="Check if training was interrupted")

    with gr.Row():
        train_button = gr.Button("🎯 Start Training", variant="primary", size="lg")

    training_output = gr.Textbox(label="Training Status", lines=5)

    gr.Markdown("## Upload Trained Model")

    with gr.Row():
        repo_name = gr.Textbox(label="Model Repository Name", placeholder="coda-finetuned-v1", info="Leave empty for an auto-generated name")

    with gr.Row():
        upload_button = gr.Button("🤗 Upload to Hugging Face Hub", variant="secondary", size="lg")

    upload_output = gr.Textbox(label="Upload Status", lines=3)

    gr.Markdown("""
    ---
    ### About

    **CoDA** is a 1.7B-parameter bidirectional diffusion model for text generation.
    This Space performs full fine-tuning on conversational data in ChatML format.

    **Dataset**: Training uses the `conversations` column from the dataset, which contains question-answer pairs.

    **Hardware**: A GPU (T4 or better) is strongly recommended; CPU training will be extremely slow.
    """)

    train_button.click(
        fn=train_model,
        inputs=[epochs, batch_size, learning_rate, resume_training],
        outputs=training_output,
    )

    upload_button.click(
        fn=upload_to_hub,
        # The gr.OAuthToken parameter is injected automatically by Gradio after
        # login, so only repo_name is wired up as an input here.
        inputs=[repo_name],
        outputs=upload_output,
    )

if __name__ == "__main__":
    demo.launch()
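# A minimal sketch of reloading the fine-tuned weights afterwards (the path
# assumes the default output_dir used above; without /data it is ./models):
#   from transformers import AutoModel, AutoTokenizer
#   model = AutoModel.from_pretrained("/data/models/coda-finetuned", trust_remote_code=True)
#   tokenizer = AutoTokenizer.from_pretrained("/data/models/coda-finetuned")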