|
|
|
|
|
import os
import traceback

import gradio as gr
from huggingface_hub import login
|
|
|
|
|
|
|
|
# Log in to the Hugging Face Hub only when a token is supplied through the
# HF_TOKEN environment variable; without one, no login is attempted.
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    login(token=hf_token)
|
|
|
|
|
class RealTrainer:
    """Run fine-tuning jobs for Unity-Tinny-Go and collect their progress log.

    ``is_training`` is True while ``real_training`` is executing, and
    ``current_output`` accumulates the progress messages of the latest run.
    """

    def __init__(self):
        # True while real_training() is executing.
        self.is_training = False
        # Newline-terminated progress messages of the current/latest run.
        self.current_output = ""

    @staticmethod
    def _format_examples(examples):
        """Build the training texts for one batch of dataset rows.

        Args:
            examples: mapping with the columns ``instruction``, ``input`` and
                ``output``, each a list (assumed equally long, as in a batched
                ``map`` call). ``None`` entries are treated as empty strings.

        Returns:
            ``{"text": [...]}`` with one prompt per row. The ``Input:`` line is
            only emitted when the row has a non-blank input, and every response
            is prefixed with the Gopu identity sentence.
        """
        identity = "Je suis Gopu, créé par Mauricio Mangituka."
        texts = []
        for instruction, input_text, output in zip(
            examples["instruction"], examples["input"], examples["output"]
        ):
            instruction = instruction or ""
            input_text = input_text or ""
            output = output or ""

            if input_text.strip():
                text = (
                    f"Instruction: {instruction}\n"
                    f"Input: {input_text}\n"
                    f"Response: {identity} {output}"
                )
            else:
                text = f"Instruction: {instruction}\nResponse: {identity} {output}"

            texts.append(text)

        return {"text": texts}

    def real_training(self, epochs, learning_rate, batch_size):
        """Fine-tune Unity-Tinny-Go on gopus-1xs and push the result to the Hub.

        Args:
            epochs: number of training epochs (converted with ``int``).
            learning_rate: optimizer learning rate (converted with ``float``).
            batch_size: per-device train batch size (converted with ``int``).

        Returns:
            On success, the progress log followed by a summary (final loss,
            runtime, model URL, sample count). If a run is already in progress,
            or if any step raises, a message starting with "❌"; in the latter
            case the traceback is appended.
        """
        if self.is_training:
            return "❌ Un entraînement est déjà en cours!"

        self.is_training = True
        self.current_output = ""

        try:
            # Imported inside the try so that a missing package is reported
            # through the error path below.
            from transformers import (
                AutoTokenizer,
                AutoModelForCausalLM,
                TrainingArguments,
                Trainer,
                DataCollatorForLanguageModeling,
            )
            from datasets import load_dataset

            # --- Model ---------------------------------------------------
            self._update_output("🔄 Chargement du modèle Unity-Tinny-Go...")

            tokenizer = AutoTokenizer.from_pretrained("Gopu-poss/unity-tinny-go")
            model = AutoModelForCausalLM.from_pretrained("Gopu-poss/unity-tinny-go")

            # Padding to a fixed length below needs a pad token; reuse EOS
            # when the tokenizer does not define one.
            if tokenizer.pad_token is None:
                tokenizer.pad_token = tokenizer.eos_token

            self._update_output("✅ Modèle chargé!")

            # --- Dataset -------------------------------------------------
            self._update_output("📊 Chargement du dataset gopus-1xs...")

            train_dataset = load_dataset("Gopu-poss/gopus-1xs", split="train")

            self._update_output(f"✅ Dataset chargé: {len(train_dataset)} échantillons")

            # --- Prompt formatting and tokenization ------------------------
            self._update_output("🔤 Préparation des données avec identité Gopu...")

            train_dataset = train_dataset.map(self._format_examples, batched=True)

            def tokenize_function(examples):
                return tokenizer(
                    examples["text"],
                    truncation=True,
                    padding="max_length",
                    max_length=512,
                )

            # Drop every original column so only the tokenizer output remains.
            tokenized_dataset = train_dataset.map(
                tokenize_function,
                batched=True,
                remove_columns=train_dataset.column_names,
            )

            self._update_output("✅ Données préparées avec identité Gopu!")

            # --- Training configuration ----------------------------------
            self._update_output("🎯 Configuration de l'entraînement...")

            training_args = TrainingArguments(
                output_dir="./unity-tinny-go-trained",
                num_train_epochs=int(epochs),
                per_device_train_batch_size=int(batch_size),
                gradient_accumulation_steps=2,
                learning_rate=float(learning_rate),
                warmup_steps=10,
                logging_steps=5,
                save_steps=50,
                push_to_hub=True,
                hub_model_id="Gopu-poss/unity-tinny-go",
                hub_strategy="end",
                report_to="none",
                remove_unused_columns=False,
                dataloader_pin_memory=False,
            )

            data_collator = DataCollatorForLanguageModeling(
                tokenizer=tokenizer,
                mlm=False,
            )

            # Named hf_trainer to avoid confusion with the module-level
            # ``trainer`` instance of this class.
            hf_trainer = Trainer(
                model=model,
                args=training_args,
                train_dataset=tokenized_dataset,
                data_collator=data_collator,
            )

            # --- Train, save, publish ------------------------------------
            self._update_output("🏋️ Début de l'entraînement...")
            self._update_output(f"📊 Échantillons: {len(tokenized_dataset)}")

            train_result = hf_trainer.train()

            self._update_output("💾 Sauvegarde du modèle...")

            hf_trainer.save_model()
            tokenizer.save_pretrained("./unity-tinny-go-trained")

            self._update_output("📤 Push vers Hugging Face...")

            hf_trainer.push_to_hub(
                commit_message=f"AutoTrain: {epochs} epochs, {len(tokenized_dataset)} samples"
            )

            summary = [
                "🎉 ENTRAÎNEMENT TERMINÉ!",
                f"📉 Loss final: {train_result.metrics['train_loss']:.4f}",
                f"⏱️ Temps: {train_result.metrics['train_runtime']:.1f}s",
                "🔗 Modèle: https://huggingface.co/Gopu-poss/unity-tinny-go",
                "👤 Identité: Gopu créé par Mauricio Mangituka",
                f"📊 Échantillons utilisés: {len(tokenized_dataset)}",
            ]
            return self.current_output + "\n" + "\n".join(summary)

        except Exception as e:
            error_msg = f"❌ Erreur pendant l'entraînement: {e}"
            error_details = traceback.format_exc()
            return f"{error_msg}\n\n🔍 Détails:\n{error_details}"

        finally:
            # Always clear the flag, including on KeyboardInterrupt/SystemExit
            # (not caught by ``except Exception``), so a later run is never
            # blocked by a stale "in progress" state.
            self.is_training = False

    def _update_output(self, message):
        """Append ``message`` and a newline to the progress log."""
        self.current_output += message + "\n"
|
|
|
|
|
def chat_with_gopu(message):
    """Generate Gopu's reply to a single user message.

    The locally fine-tuned model in ``./unity-tinny-go-trained`` is tried
    first; if loading it fails, the ``Gopu-poss/unity-tinny-go`` checkpoint is
    used instead. The model is loaded on every call.

    Args:
        message: the user's question.

    Returns:
        The text after the last "Response:" marker of the generated output,
        followed by a note naming which model answered. A blank message yields
        a short warning without loading any model; any failure yields a string
        starting with "❌".
    """
    # Nothing to answer: avoid loading a model for an empty prompt.
    if not message or not message.strip():
        return "⚠️ Veuillez saisir un message."

    try:
        from transformers import AutoTokenizer, AutoModelForCausalLM
        import torch

        try:
            tokenizer = AutoTokenizer.from_pretrained("./unity-tinny-go-trained")
            model = AutoModelForCausalLM.from_pretrained("./unity-tinny-go-trained")
            model_source = "entraîné"
        except Exception:
            # Deliberate best-effort fallback. ``Exception`` rather than a bare
            # ``except`` so KeyboardInterrupt/SystemExit still propagate.
            tokenizer = AutoTokenizer.from_pretrained("Gopu-poss/unity-tinny-go")
            model = AutoModelForCausalLM.from_pretrained("Gopu-poss/unity-tinny-go")
            model_source = "de base"

        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        # Same layout as the training texts, with the identity sentence
        # already placed at the start of the response.
        prompt = f"Instruction: {message}\nResponse: Je suis Gopu, créé par Mauricio Mangituka."

        inputs = tokenizer.encode(prompt, return_tensors="pt", truncation=True, max_length=256)

        with torch.no_grad():
            outputs = model.generate(
                inputs,
                max_length=400,
                temperature=0.7,
                do_sample=True,
                top_p=0.9,
                repetition_penalty=1.1,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )

        response = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Keep only what follows the last "Response:" marker; if the marker is
        # absent from the decoded text, strip the prompt by length instead.
        if "Response:" in response:
            response = response.split("Response:")[-1].strip()
        else:
            response = response[len(prompt):].strip()

        return f"{response}\n\n*(Modèle {model_source})*"

    except Exception as e:
        return f"❌ Erreur: {e}"
|
|
|
|
|
def check_dataset_info():
    """Summarize the gopus-1xs training split for display.

    Returns:
        A newline-joined report with the sample count, the column names and,
        when the split is not empty, the first 60 characters of the first
        row's instruction and output. If anything raises, a message starting
        with "❌" that carries the exception text.
    """
    try:
        from datasets import load_dataset

        # Same split selection as the training code.
        train_dataset = load_dataset("Gopu-poss/gopus-1xs", split="train")

        info = [
            "✅ Dataset gopus-1xs chargé",
            f"📊 Échantillons: {len(train_dataset)}",
            f"📋 Colonnes: {', '.join(train_dataset.column_names)}",
        ]

        # An empty split has no first row to preview.
        if len(train_dataset) > 0:
            sample = train_dataset[0]
            # Fields may be None (the training code guards the same way).
            instruction = sample["instruction"] or ""
            output = sample["output"] or ""
            info.append(f"📝 Exemple instruction: {instruction[:60]}...")
            info.append(f"📝 Exemple réponse: {output[:60]}...")

        return "\n".join(info)
    except Exception as e:
        return f"❌ Erreur chargement dataset: {e}"
|
|
|
|
|
def test_trained_model():
    """Report whether a fine-tuned model can be loaded from disk.

    Tries to load the tokenizer saved in ``./unity-tinny-go-trained``.

    Returns:
        A "✅" message when the tokenizer loads, otherwise an "ℹ️" message
        inviting the user to run a training first.
    """
    try:
        from transformers import AutoTokenizer

        # Only the success of the load matters; the tokenizer is not kept.
        AutoTokenizer.from_pretrained("./unity-tinny-go-trained")
        return "✅ Modèle entraîné disponible!"
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt/SystemExit are not swallowed.
        return "ℹ️ Aucun modèle entraîné. Lancez un entraînement."
|
|
|
|
|
|
|
|
# Single shared trainer instance used by the UI callbacks below.
trainer = RealTrainer()
|
|
|
|
|
def launch_real_training(epochs, learning_rate, batch_size):
    """Run a training on the shared ``trainer`` and return its report text."""
    report = trainer.real_training(
        epochs=epochs,
        learning_rate=learning_rate,
        batch_size=batch_size,
    )
    return report
|
|
|
|
|
def check_training_status():
    """Return a one-line status message based on ``trainer.is_training``."""
    return (
        "🔄 Entraînement en cours..."
        if trainer.is_training
        else "✅ Prêt pour l'entraînement"
    )
|
|
|
|
|
|
|
|
# Gradio interface: a training tab, a chat tab and an information tab.
with gr.Blocks(theme=gr.themes.Soft(), title="Gopu - AutoTrain") as demo:

    gr.Markdown("""
    # 🤖 Gopu - AutoTrain
    **Entraînement sur le dataset gopus-1xs**
    *Créé par Mauricio Mangituka*
    """)

    with gr.Tab("🎯 Entraînement"):
        with gr.Row():
            with gr.Column():
                gr.Markdown("### Configuration")

                epochs = gr.Slider(1, 3, value=1, step=1, label="Epochs", info="Commencez avec 1 epoch")
                learning_rate = gr.Number(2e-5, label="Learning Rate", info="2e-5 recommandé")
                batch_size = gr.Slider(1, 2, value=1, step=1, label="Batch Size", info="1 pour éviter les erreurs mémoire")

                train_btn = gr.Button("🚀 Lancer l'entraînement", variant="primary")
                # Pass the function itself (not its result) so the status is
                # evaluated on page load instead of being frozen at the value
                # it had when the UI was built.
                status_text = gr.Textbox(label="Status", value=check_training_status, interactive=False)

            with gr.Column():
                gr.Markdown("### Logs d'entraînement")
                training_output = gr.Textbox(label="Progress", lines=15, interactive=False, show_copy_button=True)

    with gr.Tab("💬 Chat"):
        gr.Markdown("### Testez Gopu")
        gr.Markdown("**Le modèle répondra: 'Je suis Gopu, créé par Mauricio Mangituka.'**")

        with gr.Row():
            with gr.Column():
                chat_input = gr.Textbox(label="Votre message", placeholder="Posez une question à Gopu...", lines=3)
                chat_btn = gr.Button("💬 Envoyer à Gopu", variant="secondary")

            with gr.Column():
                chat_output = gr.Textbox(label="Réponse de Gopu", interactive=False, lines=6)

        gr.Examples(
            examples=[
                "Qui es-tu?",
                "Qui t'a créé?",
                "Présente-toi",
                "Explique la philosophie de Socrate",
                "Parle-moi de Python",
            ],
            inputs=chat_input,
        )

    with gr.Tab("📊 Informations"):
        gr.Markdown("### Vérifications")

        with gr.Row():
            dataset_btn = gr.Button("🔍 Examiner le dataset")
            test_btn = gr.Button("🧪 Tester modèle entraîné")

        info_output = gr.Textbox(label="Informations système", interactive=False, lines=8)

        gr.Markdown("""
        ### 📋 À propos
        - **Dataset**: gopus-1xs (460 échantillons)
        - **Modèle**: Unity-Tinny-Go
        - **Identité**: Gopu créé par Mauricio Mangituka
        - **Format**: Instruction → Response avec identité

        ### 🎯 Format d'apprentissage
        ```
        Instruction: [question]
        Response: Je suis Gopu, créé par Mauricio Mangituka. [réponse]
        ```
        """)

    # Event wiring. The training click is a three-step chain so the status
    # box reflects the run: mark it as in progress, run the (blocking)
    # training, then re-read the real status once it has returned.
    train_btn.click(
        lambda: "🔄 Entraînement en cours...",
        outputs=status_text,
    ).then(
        launch_real_training,
        [epochs, learning_rate, batch_size],
        training_output,
    ).then(
        check_training_status,
        outputs=status_text,
    )
    chat_btn.click(chat_with_gopu, [chat_input], chat_output)
    dataset_btn.click(check_dataset_info, outputs=info_output)
    test_btn.click(test_trained_model, outputs=info_output)
|
|
|
|
|
# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
    )