arnomatic committed
Commit 8cd0952 · verified · 1 parent: 0e27037

Upload 8 files

Files changed (8)
  1. inference.py +241 -0
  2. moe_config.py +119 -0
  3. moe_layers.py +323 -0
  4. moe_model.py +459 -0
  5. moe_trainer.py +168 -0
  6. requirements.txt +96 -0
  7. sample_generation_callback.py +148 -0
  8. train_moe_v8_clean.py +429 -0
inference.py ADDED
@@ -0,0 +1,241 @@
"""
Inference script for the trained MoE model.
Automatically loads the latest checkpoint and tests different sampling strategies.
"""

import os
import sys
import torch
from transformers import AutoTokenizer
from moe_config import MoEGPTConfig
from moe_model import MoEGPTForCausalLM

# Force UTF-8 encoding for Windows console
if sys.platform == 'win32':
    sys.stdout.reconfigure(encoding='utf-8')


def find_latest_checkpoint(checkpoint_dir="./moe_checkpoints_v8_clean"):
    """
    Finds the latest checkpoint automatically (v8 OPUS edition).

    Returns:
        str: Path to the latest checkpoint, or None
    """
    if not os.path.exists(checkpoint_dir):
        return None

    checkpoints = [
        os.path.join(checkpoint_dir, d)
        for d in os.listdir(checkpoint_dir)
        if d.startswith("checkpoint-")
    ]

    if not checkpoints:
        return None

    # Find the latest checkpoint (by creation time)
    latest = max(checkpoints, key=os.path.getctime)

    # Extract the step number
    step = latest.split("checkpoint-")[-1]
    print(f"\n🔍 Latest checkpoint found: step {step}")

    return latest


def load_model(model_path=None, device="cuda"):
    """
    Loads the trained MoE model.
    If model_path is None, the latest checkpoint is loaded automatically.

    Args:
        model_path: Path to the saved model (None = auto-find)
        device: Device for inference (cuda/cpu)

    Returns:
        model: Loaded model
        config: Model config
    """
    # Auto-find the latest checkpoint
    if model_path is None:
        model_path = find_latest_checkpoint()
        if model_path is None:
            # Fallback: try the final model (v8)
            model_path = "./moe_final_v8_clean"
            if not os.path.exists(model_path):
                raise ValueError("No checkpoint found! Train a model first.")

    print(f"\n📥 Loading model from: {model_path}")

    config = MoEGPTConfig.from_pretrained(model_path)
    model = MoEGPTForCausalLM.from_pretrained(model_path)

    # Move to device
    if device == "cuda" and torch.cuda.is_available():
        model = model.cuda()
        print("✅ Model loaded on GPU")
    else:
        model = model.cpu()
        print("✅ Model loaded on CPU")

    model.eval()

    total_params = sum(p.numel() for p in model.parameters())
    print(f"   📊 Parameters: {total_params:,} ({total_params/1e6:.1f}M)")
    print(f"   🧠 Experts: {config.total_experts}")
    print(f"   ⚡ Active params: {config.active_parameters_ratio:.1%}")

    return model, config


def generate_text(
    model,
    tokenizer,
    prompt,
    max_new_tokens=400,
    temperature=0.8,
    top_k=50,
    top_p=0.95,
    repetition_penalty=1.0,
    device="cuda",
):
    """
    Generates text with the MoE model.

    Args:
        model: MoE model
        tokenizer: Tokenizer
        prompt: Input prompt (string)
        max_new_tokens: Maximum number of new tokens
        temperature: Sampling temperature
        top_k: Top-k sampling
        top_p: Nucleus sampling
        repetition_penalty: Penalty for repetitions
        device: Device

    Returns:
        generated_text: Generated text
    """
    # Tokenize the prompt
    input_ids = tokenizer.encode(prompt, return_tensors="pt")

    if device == "cuda":
        input_ids = input_ids.cuda()

    # Generate
    with torch.no_grad():
        output_ids = model.generate(
            input_ids,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode
    generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    return generated_text


def test_sampling_strategies(model, tokenizer, prompts, device="cuda"):
    """
    Tests different sampling strategies.

    Args:
        model: MoE model
        tokenizer: Tokenizer
        prompts: List of test prompts
        device: Device
    """
    # Strategies that worked well in extensive tests
    strategies = {
        "Standard (temp=0.7, rep=1.2, top_k=50, top_p=0.7)": {
            "temperature": 0.7,
            "top_k": 50,
            "top_p": 0.7,
            "repetition_penalty": 1.2,
        },
        "Focused (temp=0.7, rep=1.4, top_k=20, top_p=0.7)": {
            "temperature": 0.7,
            "top_k": 20,
            "top_p": 0.7,
            "repetition_penalty": 1.4,
        },
    }

    print("\n" + "=" * 80)
    print("🧪 TESTING SAMPLING STRATEGIES")
    print("=" * 80)

    for prompt in prompts:
        print(f"\n{'='*80}")
        print(f"PROMPT: '{prompt}'")
        print(f"{'='*80}\n")

        for strategy_name, params in strategies.items():
            print(f"\n🎯 Strategy: {strategy_name}")
            print("-" * 80)

            try:
                generated = generate_text(
                    model=model,
                    tokenizer=tokenizer,
                    prompt=prompt,
                    max_new_tokens=400,  # 400 tokens
                    device=device,
                    **params
                )

                print(f"{generated}")
                print()

            except Exception as e:
                print(f"❌ Error: {str(e)}\n")

    print("\n" + "=" * 80)
    print("💡 RECOMMENDATION")
    print("=" * 80)
    print("""
    """)


def main():
    # Device
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"\n🖥️ Device: {device}")

    # Load the model (latest checkpoint automatically)
    model, config = load_model(model_path=None, device=device)

    # Load the tokenizer
    print("\n📚 Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
    tokenizer.pad_token = tokenizer.eos_token
    print("✅ Llama 3.2 tokenizer loaded")
    print(f"   - Vocab size: {tokenizer.vocab_size:,}")
    print(f"   - EOS token: {tokenizer.eos_token}")

    # ==================== SAMPLING STRATEGY TESTS ====================

    # Test prompts (deliberately diverse)
    test_prompts = [
        "Gestern bin ich ",               # narrative
        "Der Mond ",                      # poetic
        "Im Labor ",                      # scientific
        "Hast du auch das Gefühl, dass",  # personal/forum style
        "Die Zeit",
        "Was ist die Definition von Philosophie?"
    ]

    # Try the different sampling strategies
    test_sampling_strategies(model, tokenizer, test_prompts, device)


if __name__ == "__main__":
    main()
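Besides running the script directly, the helpers above can also be imported. A minimal sketch (the checkpoint path is only an illustrative example, not a path that ships with this commit):

from inference import load_model, generate_text
from transformers import AutoTokenizer

# Load an explicit checkpoint instead of the auto-discovered latest one (example path)
model, config = load_model("./moe_checkpoints_v8_clean/checkpoint-10000", device="cuda")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
tokenizer.pad_token = tokenizer.eos_token

# One generation with the "Standard" settings from test_sampling_strategies
text = generate_text(model, tokenizer, "Der Mond ", max_new_tokens=100,
                     temperature=0.7, top_k=50, top_p=0.7, repetition_penalty=1.2)
print(text)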
moe_config.py ADDED
@@ -0,0 +1,119 @@
"""
HuggingFace-compatible MoE configuration.
Based on the nanoMoE blog post.
"""

from transformers import PretrainedConfig


class MoEGPTConfig(PretrainedConfig):
    """
    Configuration for the MoE-based GPT model.

    Args:
        vocab_size (int): Vocabulary size
        n_positions (int): Maximum sequence length
        n_embd (int): Embedding dimensionality (d in the blog post)
        n_layer (int): Number of transformer blocks
        n_head (int): Number of attention heads
        n_experts (int): Number of experts per MoE layer
        n_experts_active (int): Number of active experts (top-k)
        moe_layer_frequency (int): Every n-th layer becomes an MoE layer (P in the blog post)
        capacity_factor (float): Expert capacity factor for training
        eval_capacity_factor (float): Expert capacity factor for evaluation
        use_noisy_gating (bool): Whether to use noisy top-k gating
        aux_loss_alpha (float): Scale for the load-balancing loss
        router_z_loss_alpha (float): Scale for the router z-loss
        bias (bool): Whether to use bias in linear layers
        dropout (float): Dropout probability
        activation_function (str): Activation function (gelu, relu, swiglu)
        initializer_range (float): Standard deviation for weight initialization
        layer_norm_epsilon (float): Epsilon for layer normalization
    """

    model_type = "moe_gpt"

    def __init__(
        self,
        vocab_size=128256,        # Llama 3.2 tokenizer (incl. special tokens)
        n_positions=2048,         # default 2048 for RoPE
        n_embd=768,
        n_layer=12,
        n_head=12,
        n_experts=8,
        n_experts_active=2,
        moe_layer_frequency=2,
        capacity_factor=1.25,
        eval_capacity_factor=2.0,
        use_noisy_gating=True,
        aux_loss_alpha=0.01,
        router_z_loss_alpha=0.001,
        bias=False,
        dropout=0.1,
        activation_function="gelu",
        initializer_range=0.1,
        layer_norm_epsilon=1e-5,
        use_cache=True,
        rope_theta=10000.0,       # RoPE base theta
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.vocab_size = vocab_size
        self.n_positions = n_positions
        self.n_embd = n_embd
        self.n_layer = n_layer
        self.n_head = n_head
        self.n_experts = n_experts
        self.n_experts_active = n_experts_active
        self.moe_layer_frequency = moe_layer_frequency
        self.capacity_factor = capacity_factor
        self.eval_capacity_factor = eval_capacity_factor
        self.use_noisy_gating = use_noisy_gating
        self.aux_loss_alpha = aux_loss_alpha
        self.router_z_loss_alpha = router_z_loss_alpha
        self.bias = bias
        self.dropout = dropout
        self.activation_function = activation_function
        self.initializer_range = initializer_range
        self.layer_norm_epsilon = layer_norm_epsilon
        self.use_cache = use_cache
        self.rope_theta = rope_theta

        # Standard HuggingFace attributes (needed for .generate())
        self.num_hidden_layers = n_layer
        self.hidden_size = n_embd
        self.num_attention_heads = n_head
        self.max_position_embeddings = n_positions

        # Validation
        assert n_embd % n_head == 0, "n_embd must be divisible by n_head"
        assert n_experts_active <= n_experts, "n_experts_active must not exceed n_experts"
        assert moe_layer_frequency >= 1, "moe_layer_frequency must be at least 1"

    @property
    def head_dim(self):
        """Dimension per attention head"""
        return self.n_embd // self.n_head

    @property
    def total_experts(self):
        """Total number of experts in the model"""
        num_moe_layers = sum(1 for i in range(self.n_layer) if i % self.moe_layer_frequency == 0)
        return num_moe_layers * self.n_experts

    @property
    def active_parameters_ratio(self):
        """Approximate ratio of active parameters"""
        num_moe_layers = sum(1 for i in range(self.n_layer) if i % self.moe_layer_frequency == 0)
        num_dense_layers = self.n_layer - num_moe_layers

        # Rough estimate (ignores attention)
        dense_params = num_dense_layers * (8 * self.n_embd**2)  # FFN params
        moe_total_params = num_moe_layers * self.n_experts * (8 * self.n_embd**2)
        moe_active_params = num_moe_layers * self.n_experts_active * (8 * self.n_embd**2)

        total = dense_params + moe_total_params
        active = dense_params + moe_active_params

        return active / total if total > 0 else 1.0
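The total_experts and active_parameters_ratio properties above feed the statistics printed by inference.py. A minimal sketch of what the default constructor values work out to (the numbers follow directly from the formulas above):

from moe_config import MoEGPTConfig

cfg = MoEGPTConfig()          # defaults: 12 layers, MoE every 2nd layer, 8 experts, top-2
print(cfg.total_experts)      # 6 MoE layers * 8 experts = 48
print(cfg.head_dim)           # 768 // 12 = 64
print(f"{cfg.active_parameters_ratio:.1%}")  # FFN-only estimate: (6 + 6*2) / (6 + 6*8) = 33.3%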
moe_layers.py ADDED
@@ -0,0 +1,323 @@
"""
MoE layer components.
Based on the nanoMoE blog post and HuggingFace best practices.
"""

import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Tuple, Optional


class MoERouter(nn.Module):
    """
    Noisy top-k router for MoE.
    Routes tokens to the top-k experts based on learned probabilities.
    """

    def __init__(
        self,
        d_model: int,
        n_experts: int,
        n_experts_active: int,
        use_noisy_gating: bool = True,
        capacity_factor: float = 1.25,
    ):
        super().__init__()

        self.d_model = d_model
        self.n_experts = n_experts
        self.n_experts_active = n_experts_active
        self.use_noisy_gating = use_noisy_gating
        self.capacity_factor = capacity_factor

        # Linear projections for the router (no bias, see Shazeer et al. 2017)
        self.w_gate = nn.Linear(d_model, n_experts, bias=False)
        self.w_noise = nn.Linear(d_model, n_experts, bias=False) if use_noisy_gating else None

    def forward(
        self, x: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Args:
            x: Input tensor [batch_size, seq_len, d_model]

        Returns:
            expert_weights: Weights for each expert [batch_size * seq_len, n_experts, capacity]
            expert_mask: Mask of the used experts [batch_size * seq_len, n_experts, capacity]
            expert_batches: Batches for each expert [n_experts, capacity, d_model]
            router_logits: Router logits for the z-loss [batch_size, seq_len, n_experts]
        """
        batch_size, seq_len, d_model = x.shape
        num_tokens = batch_size * seq_len

        # The router ALWAYS runs in FP32 for numerical stability!
        device_type = "cuda" if x.is_cuda else "cpu"
        with torch.amp.autocast(device_type=device_type, enabled=False):
            x_fp32 = x.float()

            # Compute the router logits
            router_logits = self.w_gate(x_fp32)  # [B, T, n_experts]

            # Noisy top-k gating (optional)
            if self.use_noisy_gating and self.training:
                noise = F.softplus(self.w_noise(x_fp32))
                noise = noise * torch.randn_like(noise)
                router_logits = router_logits + noise

            # Select the top-k experts
            top_k_logits, top_k_indices = router_logits.topk(
                self.n_experts_active, dim=-1
            )  # [B, T, K]

            # Softmax over all experts (not only the top-k)
            router_probs = torch.full_like(router_logits, float("-inf"))
            router_probs.scatter_(-1, top_k_indices, top_k_logits)
            router_probs = F.softmax(router_probs, dim=-1)  # [B, T, n_experts]

            # Compute the expert capacity
            capacity = self._compute_capacity(num_tokens)

            # Multi-hot mask of the selected experts
            expert_mask = F.one_hot(
                top_k_indices, num_classes=self.n_experts
            )  # [B, T, K, n_experts]
            expert_mask = expert_mask.view(num_tokens, self.n_experts_active, self.n_experts)
            expert_mask = expert_mask.permute(1, 0, 2)  # [K, num_tokens, n_experts]

            # Position of each token within the expert batch (cumsum prioritizes top-1 first)
            expert_rank = expert_mask.reshape(
                self.n_experts_active * num_tokens, self.n_experts
            )
            expert_rank = torch.cumsum(expert_rank, dim=0) - 1
            expert_rank = expert_rank.reshape(
                self.n_experts_active, num_tokens, self.n_experts
            )

            # Mask out tokens beyond the capacity
            expert_mask = expert_mask * torch.lt(expert_rank, capacity)

            # Position within the expert batch
            expert_rank = torch.sum(expert_mask * expert_rank, dim=-1)  # [K, num_tokens]

            # Multiply the probabilities by the mask
            router_probs = router_probs.view(num_tokens, self.n_experts)[
                None, :
            ]  # [1, num_tokens, n_experts]
            expert_weights = expert_mask * router_probs  # [K, num_tokens, n_experts]

            # One-hot for the position in the expert batch
            expert_rank_one_hot = F.one_hot(
                expert_rank, num_classes=capacity
            )  # [K, num_tokens, capacity]

            # Weights at the expert batch positions
            expert_weights = torch.sum(
                expert_weights.unsqueeze(3) * expert_rank_one_hot.unsqueeze(2), dim=0
            )  # [num_tokens, n_experts, capacity]
            expert_mask = expert_weights.bool()

            # Build the expert batches
            x_flat = x.view(num_tokens, d_model)
            expert_batches = (
                expert_mask.permute(1, 2, 0).type_as(x) @ x_flat
            )  # [n_experts, capacity, d_model]

        return expert_weights, expert_mask, expert_batches, router_logits

    def _compute_capacity(self, num_tokens: int) -> int:
        """Computes the expert capacity"""
        capacity = math.floor(
            self.n_experts_active * self.capacity_factor * num_tokens / self.n_experts
        )
        capacity += capacity % 2  # even number for better hardware utilization
        return max(int(capacity), 2)  # minimum of 2 for small batches


class ExpertMLP(nn.Module):
    """
    A batch of MLP experts.
    All experts share the same architecture but have independent weights.
    """

    def __init__(
        self,
        d_model: int,
        n_experts: int,
        bias: bool = False,
        dropout: float = 0.1,
        activation: str = "gelu",
    ):
        super().__init__()

        self.d_model = d_model
        self.n_experts = n_experts
        self.bias = bias

        # 4x hidden dimension (standard for GPT)
        hidden_dim = 4 * d_model

        # Weights for all experts (batched matmul)
        self.w_fc = nn.Parameter(torch.empty(n_experts, d_model, hidden_dim))
        self.w_proj = nn.Parameter(torch.empty(n_experts, hidden_dim, d_model))

        if bias:
            self.fc_bias = nn.Parameter(torch.empty(n_experts, 1, hidden_dim))
            self.proj_bias = nn.Parameter(torch.empty(n_experts, 1, d_model))
        else:
            self.register_parameter("fc_bias", None)
            self.register_parameter("proj_bias", None)

        # Activation function
        if activation == "gelu":
            self.activation = nn.GELU()
        elif activation == "relu":
            self.activation = nn.ReLU()
        elif activation == "swiglu":
            # SwiGLU needs extra weights
            self.w_gate = nn.Parameter(torch.empty(n_experts, d_model, hidden_dim))
            self.activation = nn.SiLU()
        else:
            raise ValueError(f"Unknown activation: {activation}")

        self.dropout = nn.Dropout(dropout)
        self.activation_type = activation

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: [n_experts, capacity, d_model]

        Returns:
            output: [n_experts, capacity, d_model]
        """
        # First linear layer via batched matmul
        h = torch.bmm(x, self.w_fc)
        if self.bias:
            h = h + self.fc_bias

        # Activation
        if self.activation_type == "swiglu":
            # SwiGLU: silu(x @ W_gate) * (x @ W_fc)
            gate = torch.bmm(x, self.w_gate)
            h = self.activation(gate) * h
        else:
            h = self.activation(h)

        # Second linear layer
        output = torch.bmm(h, self.w_proj)
        if self.bias:
            output = output + self.proj_bias

        output = self.dropout(output)

        return output


class MoELayer(nn.Module):
    """
    Full mixture-of-experts layer.
    Combines the router and the experts.
    """

    def __init__(
        self,
        d_model: int,
        n_experts: int = 8,
        n_experts_active: int = 2,
        use_noisy_gating: bool = True,
        capacity_factor: float = 1.25,
        bias: bool = False,
        dropout: float = 0.1,
        activation: str = "gelu",
    ):
        super().__init__()

        self.router = MoERouter(
            d_model=d_model,
            n_experts=n_experts,
            n_experts_active=n_experts_active,
            use_noisy_gating=use_noisy_gating,
            capacity_factor=capacity_factor,
        )

        self.experts = ExpertMLP(
            d_model=d_model,
            n_experts=n_experts,
            bias=bias,
            dropout=dropout,
            activation=activation,
        )

        self.n_experts = n_experts
        self.n_experts_active = n_experts_active

    def forward(
        self, x: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Args:
            x: [batch_size, seq_len, d_model]

        Returns:
            output: [batch_size, seq_len, d_model]
            load_balance_loss: Scalar load-balancing loss
            router_z_loss: Scalar router z-loss
        """
        batch_size, seq_len, d_model = x.shape
        num_tokens = batch_size * seq_len

        # Routing
        expert_weights, expert_mask, expert_batches, router_logits = self.router(x)

        # Expert forward pass
        expert_outputs = self.experts(expert_batches)  # [n_experts, capacity, d_model]

        # Combine the outputs (weighted average)
        expert_weights_flat = expert_weights.view(num_tokens, -1)  # [num_tokens, n_experts * capacity]
        expert_outputs_flat = expert_outputs.view(-1, d_model)     # [n_experts * capacity, d_model]
        output = expert_weights_flat @ expert_outputs_flat         # [num_tokens, d_model]
        output = output.view(batch_size, seq_len, d_model)

        # Compute the auxiliary losses
        load_balance_loss = self._compute_load_balance_loss(router_logits, expert_mask)
        router_z_loss = self._compute_router_z_loss(router_logits)

        return output, load_balance_loss, router_z_loss

    def _compute_load_balance_loss(
        self, router_logits: torch.Tensor, expert_mask: torch.Tensor
    ) -> torch.Tensor:
        """
        Load-balancing loss (Switch Transformer, Fedus et al. 2022).
        Encourages a uniform distribution of tokens across experts.
        """
        batch_size, seq_len, n_experts = router_logits.shape
        num_tokens = batch_size * seq_len

        # Probability per expert
        router_probs = F.softmax(router_logits, dim=-1)       # [B, T, n_experts]
        prob_per_expert = torch.mean(router_probs, dim=(0, 1))  # [n_experts]

        # Token ratio per expert
        with torch.no_grad():
            # expert_mask is [num_tokens, n_experts, capacity]
            tokens_per_expert = torch.sum(expert_mask.float(), dim=(0, 2))  # [n_experts]
            tokens_per_expert = tokens_per_expert / (num_tokens * self.n_experts_active)

        # Dot product (scaled by n_experts)
        loss = self.n_experts * torch.sum(prob_per_expert * tokens_per_expert)

        return loss

    def _compute_router_z_loss(self, router_logits: torch.Tensor) -> torch.Tensor:
        """
        Router z-loss (ST-MoE, Zoph et al. 2022).
        Penalizes large router logits for numerical stability.
        """
        # Squared logsumexp over the experts
        z_loss = torch.logsumexp(router_logits, dim=-1) ** 2.0  # [B, T]
        z_loss = torch.mean(z_loss)

        return z_loss
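For reference, a quick sanity check of the _compute_capacity formula above with the default settings (a sketch; the batch shape is only an illustrative assumption):

import math

# Assumed batch: batch_size=8, seq_len=2048 -> 16384 tokens, top-2 routing over 8 experts, factor 1.25
num_tokens, k, n_experts, factor = 16384, 2, 8, 1.25
capacity = math.floor(k * factor * num_tokens / n_experts)  # 5120
capacity += capacity % 2                                     # stays 5120 (already even)
print(max(int(capacity), 2))                                 # 5120 token slots per expert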
moe_model.py ADDED
@@ -0,0 +1,459 @@
"""
MoE GPT model - HuggingFace compatible.
Based on nanoMoE and the blog post.
"""

import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Optional, Tuple, Union
from dataclasses import dataclass

from transformers import PreTrainedModel
from transformers.generation import GenerationMixin
from transformers.modeling_outputs import CausalLMOutputWithPast

from moe_config import MoEGPTConfig
from moe_layers import MoELayer


@dataclass
class MoECausalLMOutput(CausalLMOutputWithPast):
    """
    Extended output class with the MoE-specific losses
    """

    aux_loss: Optional[torch.FloatTensor] = None
    router_z_loss: Optional[torch.FloatTensor] = None


def apply_rotary_emb(x: torch.Tensor, freqs_cos: torch.Tensor, freqs_sin: torch.Tensor) -> torch.Tensor:
    """
    Applies Rotary Position Embeddings (RoPE) to the input tensor.

    Args:
        x: Input tensor of shape [B, H, T, D]
        freqs_cos: Cosine frequencies of shape [T, D//2]
        freqs_sin: Sine frequencies of shape [T, D//2]

    Returns:
        Tensor with RoPE applied
    """
    # Reshape x to separate real and imaginary parts for the rotation
    # x: [B, H, T, D] -> [B, H, T, D//2, 2]
    x_complex = x.float().reshape(*x.shape[:-1], -1, 2)

    # Apply the rotation: (a + bi) * (cos + i*sin) = (a*cos - b*sin) + i(a*sin + b*cos)
    x_rot_real = x_complex[..., 0] * freqs_cos - x_complex[..., 1] * freqs_sin
    x_rot_imag = x_complex[..., 0] * freqs_sin + x_complex[..., 1] * freqs_cos

    # Stack back together and flatten
    x_out = torch.stack([x_rot_real, x_rot_imag], dim=-1)
    x_out = x_out.flatten(-2)

    return x_out.type_as(x)


def precompute_freqs_rope(dim: int, max_seq_len: int, theta: float = 10000.0) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Precomputes the RoPE frequencies.

    Args:
        dim: Head dimension
        max_seq_len: Maximum sequence length
        theta: RoPE theta parameter (base for the frequency calculation)

    Returns:
        Tuple of (freqs_cos, freqs_sin) tensors of shape [max_seq_len, dim//2]
    """
    # Compute the frequencies for each dimension pair
    freqs = 1.0 / (theta ** (torch.arange(0, dim, 2).float() / dim))

    # Create the position indices
    t = torch.arange(max_seq_len, dtype=torch.float32)

    # Compute the outer product: [max_seq_len, dim//2]
    freqs = torch.outer(t, freqs)

    # Compute cos and sin
    freqs_cos = torch.cos(freqs)
    freqs_sin = torch.sin(freqs)

    return freqs_cos, freqs_sin


class CausalSelfAttention(nn.Module):
    """
    Multi-head causal self-attention with Rotary Position Embeddings (RoPE).
    Uses PyTorch SDPA for optimized performance.
    """

    def __init__(self, config: MoEGPTConfig):
        super().__init__()
        assert config.n_embd % config.n_head == 0

        # Key, query, value for all heads at once
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
        # Output projection
        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)

        # Regularization
        self.attn_dropout = nn.Dropout(config.dropout)
        self.resid_dropout = nn.Dropout(config.dropout)

        self.n_head = config.n_head
        self.n_embd = config.n_embd
        self.dropout = config.dropout
        self.head_dim = config.n_embd // config.n_head

        # Precompute the RoPE frequencies
        freqs_cos, freqs_sin = precompute_freqs_rope(
            dim=self.head_dim,
            max_seq_len=config.n_positions,
            theta=config.rope_theta
        )
        self.register_buffer("freqs_cos", freqs_cos, persistent=False)
        self.register_buffer("freqs_sin", freqs_sin, persistent=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        B, T, C = x.size()  # batch, sequence length, embedding dim

        # Compute Q, K, V
        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)

        # Reshape for multi-head attention
        k = k.view(B, T, self.n_head, self.head_dim).transpose(1, 2)  # [B, H, T, d]
        q = q.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
        v = v.view(B, T, self.n_head, self.head_dim).transpose(1, 2)

        # Apply RoPE to Q and K
        q = apply_rotary_emb(q, self.freqs_cos[:T], self.freqs_sin[:T])
        k = apply_rotary_emb(k, self.freqs_cos[:T], self.freqs_sin[:T])

        # Use PyTorch SDPA (scaled dot product attention) - optimized!
        # SDPA handles causal masking and dropout, and is memory efficient
        y = F.scaled_dot_product_attention(
            q, k, v,
            attn_mask=None,  # causal mask handled by is_causal
            dropout_p=self.dropout if self.training else 0.0,
            is_causal=True   # efficient causal masking
        )  # [B, H, T, d]

        # Reshape back
        y = y.transpose(1, 2).contiguous().view(B, T, C)

        # Output projection
        y = self.resid_dropout(self.c_proj(y))

        return y


class MLP(nn.Module):
    """
    Standard feed-forward network (for the non-MoE layers)
    """

    def __init__(self, config: MoEGPTConfig):
        super().__init__()
        self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias)
        self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias)
        self.dropout = nn.Dropout(config.dropout)

        if config.activation_function == "gelu":
            self.activation = nn.GELU()
        elif config.activation_function == "relu":
            self.activation = nn.ReLU()
        else:
            raise ValueError(f"Unknown activation: {config.activation_function}")

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.c_fc(x)
        x = self.activation(x)
        x = self.c_proj(x)
        x = self.dropout(x)
        return x


class TransformerBlock(nn.Module):
    """
    Standard transformer block (attention + MLP)
    """

    def __init__(self, config: MoEGPTConfig):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
        self.mlp = MLP(config)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x


class MoETransformerBlock(nn.Module):
    """
    MoE transformer block (attention + MoE layer)
    """

    def __init__(self, config: MoEGPTConfig):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)

        # The capacity factor depends on training vs. evaluation
        self.moe = MoELayer(
            d_model=config.n_embd,
            n_experts=config.n_experts,
            n_experts_active=config.n_experts_active,
            use_noisy_gating=config.use_noisy_gating,
            capacity_factor=config.capacity_factor,
            bias=config.bias,
            dropout=config.dropout,
            activation=config.activation_function,
        )

    def forward(
        self, x: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        # Attention
        x = x + self.attn(self.ln_1(x))

        # MoE layer
        moe_out, aux_loss, router_z_loss = self.moe(self.ln_2(x))
        x = x + moe_out

        return x, aux_loss, router_z_loss


class MoEGPTPreTrainedModel(PreTrainedModel):
    """
    Base class for MoE GPT built on the HuggingFace PreTrainedModel
    """

    config_class = MoEGPTConfig
    base_model_prefix = "transformer"
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """
        Weight initialization following ST-MoE (Zoph et al. 2022):
        truncated normal with a reduced std for MoE stability.
        """
        if isinstance(module, nn.Linear):
            # Fan-in initialization
            fan_in = module.weight.shape[-1]
            std = (self.config.initializer_range / fan_in) ** 0.5

            torch.nn.init.trunc_normal_(
                module.weight,
                mean=0.0,
                std=std,
                a=-2 * std,
                b=2 * std,
            )
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)

        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)

        elif isinstance(module, nn.Parameter):
            # For the expert parameters
            fan_in = module.shape[-1] if len(module.shape) >= 2 else module.shape[0]
            std = (self.config.initializer_range / fan_in) ** 0.5

            torch.nn.init.trunc_normal_(
                module,
                mean=0.0,
                std=std,
                a=-2 * std,
                b=2 * std,
            )


class MoEGPTModel(MoEGPTPreTrainedModel):
    """
    MoE GPT model (without the LM head)
    """

    def __init__(self, config: MoEGPTConfig):
        super().__init__(config)
        self.config = config
        self.gradient_checkpointing = False  # for HF gradient checkpointing support

        # Token embeddings only (RoPE handles positions)
        self.wte = nn.Embedding(config.vocab_size, config.n_embd)
        self.drop = nn.Dropout(config.dropout)

        # Transformer blocks (mixed: standard + MoE)
        self.h = nn.ModuleList()
        for i in range(config.n_layer):
            if i % config.moe_layer_frequency == 0:
                # MoE block
                self.h.append(MoETransformerBlock(config))
            else:
                # Standard block
                self.h.append(TransformerBlock(config))

        # Final layer norm
        self.ln_f = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)

        # Initialize weights
        self.post_init()

    def forward(
        self,
        input_ids: torch.LongTensor,
        attention_mask: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        device = input_ids.device
        b, t = input_ids.size()

        assert t <= self.config.n_positions, f"Sequence too long: {t} > {self.config.n_positions}"

        # Token embeddings only (RoPE is applied in the attention layers)
        tok_emb = self.wte(input_ids)  # [B, T, n_embd]
        x = self.drop(tok_emb)

        # Collect the auxiliary losses
        total_aux_loss = 0.0
        total_router_z_loss = 0.0

        # Run through all blocks
        for block in self.h:
            if isinstance(block, MoETransformerBlock):
                if self.gradient_checkpointing and self.training:
                    # Gradient checkpointing for MoE blocks
                    def create_custom_forward(module):
                        def custom_forward(*inputs):
                            return module(*inputs)
                        return custom_forward

                    x, aux_loss, router_z_loss = torch.utils.checkpoint.checkpoint(
                        create_custom_forward(block),
                        x,
                        use_reentrant=False
                    )
                else:
                    x, aux_loss, router_z_loss = block(x)
                total_aux_loss = total_aux_loss + aux_loss
                total_router_z_loss = total_router_z_loss + router_z_loss
            else:
                if self.gradient_checkpointing and self.training:
                    x = torch.utils.checkpoint.checkpoint(
                        block,
                        x,
                        use_reentrant=False
                    )
                else:
                    x = block(x)

        x = self.ln_f(x)

        return x, total_aux_loss, total_router_z_loss


class MoEGPTForCausalLM(MoEGPTPreTrainedModel, GenerationMixin):
    """
    MoE GPT with a language modeling head (for pretraining).
    Inherits from GenerationMixin for .generate() support.
    """

    # Tell HuggingFace which weights are shared
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config: MoEGPTConfig):
        super().__init__(config)
        self.transformer = MoEGPTModel(config)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        # Weight tying (the LM head shares weights with the token embedding)
        self.lm_head.weight = self.transformer.wte.weight

        # Initialize weights
        self.post_init()

    def get_output_embeddings(self):
        """For HuggingFace weight tying"""
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        """For HuggingFace weight tying"""
        self.lm_head = new_embeddings

    def get_input_embeddings(self):
        """For HuggingFace weight tying"""
        return self.transformer.wte

    def set_input_embeddings(self, new_embeddings):
        """For HuggingFace weight tying"""
        self.transformer.wte = new_embeddings

    def tie_weights(self):
        """
        Tie the lm_head weights to the input embeddings (weight tying).
        Called after loading a checkpoint to fix a missing lm_head.weight.
        """
        self.lm_head.weight = self.transformer.wte.weight

    def forward(
        self,
        input_ids: torch.LongTensor,
        attention_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.LongTensor] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, MoECausalLMOutput]:
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Forward through the transformer
        hidden_states, aux_loss, router_z_loss = self.transformer(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )

        # LM head
        if labels is not None:
            # Training: logits for every position
            logits = self.lm_head(hidden_states)
        else:
            # Inference: only the last position
            logits = self.lm_head(hidden_states[:, [-1], :])

        # Compute the loss
        loss = None
        if labels is not None:
            # Shift for next-token prediction
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()

            # Cross-entropy loss
            loss_fct = nn.CrossEntropyLoss()
            lm_loss = loss_fct(
                shift_logits.view(-1, shift_logits.size(-1)),
                shift_labels.view(-1),
            )

            # Add the auxiliary losses
            loss = lm_loss
            if self.training:
                loss = loss + self.config.aux_loss_alpha * aux_loss
                loss = loss + self.config.router_z_loss_alpha * router_z_loss

        if not return_dict:
            output = (logits,)
            return ((loss,) + output) if loss is not None else output

        return MoECausalLMOutput(
            loss=loss,
            logits=logits,
            aux_loss=aux_loss if self.training else None,
            router_z_loss=router_z_loss if self.training else None,
        )

    def prepare_inputs_for_generation(self, input_ids, **kwargs):
        """For the HuggingFace generate() function"""
        return {"input_ids": input_ids}
moe_trainer.py ADDED
@@ -0,0 +1,168 @@
"""
Custom MoE trainer with extended logging
"""

import torch
from typing import Dict, Optional, Any
from transformers import Trainer
from transformers.trainer_callback import TrainerCallback


class MoETrainer(Trainer):
    """
    Extended trainer for MoE models with dedicated logging for:
    - auxiliary losses (load balancing, router z-loss)
    - expert utilization
    - capacity factor adjustment
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """
        Overrides compute_loss to account for the MoE-specific losses.
        They are already included in model.forward(), but we log them separately.
        """
        # Labels for next-token prediction
        if "labels" not in inputs:
            inputs["labels"] = inputs["input_ids"].clone()

        # Forward pass
        outputs = model(**inputs)

        # The loss is already the total loss (LM + aux losses)
        loss = outputs.loss

        # Log the auxiliary losses (during training)
        if self.state.global_step % self.args.logging_steps == 0:
            if hasattr(outputs, "aux_loss") and outputs.aux_loss is not None:
                self.log({"train/aux_loss": outputs.aux_loss.item()})

            if hasattr(outputs, "router_z_loss") and outputs.router_z_loss is not None:
                self.log({"train/router_z_loss": outputs.router_z_loss.item()})

            # Breakdown of the total loss
            if hasattr(outputs, "aux_loss") and outputs.aux_loss is not None:
                lm_loss = (
                    loss.item()
                    - self.model.config.aux_loss_alpha * outputs.aux_loss.item()
                    - self.model.config.router_z_loss_alpha * outputs.router_z_loss.item()
                )
                self.log({"train/lm_loss": lm_loss})

        return (loss, outputs) if return_outputs else loss

    def prediction_step(self, model, inputs, prediction_loss_only, ignore_keys=None):
        """
        Overrides prediction_step so that eval_loss is returned correctly
        """
        # Ensure labels are present
        if "labels" not in inputs:
            inputs["labels"] = inputs["input_ids"].clone()

        # Call the standard prediction_step
        loss, logits, labels = super().prediction_step(
            model, inputs, prediction_loss_only, ignore_keys
        )

        return loss, logits, labels

    def log(self, logs: Dict[str, float], start_time=None) -> None:
        """
        Extends the standard logging with MoE-specific metrics
        """
        # GPU memory tracking
        if torch.cuda.is_available():
            logs["gpu_memory_allocated_gb"] = (
                torch.cuda.memory_allocated() / 1024**3
            )
            logs["gpu_memory_reserved_gb"] = (
                torch.cuda.memory_reserved() / 1024**3
            )

        if start_time is not None:
            super().log(logs, start_time)
        else:
            super().log(logs)


class MoEEvalCallback(TrainerCallback):
    """
    Callback for extended MoE-specific evaluation
    """

    def on_evaluate(self, args, state, control, model, metrics=None, **kwargs):
        """
        After each evaluation, log additional MoE metrics
        """
        if metrics is not None and model is not None:
            # Model statistics
            total_params = sum(p.numel() for p in model.parameters())
            trainable_params = sum(
                p.numel() for p in model.parameters() if p.requires_grad
            )

            metrics["model/total_params_M"] = total_params / 1e6
            metrics["model/trainable_params_M"] = trainable_params / 1e6

            # MoE specific
            if hasattr(model.config, "n_experts"):
                metrics["model/total_experts"] = model.config.total_experts
                metrics["model/active_params_ratio"] = (
                    model.config.active_parameters_ratio
                )


class DataCollatorForLanguageModeling:
    """
    Simple data collator for causal language modeling.
    Assumes the data is already tokenized.
    """

    def __init__(self, pad_token_id: int = 0):
        self.pad_token_id = pad_token_id

    def __call__(self, examples):
        """
        Args:
            examples: List of dicts with 'input_ids' and 'attention_mask'

        Returns:
            Batch dict with padded input_ids and attention_mask
        """
        # Maximum length in this batch
        max_length = max(len(ex["input_ids"]) for ex in examples)

        input_ids = []
        attention_mask = []

        for ex in examples:
            seq_len = len(ex["input_ids"])
            padding_length = max_length - seq_len

            # Right padding
            padded_input_ids = ex["input_ids"] + [self.pad_token_id] * padding_length
            padded_attention_mask = ex["attention_mask"] + [0] * padding_length

            input_ids.append(padded_input_ids)
            attention_mask.append(padded_attention_mask)

        # As tensors
        batch = {
            "input_ids": torch.tensor(input_ids, dtype=torch.long),
            "attention_mask": torch.tensor(attention_mask, dtype=torch.long),
        }

        return batch


def compute_metrics(eval_preds):
    """
    Compute perplexity for evaluation
    """
    predictions, labels = eval_preds

    # For language modeling, the predictions are the logits
    # and the labels are the actual token IDs.
    # We only compute perplexity here (the loss is logged automatically).

    # This function is optional - the loss is already computed by the Trainer
    return {}
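A small usage sketch of the collator above, with hypothetical toy sequences (in the real pipeline pad_token_id would be the tokenizer's EOS id):

from moe_trainer import DataCollatorForLanguageModeling

collator = DataCollatorForLanguageModeling(pad_token_id=0)
batch = collator([
    {"input_ids": [5, 6, 7], "attention_mask": [1, 1, 1]},
    {"input_ids": [8, 9], "attention_mask": [1, 1]},
])
print(batch["input_ids"].shape)    # torch.Size([2, 3]) - padded to the longest sequence
print(batch["attention_mask"][1])  # tensor([1, 1, 0]) - padding position masked out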
requirements.txt ADDED
@@ -0,0 +1,96 @@
# German MoE GPT v6 - Requirements
# Environment: nano_moe (Conda)
# Python: 3.10+
# CUDA: 12.4

# ============================================================================
# CRITICAL: PyTorch Installation
# ============================================================================
# IMPORTANT: Install PyTorch FIRST with CUDA support!
# DO NOT use pip for PyTorch on Windows - use conda instead:
#
#   conda install pytorch torchvision torchaudio pytorch-cuda=12.4 -c pytorch -c nvidia
#
# Or from the PyTorch website (pip with CUDA):
#   pip install torch==2.6.0+cu124 torchvision==0.21.0+cu124 torchaudio==2.6.0+cu124 --index-url https://download.pytorch.org/whl/cu124
#
# Currently installed versions:
#   torch==2.6.0+cu124
#   torchvision==0.21.0+cu124
#   torchaudio==2.6.0+cu124
# ============================================================================

# Core ML libraries (install AFTER PyTorch!)
transformers==4.56.1
datasets==4.0.0
accelerate==1.10.1

# Training & monitoring
tensorboard==2.20.0
tensorboard-data-server==0.7.2

# Tokenization
tokenizers==0.22.0
tiktoken==0.11.0

# Data processing
numpy==1.26.4
pandas==2.3.2
pyarrow==21.0.0

# Utilities
tqdm==4.67.1
safetensors==0.6.2
huggingface-hub==0.34.4
regex==2025.9.1
fsspec==2025.3.0
dill==0.3.8
multiprocess==0.70.16
xxhash==3.5.0

# Performance (Windows CUDA)
triton-windows==3.2.0.post19  # Optimized kernels for CUDA

# Configuration & logging
PyYAML==6.0.2
python-dotenv==1.0.1
requests==2.32.5
httpx[http2]==0.27.0

# Optional: Weights & Biases (uncomment if needed)
# wandb>=0.15.0

# ============================================================================
# Installation Instructions
# ============================================================================
#
# STEP 1: Create the conda environment
#   conda create -n nano_moe python=3.10
#   conda activate nano_moe
#
# STEP 2: Install PyTorch with CUDA 12.4
#   conda install pytorch torchvision torchaudio pytorch-cuda=12.4 -c pytorch -c nvidia
#
# STEP 3: Install the remaining dependencies
#   pip install -r requirements.txt --no-deps
#   (--no-deps prevents pip from reinstalling PyTorch!)
#
# STEP 4: Verify the installation
#   python -c "import torch; print(f'CUDA available: {torch.cuda.is_available()}')"
#
# ============================================================================
# Notes
# ============================================================================
#
# - DO NOT install PyTorch via pip requirements.txt on Windows!
#   It will install the CPU version or the wrong CUDA version.
#
# - triton-windows only works on Windows with CUDA.
#   On Linux, use: triton>=2.0.0
#
# - datasets 4.0.0 has breaking changes from 2.x.
#   Use load_from_disk() / save_to_disk() for the eval dataset.
#
# - transformers 4.56.1 is compatible with our custom MoE implementation.
#
# ============================================================================
sample_generation_callback.py ADDED
@@ -0,0 +1,148 @@
"""
Sample generation callback for MoE training.
Generates texts during training to monitor progress.
"""

import torch
from transformers import TrainerCallback, AutoTokenizer
from typing import Optional
import os


class SampleGenerationCallback(TrainerCallback):
    """
    Generates sample texts every N steps during training
    """

    def __init__(
        self,
        tokenizer,
        prompts: list[str],
        generate_every_n_steps: int = 100,
        max_new_tokens: int = 50,
        temperature: float = 0.8,
        top_k: int = 50,
        top_p: float = 0.95,
        output_dir: str = "./samples",
    ):
        """
        Args:
            tokenizer: HuggingFace tokenizer
            prompts: List of prompts for generation
            generate_every_n_steps: Generate every N steps
            max_new_tokens: Maximum number of new tokens
            temperature: Sampling temperature
            top_k: Top-k sampling
            top_p: Nucleus sampling
            output_dir: Directory for the sample outputs
        """
        self.tokenizer = tokenizer
        self.prompts = prompts
        self.generate_every_n_steps = generate_every_n_steps
        self.max_new_tokens = max_new_tokens
        self.temperature = temperature
        self.top_k = top_k
        self.top_p = top_p
        self.output_dir = output_dir

        # Create the output directory
        os.makedirs(output_dir, exist_ok=True)

        # Samples log file
        self.log_file = os.path.join(output_dir, "generation_log.txt")

        # Write the header
        with open(self.log_file, "w", encoding="utf-8") as f:
            f.write("=" * 80 + "\n")
            f.write("MoE Training - Sample Generation Log\n")
            f.write("=" * 80 + "\n\n")

    def on_step_end(self, args, state, control, model=None, **kwargs):
        """
        Called after every training step
        """
        # Only generate every N steps
        if state.global_step % self.generate_every_n_steps != 0:
            return

        # Skip if there is no model
        if model is None:
            return

        print(f"\n{'='*80}")
        print(f"🎨 GENERATING SAMPLES @ STEP {state.global_step}")
        print(f"{'='*80}\n")

        # Put the model into eval mode
        model.eval()

        samples = []
        samples.append(f"\n{'='*80}\n")
        samples.append(f"Step: {state.global_step}\n")
        samples.append(f"{'='*80}\n\n")

        with torch.no_grad():
            for i, prompt in enumerate(self.prompts, 1):
                print(f"[{i}/{len(self.prompts)}] Prompt: '{prompt}'")

                # Tokenize
                input_ids = self.tokenizer.encode(prompt, return_tensors="pt")
                input_ids = input_ids.to(model.device)

                try:
                    # Generate
                    # NOTE: repetition_penalty is REQUIRED for longer generations!
                    # For 300 tokens, 1.3-1.5 works better than 1.2
                    output_ids = model.generate(
                        input_ids,
                        max_new_tokens=self.max_new_tokens,
                        temperature=self.temperature,
                        top_k=self.top_k,
                        top_p=self.top_p,
                        repetition_penalty=1.4,  # ← higher for 300 tokens!
                        do_sample=True,
                        pad_token_id=self.tokenizer.eos_token_id,
                    )

                    # Decode
                    generated_text = self.tokenizer.decode(
                        output_ids[0], skip_special_tokens=True
                    )

                    # Print
                    print(f"  → {generated_text}\n")

                    # Save to the log
                    samples.append(f"Prompt {i}: {prompt}\n")
                    samples.append(f"Output: {generated_text}\n\n")

                except Exception as e:
                    error_msg = f"  ❌ Error: {str(e)}\n"
                    print(error_msg)
                    samples.append(f"Prompt {i}: {prompt}\n")
                    samples.append(f"Error: {str(e)}\n\n")

        # Write the samples to the file
        with open(self.log_file, "a", encoding="utf-8") as f:
            f.writelines(samples)

        print(f"{'='*80}\n")

        # Put the model back into training mode
        model.train()


def get_german_sample_prompts():
    """
    Returns a list of German sample prompts
    """
    return [
        "Die Künstliche Intelligenz",
        "Im finsteren Wald",
        "In der Zukunft werden wir",
        "Machine Learning bedeutet",
        "Das Wetter heute ist",
        "Ein wichtiger Aspekt der",
        "Die Geschichte von",
        "Wissenschaftler haben herausgefunden",
    ]
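A sketch of how the callback is meant to be wired into training (the step interval is illustrative; the trainer line is a hint only, the actual wiring lives in train_moe_v8_clean.py):

from transformers import AutoTokenizer
from sample_generation_callback import SampleGenerationCallback, get_german_sample_prompts

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
callback = SampleGenerationCallback(
    tokenizer=tokenizer,
    prompts=get_german_sample_prompts(),
    generate_every_n_steps=500,   # illustrative value
    max_new_tokens=50,
)
# later: MoETrainer(..., callbacks=[callback, MoEEvalCallback()])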
train_moe_v8_clean.py ADDED
@@ -0,0 +1,429 @@
1
+ """
2
+ German MoE GPT v8 - CLEAN DATA + OPUS EDITION
3
+ Training mit Wikipedia + OpenSubtitles + Belletristik
4
+
5
+ Datasets (v8 - CLEAN + DIALOGUES! 🎉):
6
+ - Clean Wikipedia (local) - 11 GB (64%)
7
+ - OpenSubtitles OPUS (local) - 4.2 GB (24%)
8
+ - Belletristik (arnomatic/merged_all) - 2.2 GB (12%)
9
+
10
+ Total: ~17.4 GB of 100% CLEAN German text!
11
+ NO spam, NO ads, NO SEO garbage! ✅
12
+ PLUS natural dialogues from movie subtitles! 🎬
13
+ """
14
+
15
+ import os
16
+ import sys
17
+
18
+ # Disable HF transfer (can cause issues on Windows)
19
+ os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "0"
20
+
21
+ # Force UTF-8 encoding for Windows console
22
+ if sys.platform == 'win32':
23
+ sys.stdout.reconfigure(encoding='utf-8')
24
+
25
+ import torch
26
+ from datasets import load_dataset, interleave_datasets
27
+ from transformers import TrainingArguments, set_seed, AutoTokenizer
28
+
29
+ from moe_config import MoEGPTConfig
30
+ from moe_model import MoEGPTForCausalLM
31
+ from moe_trainer import MoETrainer, MoEEvalCallback, DataCollatorForLanguageModeling
32
+ from sample_generation_callback import SampleGenerationCallback, get_german_sample_prompts
33
+
34
+
35
+ def load_clean_datasets(tokenizer, max_length=2048, seed=42, resume_step=0):
36
+ """
37
+ Lädt 3 clean datasets (v8 - INTERLEAVED!):
38
+ - Wikipedia (WITH EOS) - 64%
39
+ - OpenSubtitles OPUS (NO EOS) - 24%
40
+ - Belletristik (NO EOS) - 12%
41
+
42
+ Args:
43
+ resume_step: If > 0, adjusts seed to continue from checkpoint
44
+ """
45
+ # Adjust seed based on resume step (für reproducibility beim Resume)
46
+ effective_seed = seed + (resume_step // 1000)
47
+ print(f"📚 Lade CLEAN Datasets (v8 - OPUS Edition)...")
48
+ if resume_step > 0:
49
+ print(f" 🔄 Resume from step {resume_step} → Effective seed: {effective_seed}\n")
50
+ else:
51
+ print()
52
+
53
+ # ========================================================================
54
+ # 1. WIKIPEDIA (WITH EOS between articles)
55
+ # ========================================================================
56
+ print("1️⃣ Wikipedia (WITH EOS)...")
57
+ try:
58
+ wiki_ds = load_dataset(
59
+ "jonas-is-coding/german-wikipedia-articles",
60
+ split="train",
61
+ streaming=True
62
+ )
63
+ print(" ✅ Dataset loaded (streaming mode)")
64
+
65
+ # Shuffle
66
+ print(" 🔀 Shuffling with buffer_size=10,000...")
67
+ wiki_ds = wiki_ds.shuffle(seed=effective_seed, buffer_size=10000)
68
+ print(" ✅ Shuffle applied")
69
+
70
+ except Exception as e:
71
+ print(f" ❌ Wikipedia Error: {e}")
72
+ raise ValueError(f"Failed to load Wikipedia: {e}")
73
+
74
+ # ========================================================================
75
+ # 2. OPENSUBTITLES OPUS (NO EOS - continuous dialogues)
76
+ # ========================================================================
77
+ print("\n2️⃣ OpenSubtitles OPUS (NO EOS - continuous dialogues)...")
78
+ try:
79
+ opus_ds = load_dataset(
80
+ "arnomatic/german-opus-subtitles",
81
+ split="train",
82
+ streaming=True
83
+ )
84
+ print(" ✅ Dataset loaded (streaming mode)")
85
+
86
+ # Shuffle
87
+ print(" 🔀 Shuffling with buffer_size=10,000...")
88
+ opus_ds = opus_ds.shuffle(seed=effective_seed, buffer_size=10000)
89
+ print(" ✅ Shuffle applied")
90
+
91
+ except Exception as e:
92
+ print(f" ❌ OpenSubtitles Error: {e}")
93
+ raise ValueError(f"Failed to load OpenSubtitles: {e}")
94
+
95
+ # ========================================================================
96
+ # 3. BELLETRISTIK (NO EOS - continuous)
97
+ # ========================================================================
98
+ print("\n3️⃣ Belletristik (NO EOS - continuous)...")
99
+ try:
100
+ belle_ds = load_dataset(
101
+ "arnomatic/merged_all",
102
+ split="train",
103
+ streaming=True
104
+ )
105
+ print(" ✅ Dataset loaded (streaming mode)")
106
+
107
+ # Shuffle
108
+ print(" 🔀 Shuffling with buffer_size=10,000...")
109
+ belle_ds = belle_ds.shuffle(seed=effective_seed, buffer_size=10000)
110
+ print(" ✅ Shuffle applied")
111
+
112
+ except Exception as e:
113
+ print(f" ❌ Belletristik Error: {e}")
114
+ raise ValueError(f"Failed to load Belletristik: {e}")
115
+
116
+ print("\n✅ All datasets loaded!")
117
+ print(" Wikipedia: 4 GB (WITH EOS)")
118
+ print(" OpenSubtitles: 4.2 GB (NO EOS)")
119
+ print(" Belletristik: 2.2 GB (NO EOS)")
120
+ print(" Total: ~10.4 GB clean German!")
121
+
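Note that the three sources expose different column names — the packing calls below use text_field='content' for Wikipedia but 'text' for the other two. A quick way to confirm the field names before a long run (a sketch, assuming the streams above loaded successfully):

# Peek at one example per stream to verify the text column name
for name, ds in [("wiki", wiki_ds), ("opus", opus_ds), ("belle", belle_ds)]:
    example = next(iter(ds))
    print(name, list(example.keys()))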
122
+ # ========================================================================
123
+ # DIRECT PACKING (no intermediate tokenization)
124
+ # ========================================================================
125
+ print("\n🔤 Tokenizing & Packing datasets...")
126
+
127
+ from datasets import IterableDataset as HFIterableDataset
128
+
129
+ def pack_dataset_with_eos(dataset, text_field='text'):
130
+         """Pack dataset, appending EOS after each document, into fixed max_length-token sequences"""
131
+ def gen():
132
+ buffer = []
133
+ for example in dataset:
134
+ text = example.get(text_field, '')
135
+ if not text or not text.strip():
136
+ continue
137
+
138
+ # Tokenize
139
+ tokens = tokenizer.encode(text, add_special_tokens=False)
140
+
141
+ # Add tokens + EOS
142
+ buffer.extend(tokens)
143
+ buffer.append(tokenizer.eos_token_id)
144
+
145
+ # Yield complete chunks
146
+ while len(buffer) >= max_length:
147
+ yield {
148
+ "input_ids": buffer[:max_length],
149
+ "attention_mask": [1] * max_length,
150
+ "labels": buffer[:max_length],
151
+ }
152
+ buffer = buffer[max_length:]
153
+
154
+ return HFIterableDataset.from_generator(gen)
155
+
156
+ def pack_dataset_no_eos(dataset, text_field='text'):
157
+         """Pack dataset without EOS markers into fixed max_length-token sequences"""
158
+ def gen():
159
+ buffer = []
160
+ for example in dataset:
161
+ text = example.get(text_field, '')
162
+ if not text or not text.strip():
163
+ continue
164
+
165
+ # Tokenize
166
+ tokens = tokenizer.encode(text, add_special_tokens=False)
167
+
168
+ # Add tokens (NO EOS)
169
+ buffer.extend(tokens)
170
+
171
+ # Yield complete chunks
172
+ while len(buffer) >= max_length:
173
+ yield {
174
+ "input_ids": buffer[:max_length],
175
+ "attention_mask": [1] * max_length,
176
+ "labels": buffer[:max_length],
177
+ }
178
+ buffer = buffer[max_length:]
179
+
180
+ return HFIterableDataset.from_generator(gen)
181
+
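The two packers above are identical except for the EOS append; they could be collapsed into one helper. A minimal sketch (not in the original file) that would drop in at the same place, reusing the enclosing tokenizer and max_length:

def pack_dataset(dataset, text_field="text", add_eos=False):
    """Pack texts into fixed max_length-token sequences, optionally appending EOS per document."""
    def gen():
        buffer = []
        for example in dataset:
            text = example.get(text_field, "")
            if not text or not text.strip():
                continue
            buffer.extend(tokenizer.encode(text, add_special_tokens=False))
            if add_eos:
                buffer.append(tokenizer.eos_token_id)
            while len(buffer) >= max_length:
                yield {
                    "input_ids": buffer[:max_length],
                    "attention_mask": [1] * max_length,
                    "labels": buffer[:max_length],
                }
                buffer = buffer[max_length:]
    return HFIterableDataset.from_generator(gen)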
182
+ print(" Wikipedia (WITH EOS)...")
183
+ wiki_batched = pack_dataset_with_eos(wiki_ds, text_field='content')
184
+
185
+ print(" OpenSubtitles (NO EOS)...")
186
+ opus_batched = pack_dataset_no_eos(opus_ds, text_field='text')
187
+
188
+ print(" Belletristik (NO EOS)...")
189
+ belle_batched = pack_dataset_no_eos(belle_ds, text_field='text')
190
+
191
+ print("✅ Batching complete!")
192
+
193
+ # ========================================================================
194
+ # INTERLEAVE DATASETS (64% Wiki, 24% OPUS, 12% Belle)
195
+ # ========================================================================
196
+ print("\n🔀 Interleaving datasets (64/24/12)...")
197
+
198
+ train_dataset = interleave_datasets(
199
+ [wiki_batched, opus_batched, belle_batched],
200
+ probabilities=[0.64, 0.24, 0.12],
201
+ seed=effective_seed,
202
+ stopping_strategy="all_exhausted"
203
+ )
204
+
205
+ print("✅ Datasets interleaved! (v8 strategy)")
206
+ print(" Wikipedia: 64%")
207
+ print(" OpenSubtitles: 24%")
208
+ print(" Belletristik: 12%")
209
+
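The probabilities control how often each stream is sampled per packed sequence, not byte shares of the corpora. A tiny self-contained illustration of interleave_datasets (toy data, not part of the training pipeline):

from datasets import Dataset, interleave_datasets

a = Dataset.from_dict({"src": ["a"] * 1000})
b = Dataset.from_dict({"src": ["b"] * 1000})
mixed = interleave_datasets([a, b], probabilities=[0.8, 0.2], seed=0)
counts = {}
for ex in mixed.select(range(100)):
    counts[ex["src"]] = counts.get(ex["src"], 0) + 1
print(counts)  # roughly 80 'a' and 20 'b'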
210
+ # ========================================================================
211
+ # EVAL DATASET (fixed 500 samples from Wikipedia)
212
+ # ========================================================================
213
+ eval_dataset_path = "./eval_dataset_v8_clean"
214
+
215
+ if os.path.exists(eval_dataset_path):
216
+ print(f"\n📊 Loading existing eval dataset from {eval_dataset_path}...")
217
+ from datasets import load_from_disk
218
+ eval_dataset = load_from_disk(eval_dataset_path)
219
+ print(f"✅ Eval dataset loaded: {len(eval_dataset)} samples (from disk)")
220
+ else:
221
+ print("\n📊 Creating fixed eval set (500 samples from Wikipedia)...")
222
+
223
+ eval_samples = []
224
+ eval_iter = iter(wiki_batched)
225
+ for i in range(500):
226
+ try:
227
+ sample = next(eval_iter)
228
+ eval_samples.append(sample)
229
+ if (i + 1) % 100 == 0:
230
+ print(f" Collected {i+1}/500 samples...")
231
+ except StopIteration:
232
+ print(f" ⚠️ Only {i} eval samples available (dataset exhausted)")
233
+ break
234
+
235
+ if len(eval_samples) == 0:
236
+ raise ValueError("No eval samples collected! Dataset exhausted immediately.")
237
+
238
+ print(f" Collected {len(eval_samples)} samples total")
239
+
240
+ # Convert to regular Dataset (not streaming!)
241
+ from datasets import Dataset
242
+ eval_dataset = Dataset.from_dict({
243
+ key: [sample[key] for sample in eval_samples]
244
+ for key in eval_samples[0].keys()
245
+ })
246
+
247
+ # Save to disk
248
+ print(f"💾 Saving eval dataset to {eval_dataset_path}...")
249
+ eval_dataset.save_to_disk(eval_dataset_path)
250
+ print(f"✅ Eval dataset saved to disk!")
251
+
252
+ print(f" → No more fsspec cache leak!")
253
+ print(f" Training: Clean Mix (streaming)")
254
+ print(f" Eval: {len(eval_dataset)} samples (fixed, from disk)\n")
255
+
256
+ return train_dataset, eval_dataset
257
+
258
+
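Before a multi-day run it can be worth sanity-checking the packed stream. A standalone sketch (assumes access to the gated Llama tokenizer and the datasets above):

# Standalone sanity check of the packed stream (illustrative, not part of the script)
from transformers import AutoTokenizer
from train_moe_v8_clean import load_clean_datasets

tok = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
tok.pad_token = tok.eos_token
train_ds, eval_ds = load_clean_datasets(tok, max_length=2048)
first = next(iter(train_ds))
assert len(first["input_ids"]) == 2048
assert first["input_ids"] == first["labels"]
print("packed stream OK,", len(eval_ds), "eval samples")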
259
+ def main():
260
+ SEED = 42
261
+ set_seed(SEED)
262
+
263
+ # Config
264
+ config = MoEGPTConfig(
265
+ vocab_size=128256,
266
+ n_positions=2048,
267
+ n_embd=512,
268
+ n_layer=8,
269
+ n_head=8,
270
+ n_experts=8,
271
+ n_experts_active=2,
272
+ moe_layer_frequency=2,
273
+ capacity_factor=1.25,
274
+ eval_capacity_factor=2.0,
275
+ use_noisy_gating=True,
276
+ aux_loss_alpha=0.01,
277
+ router_z_loss_alpha=0.001,
278
+ bias=False,
279
+ dropout=0.1,
280
+ activation_function="gelu",
281
+ initializer_range=0.1,
282
+ rope_theta=10000.0,
283
+ )
284
+
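With n_experts=8 and n_experts_active=2, each token is routed to 2 of 8 experts within an MoE layer, i.e. a quarter of the expert FFN weights per token (attention, embeddings and the dense layers stay always active). A quick check of that ratio:

# Fraction of expert weights a token touches in an MoE layer (simple ratio, nothing model-specific)
expert_fraction = config.n_experts_active / config.n_experts
print(f"{expert_fraction:.0%} of expert parameters active per token")  # 25%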
285
+ print("\n🔧 Model Config:")
286
+ print(f" - Experten: {config.n_experts} (Top-{config.n_experts_active})")
287
+     print(f"   - MoE-Experten gesamt: {config.total_experts}")
288
+
289
+ # Training Args
290
+     # Dataset: ~10.4 GB ≈ 2.5B tokens ≈ 1.2M sequences of 2048 tokens
291
+     # Effective batch: 2 per device × 16 grad accum = 32 sequences → ~38K steps per epoch
292
+     # ~50K steps ≈ 1.3 epochs; max_steps=200000 below ≈ 5 epochs over the interleaved mix
293
+ training_args = TrainingArguments(
294
+ output_dir="./moe_checkpoints_v8_clean",
295
+ run_name="german_moe_v8_clean",
296
+ max_steps=200000,
297
+ per_device_train_batch_size=2,
298
+ per_device_eval_batch_size=2,
299
+ gradient_accumulation_steps=16,
300
+ learning_rate=6e-4,
301
+ warmup_steps=2000,
302
+ lr_scheduler_type="cosine",
303
+ weight_decay=0.1,
304
+ bf16=torch.cuda.is_bf16_supported() if torch.cuda.is_available() else False,
305
+ fp16=not torch.cuda.is_bf16_supported() if torch.cuda.is_available() else False,
306
+ logging_dir="./logs_v8_clean",
307
+ logging_steps=100,
308
+ logging_first_step=True,
309
+ report_to=["tensorboard"],
310
+ eval_strategy="steps",
311
+ eval_steps=1000, # Every 1K steps (more frequent than v7)
312
+ save_strategy="steps",
313
+ save_steps=1000,
314
+ save_total_limit=10,
315
+ dataloader_num_workers=0,
316
+ dataloader_pin_memory=True,
317
+ gradient_checkpointing=True,
318
+ seed=SEED,
319
+ load_best_model_at_end=False,
320
+ metric_for_best_model="eval_loss",
321
+ greater_is_better=False,
322
+ ignore_data_skip=True, # CRITICAL: Don't skip batches, use fresh shuffled data!
323
+ )
324
+
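A quick back-of-the-envelope on these arguments (the 2.5B-token corpus size is the rough estimate from the comment above, not a measured count):

seq_len = 2048
effective_batch = 2 * 16                     # per_device_train_batch_size * gradient_accumulation_steps
tokens_per_step = seq_len * effective_batch  # 65,536 tokens per optimizer step
total_tokens = tokens_per_step * 200_000     # ≈ 13.1B tokens over max_steps
epochs = total_tokens / 2.5e9                # ≈ 5.2 passes over the interleaved mix
print(tokens_per_step, total_tokens, round(epochs, 1))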
325
+ # Check for existing checkpoints (auto-resume) - DO THIS EARLY!
326
+ import glob
327
+ checkpoints = glob.glob(os.path.join(training_args.output_dir, "checkpoint-*"))
328
+ resume_from_checkpoint = None
329
+ resume_step = 0
330
+
331
+ if checkpoints:
332
+ latest_checkpoint = max(checkpoints, key=lambda x: int(x.split("-")[-1]))
333
+ resume_from_checkpoint = latest_checkpoint
334
+ resume_step = int(latest_checkpoint.split("-")[-1])
335
+ print(f"\n🔄 RESUME Training from: {latest_checkpoint} (Step {resume_step})")
336
+ else:
337
+ print("\n🆕 Starting fresh training (no checkpoints found)")
338
+
339
+ # Tokenizer
340
+ print("\n📚 Lade Tokenizer...")
341
+ tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
342
+ tokenizer.pad_token = tokenizer.eos_token
343
+ print("✅ Llama 3.2 Tokenizer geladen")
344
+
345
+ # Load Clean Datasets (with resume_step for reproducibility!)
346
+ train_dataset, eval_dataset = load_clean_datasets(
347
+ tokenizer=tokenizer,
348
+ max_length=2048,
349
+ seed=SEED,
350
+ resume_step=resume_step,
351
+ )
352
+
353
+ # Data Collator
354
+ data_collator = DataCollatorForLanguageModeling(pad_token_id=tokenizer.pad_token_id)
355
+
356
+ # Model
357
+ print("\n🏗️ Erstelle MoE Modell...")
358
+ model = MoEGPTForCausalLM(config)
359
+
360
+ # Ensure weight tying (especially after checkpoint load)
361
+ model.tie_weights()
362
+
363
+ total_params = sum(p.numel() for p in model.parameters())
364
+ print(f"✅ Modell erstellt! ({total_params/1e6:.1f}M params)")
365
+
366
+ # Callbacks
367
+ sample_callback = SampleGenerationCallback(
368
+ tokenizer=tokenizer,
369
+ prompts=get_german_sample_prompts(),
370
+ generate_every_n_steps=1000, # Every 1K steps - fast feedback!
371
+ max_new_tokens=500,
372
+ temperature=0.7,
373
+ top_p=0.7,
374
+ output_dir="./samples_v8_clean",
375
+ )
376
+
377
+ # Trainer
378
+ print("\n🚀 Initialisiere Trainer...")
379
+ trainer = MoETrainer(
380
+ model=model,
381
+ args=training_args,
382
+ train_dataset=train_dataset,
383
+ eval_dataset=eval_dataset,
384
+ data_collator=data_collator,
385
+ callbacks=[MoEEvalCallback(), sample_callback],
386
+ )
387
+
388
+ print("✅ Trainer bereit!")
389
+
390
+ print("\n" + "=" * 60)
391
+ print("🎯 STARTE TRAINING v8 - OPUS EDITION!")
392
+ print("=" * 60)
393
+ print("\nDataset Composition (INTERLEAVED!):")
394
+ print(" Wikipedia (WITH EOS): 64%")
395
+ print(" OpenSubtitles OPUS (NO EOS): 24%")
396
+ print(" Belletristik (NO EOS): 12%")
397
+ print("\nTotal: ~10.4 GB CLEAN German!")
398
+ print("NO spam, NO ads, NO SEO garbage! 🎉")
399
+ print("PLUS natural dialogues from movie subtitles! 🎬")
400
+ print("=" * 60 + "\n")
401
+
402
+ # Train with resume support
403
+ trainer.train(resume_from_checkpoint=resume_from_checkpoint)
404
+
405
+ # Save
406
+ print("\n💾 Speichere finales Modell...")
407
+ final_model_path = "./moe_final_v8_clean"
408
+ trainer.save_model(final_model_path)
409
+ config.save_pretrained(final_model_path)
410
+ print(f"✅ Modell gespeichert in: {final_model_path}")
411
+
412
+ # Eval
413
+ print("\n📊 Finale Evaluation...")
414
+ eval_results = trainer.evaluate()
415
+
416
+ for key, value in eval_results.items():
417
+ print(f" - {key}: {value:.4f}")
418
+
419
+ if "eval_loss" in eval_results:
420
+ perplexity = torch.exp(torch.tensor(eval_results["eval_loss"]))
421
+ print(f"\n🎯 Finale Perplexity: {perplexity:.2f}")
422
+
423
+ print("\n" + "=" * 60)
424
+ print("✅ TRAINING ABGESCHLOSSEN!")
425
+ print("=" * 60)
426
+
427
+
428
+ if __name__ == "__main__":
429
+ main()
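For reference, the perplexity reported at the end is simply exp(eval_loss); with illustrative loss values (not results from this run):

import math

for loss in (3.5, 3.0, 2.5):
    print(f"eval_loss={loss:.1f} -> perplexity≈{math.exp(loss):.1f}")  # 33.1, 20.1, 12.2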