# File size: 2,938 Bytes
# 2969cfa
# Food Image Classifier Configuration
# Project identity metadata.
project:
  name: 'food_image_classifier'
  # Quoted so the version is always read as a string.
  version: '1.0.0'
  description: 'World-Class Food Image Classifier with Hybrid CNN-ViT Architecture'

# Hardware / runtime settings
hardware:
  # Target device; the author's note names an RTX 5060 Laptop GPU.
  device: 'cuda'
  mixed_precision: true
  compile_model: true
  # NOTE(review): num_workers / pin_memory look like data-loader options;
  # confirm against the code that consumes this file.
  num_workers: 4
  pin_memory: true

# Dataset, split and augmentation settings
data:
  image_size: 224
  # Reduced to avoid memory issues
  batch_size: 32
  # Food101 dataset: 101 classes, 1000 images per class
  num_classes: 101
  datasets:
    - name: 'food101'
      source: 'kaggle'
      path: 'data/raw/food101'
    # The HuggingFace dataset is temporarily disabled so that only Food101
    # is used; uncomment the entry below to re-enable it.
    # - name: "food_images_hf"
    #   source: "huggingface"
    #   path: "data/raw/food_images_hf"

  # Train / validation / test fractions (0.8 + 0.15 + 0.05 = 1.0)
  train_ratio: 0.8
  val_ratio: 0.15
  test_ratio: 0.05

  # Augmentation parameters
  augmentation:
    # NOTE(review): presumably the probability of applying a flip; confirm.
    horizontal_flip: 0.5
    # NOTE(review): presumably a maximum angle in degrees; confirm.
    rotation: 15
    color_jitter:
      brightness: 0.2
      contrast: 0.2
      saturation: 0.2
      hue: 0.1
    # NOTE(review): these three-element mean/std values appear to be the
    # widely used ImageNet RGB statistics; confirm that is the intent.
    normalize:
      mean:
        - 0.485
        - 0.456
        - 0.406
      std:
        - 0.229
        - 0.224
        - 0.225

# Model architecture settings
model:
  architecture: 'hybrid_cnn_vit'

  # CNN branch (ResNet50)
  cnn:
    backbone: 'resnet50'
    pretrained: true
    freeze_early_layers: true
    dropout: 0.3

  # ViT branch (DeiT-Base)
  vit:
    model_name: 'facebook/deit-base-distilled-patch16-224'
    pretrained: true
    freeze_early_layers: true
    dropout: 0.1

  # Fusion module
  fusion:
    hidden_dim: 512
    num_heads: 8
    dropout: 0.2

  # Classification head
  head:
    hidden_dims:
      - 1024
      - 512
    dropout: 0.4

# Training Configuration
#
# Floats in scientific notation are written with an explicit decimal point
# (1.0e-4, not 1e-4). YAML 1.1 loaders such as PyYAML only resolve a plain
# scalar to a float when the mantissa contains a ".", so a value written as
# 1e-4 would be loaded as the string "1e-4" instead of a number.
training:
  epochs: 100  # Increased for comprehensive training with 101k images
  learning_rate: 1.0e-4
  weight_decay: 1.0e-5

  # Optimizer
  optimizer:
    type: "adamw"
    betas: [0.9, 0.999]
    eps: 1.0e-8

  # Learning Rate Scheduler
  scheduler:
    type: "cosine_annealing_warm_restarts"
    T_0: 10
    T_mult: 2
    eta_min: 1.0e-6

  # Loss Function
  loss:
    type: "label_smoothing_cross_entropy"
    smoothing: 0.1

  # Advanced Training Techniques
  ema:
    enabled: true
    decay: 0.9999

  gradient_clipping:
    enabled: true
    max_norm: 1.0

  early_stopping:
    enabled: true
    patience: 10
    min_delta: 0.001

# Evaluation settings
evaluation:
  # Metric names
  metrics:
    - 'accuracy'
    - 'top5_accuracy'
    - 'f1_score'
    - 'precision'
    - 'recall'

  # Report artifacts
  save_confusion_matrix: true
  save_classification_report: true

# Logging and checkpoint settings
logging:
  tensorboard:
    enabled: true
    log_dir: 'runs'

  wandb:
    # Set to true if you want to use wandb
    enabled: false
    project: 'food_classifier'

  checkpoint:
    save_best: true
    save_last: true
    save_every_n_epochs: 10

# Credential placeholders (values are expected to come from the environment)
#
# YAML does not expand ${VAR} on its own: a plain loader returns these
# strings literally, so the substitution has to be done by the loading code.
api_keys:
  kaggle_username: '${KAGGLE_USERNAME}'
  kaggle_key: '${KAGGLE_KEY}'
  huggingface_token: '${HF_TOKEN}'