Upload folder using huggingface_hub
Browse files- .gitattributes +3 -0
- nemo/context/io.json +1 -0
- nemo/context/model.yaml +277 -0
- nemo/context/nemo_tokenizer/special_tokens_map.json +23 -0
- nemo/context/nemo_tokenizer/tokenizer.json +3 -0
- nemo/context/nemo_tokenizer/tokenizer_config.json +0 -0
- nemo/weights/.metadata +3 -0
- nemo/weights/__0_0.distcp +3 -0
- nemo/weights/__0_1.distcp +3 -0
- nemo/weights/common.pt +3 -0
- nemo/weights/metadata.json +1 -0
.gitattributes
CHANGED
|
@@ -34,3 +34,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
nemo/weights/.metadata filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
nemo/weights/__0_0.distcp filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
nemo/weights/__0_1.distcp filter=lfs diff=lfs merge=lfs -text
|
nemo/context/io.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"root": {"type": "ref", "key": "trainer_context_1"}, "objects": {"tuple_1": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "tensor_model_parallel_size"], ["Index(index=1)", "pipeline_model_parallel_size"], ["Index(index=2)", "virtual_pipeline_model_parallel_size"], ["Index(index=3)", "sequence_parallel"], ["Index(index=4)", "context_parallel_size"], ["Index(index=5)", "expert_model_parallel_size"], ["Index(index=6)", "expert_tensor_parallel_size"], ["Index(index=7)", "moe_extended_tp"], ["Index(index=8)", "use_te_rng_tracker"], ["Index(index=9)", "pipeline_dtype"], ["Index(index=10)", "microbatch_group_size_per_vp_stage"], ["Index(index=11)", "num_layers_in_first_pipeline_stage"], ["Index(index=12)", "num_layers_in_last_pipeline_stage"], ["Index(index=13)", "account_for_embedding_in_pipeline_split"], ["Index(index=14)", "account_for_loss_in_pipeline_split"]], "metadata": null}, "dict_1": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_2": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.collections.llm.gpt.model.ssm", "name": "NemotronHConfig47B"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_1"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_1"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_2"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "nemotron_hconfig47_b_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='tensor_model_parallel_size')", {"type": "leaf", "value": 1, "paths": ["<root>.model.config.tensor_model_parallel_size"]}], ["Attr(name='pipeline_model_parallel_size')", {"type": "leaf", "value": 1, "paths": ["<root>.model.config.pipeline_model_parallel_size"]}], ["Attr(name='virtual_pipeline_model_parallel_size')", {"type": "leaf", "value": null, "paths": ["<root>.model.config.virtual_pipeline_model_parallel_size"]}], ["Attr(name='sequence_parallel')", {"type": "leaf", "value": false, "paths": ["<root>.model.config.sequence_parallel"]}], ["Attr(name='context_parallel_size')", {"type": "leaf", "value": 1, "paths": ["<root>.model.config.context_parallel_size"]}], ["Attr(name='expert_model_parallel_size')", {"type": "leaf", "value": 1, "paths": ["<root>.model.config.expert_model_parallel_size"]}], ["Attr(name='expert_tensor_parallel_size')", {"type": "leaf", "value": null, "paths": ["<root>.model.config.expert_tensor_parallel_size"]}], ["Attr(name='moe_extended_tp')", {"type": "leaf", "value": false, "paths": ["<root>.model.config.moe_extended_tp"]}], ["Attr(name='use_te_rng_tracker')", {"type": "leaf", "value": false, "paths": ["<root>.model.config.use_te_rng_tracker"]}], ["Attr(name='pipeline_dtype')", {"type": "leaf", "value": null, "paths": ["<root>.model.config.pipeline_dtype"]}], ["Attr(name='microbatch_group_size_per_vp_stage')", {"type": "leaf", "value": 1, "paths": ["<root>.model.config.microbatch_group_size_per_vp_stage"]}], ["Attr(name='num_layers_in_first_pipeline_stage')", {"type": "leaf", "value": null, "paths": ["<root>.model.config.num_layers_in_first_pipeline_stage"]}], ["Attr(name='num_layers_in_last_pipeline_stage')", {"type": "leaf", "value": null, "paths": ["<root>.model.config.num_layers_in_last_pipeline_stage"]}], ["Attr(name='account_for_embedding_in_pipeline_split')", {"type": "leaf", "value": false, "paths": ["<root>.model.config.account_for_embedding_in_pipeline_split"]}], ["Attr(name='account_for_loss_in_pipeline_split')", {"type": "leaf", "value": false, "paths": ["<root>.model.config.account_for_loss_in_pipeline_split"]}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_1"}, "paths": ["<root>.model.config"]}, "tuple_2": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "optimizer"], ["Index(index=1)", "lr"], ["Index(index=2)", "min_lr"], ["Index(index=3)", "decoupled_lr"], ["Index(index=4)", "decoupled_min_lr"], ["Index(index=5)", "weight_decay"], ["Index(index=6)", "fp16"], ["Index(index=7)", "bf16"], ["Index(index=8)", "params_dtype"], ["Index(index=9)", "use_precision_aware_optimizer"], ["Index(index=10)", "main_grads_dtype"], ["Index(index=11)", "main_params_dtype"], ["Index(index=12)", "exp_avg_dtype"], ["Index(index=13)", "exp_avg_sq_dtype"], ["Index(index=14)", "loss_scale"], ["Index(index=15)", "initial_loss_scale"], ["Index(index=16)", "min_loss_scale"], ["Index(index=17)", "loss_scale_window"], ["Index(index=18)", "hysteresis"], ["Index(index=19)", "adam_beta1"], ["Index(index=20)", "adam_beta2"], ["Index(index=21)", "adam_eps"], ["Index(index=22)", "sgd_momentum"], ["Index(index=23)", "use_distributed_optimizer"], ["Index(index=24)", "overlap_param_gather_with_optimizer_step"], ["Index(index=25)", "optimizer_cpu_offload"], ["Index(index=26)", "optimizer_offload_fraction"], ["Index(index=27)", "use_torch_optimizer_for_cpu_offload"], ["Index(index=28)", "overlap_cpu_optimizer_d2h_h2d"], ["Index(index=29)", "pin_cpu_grads"], ["Index(index=30)", "pin_cpu_params"], ["Index(index=31)", "clip_grad"], ["Index(index=32)", "log_num_zeros_in_grad"], ["Index(index=33)", "barrier_with_L1_time"], ["Index(index=34)", "timers"], ["Index(index=35)", "config_logger_dir"]], "metadata": null}, "dict_3": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_4": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_2": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "megatron.core.optimizer.optimizer_config", "name": "OptimizerConfig"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_2"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_3"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_4"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "optimizer_config_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='optimizer')", {"type": "leaf", "value": "adam", "paths": ["<root>.model.optim.config.optimizer"]}], ["Attr(name='lr')", {"type": "leaf", "value": 0.0001, "paths": ["<root>.model.optim.config.lr"]}], ["Attr(name='min_lr')", {"type": "leaf", "value": null, "paths": ["<root>.model.optim.config.min_lr"]}], ["Attr(name='decoupled_lr')", {"type": "leaf", "value": null, "paths": ["<root>.model.optim.config.decoupled_lr"]}], ["Attr(name='decoupled_min_lr')", {"type": "leaf", "value": null, "paths": ["<root>.model.optim.config.decoupled_min_lr"]}], ["Attr(name='weight_decay')", {"type": "leaf", "value": 0.01, "paths": ["<root>.model.optim.config.weight_decay"]}], ["Attr(name='fp16')", {"type": "leaf", "value": false, "paths": ["<root>.model.optim.config.fp16"]}], ["Attr(name='bf16')", {"type": "leaf", "value": false, "paths": ["<root>.model.optim.config.bf16"]}], ["Attr(name='params_dtype')", {"type": "pyref", "module": "torch", "name": "float32", "paths": ["<root>.model.optim.config.params_dtype", "<root>.model.optim.config.main_grads_dtype", "<root>.model.optim.config.main_params_dtype", "<root>.model.optim.config.exp_avg_dtype", "<root>.model.optim.config.exp_avg_sq_dtype"]}], ["Attr(name='use_precision_aware_optimizer')", {"type": "leaf", "value": false, "paths": ["<root>.model.optim.config.use_precision_aware_optimizer"]}], ["Attr(name='main_grads_dtype')", {"type": "pyref", "module": "torch", "name": "float32", "paths": ["<root>.model.optim.config.params_dtype", "<root>.model.optim.config.main_grads_dtype", "<root>.model.optim.config.main_params_dtype", "<root>.model.optim.config.exp_avg_dtype", "<root>.model.optim.config.exp_avg_sq_dtype"]}], ["Attr(name='main_params_dtype')", {"type": "pyref", "module": "torch", "name": "float32", "paths": ["<root>.model.optim.config.params_dtype", "<root>.model.optim.config.main_grads_dtype", "<root>.model.optim.config.main_params_dtype", "<root>.model.optim.config.exp_avg_dtype", "<root>.model.optim.config.exp_avg_sq_dtype"]}], ["Attr(name='exp_avg_dtype')", {"type": "pyref", "module": "torch", "name": "float32", "paths": ["<root>.model.optim.config.params_dtype", "<root>.model.optim.config.main_grads_dtype", "<root>.model.optim.config.main_params_dtype", "<root>.model.optim.config.exp_avg_dtype", "<root>.model.optim.config.exp_avg_sq_dtype"]}], ["Attr(name='exp_avg_sq_dtype')", {"type": "pyref", "module": "torch", "name": "float32", "paths": ["<root>.model.optim.config.params_dtype", "<root>.model.optim.config.main_grads_dtype", "<root>.model.optim.config.main_params_dtype", "<root>.model.optim.config.exp_avg_dtype", "<root>.model.optim.config.exp_avg_sq_dtype"]}], ["Attr(name='loss_scale')", {"type": "leaf", "value": null, "paths": ["<root>.model.optim.config.loss_scale"]}], ["Attr(name='initial_loss_scale')", {"type": "leaf", "value": 4294967296, "paths": ["<root>.model.optim.config.initial_loss_scale"]}], ["Attr(name='min_loss_scale')", {"type": "leaf", "value": 1.0, "paths": ["<root>.model.optim.config.min_loss_scale"]}], ["Attr(name='loss_scale_window')", {"type": "leaf", "value": 1000, "paths": ["<root>.model.optim.config.loss_scale_window"]}], ["Attr(name='hysteresis')", {"type": "leaf", "value": 2, "paths": ["<root>.model.optim.config.hysteresis"]}], ["Attr(name='adam_beta1')", {"type": "leaf", "value": 0.9, "paths": ["<root>.model.optim.config.adam_beta1"]}], ["Attr(name='adam_beta2')", {"type": "leaf", "value": 0.999, "paths": ["<root>.model.optim.config.adam_beta2"]}], ["Attr(name='adam_eps')", {"type": "leaf", "value": 1e-08, "paths": ["<root>.model.optim.config.adam_eps"]}], ["Attr(name='sgd_momentum')", {"type": "leaf", "value": 0.9, "paths": ["<root>.model.optim.config.sgd_momentum"]}], ["Attr(name='use_distributed_optimizer')", {"type": "leaf", "value": true, "paths": ["<root>.model.optim.config.use_distributed_optimizer"]}], ["Attr(name='overlap_param_gather_with_optimizer_step')", {"type": "leaf", "value": false, "paths": ["<root>.model.optim.config.overlap_param_gather_with_optimizer_step"]}], ["Attr(name='optimizer_cpu_offload')", {"type": "leaf", "value": false, "paths": ["<root>.model.optim.config.optimizer_cpu_offload"]}], ["Attr(name='optimizer_offload_fraction')", {"type": "leaf", "value": 0.0, "paths": ["<root>.model.optim.config.optimizer_offload_fraction"]}], ["Attr(name='use_torch_optimizer_for_cpu_offload')", {"type": "leaf", "value": false, "paths": ["<root>.model.optim.config.use_torch_optimizer_for_cpu_offload"]}], ["Attr(name='overlap_cpu_optimizer_d2h_h2d')", {"type": "leaf", "value": false, "paths": ["<root>.model.optim.config.overlap_cpu_optimizer_d2h_h2d"]}], ["Attr(name='pin_cpu_grads')", {"type": "leaf", "value": true, "paths": ["<root>.model.optim.config.pin_cpu_grads"]}], ["Attr(name='pin_cpu_params')", {"type": "leaf", "value": true, "paths": ["<root>.model.optim.config.pin_cpu_params"]}], ["Attr(name='clip_grad')", {"type": "leaf", "value": 1.0, "paths": ["<root>.model.optim.config.clip_grad"]}], ["Attr(name='log_num_zeros_in_grad')", {"type": "leaf", "value": false, "paths": ["<root>.model.optim.config.log_num_zeros_in_grad"]}], ["Attr(name='barrier_with_L1_time')", {"type": "leaf", "value": false, "paths": ["<root>.model.optim.config.barrier_with_L1_time"]}], ["Attr(name='timers')", {"type": "leaf", "value": null, "paths": ["<root>.model.optim.config.timers"]}], ["Attr(name='config_logger_dir')", {"type": "leaf", "value": "", "paths": ["<root>.model.optim.config.config_logger_dir"]}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_2"}, "paths": ["<root>.model.optim.config"]}, "tuple_3": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "config"]], "metadata": null}, "dict_5": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_6": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_3": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.lightning.pytorch.optim.megatron", "name": "MegatronOptimizerModule"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_3"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_5"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_6"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "megatron_optimizer_module_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='config')", {"type": "ref", "key": "optimizer_config_1"}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_3"}, "paths": ["<root>.model.optim"]}, "tuple_4": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "pretrained_model_name"], ["Index(index=1)", "trust_remote_code"]], "metadata": null}, "dict_7": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_8": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_4": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.collections.common.tokenizers.huggingface.auto_tokenizer", "name": "AutoTokenizer"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_4"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_7"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_8"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "auto_tokenizer_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='pretrained_model_name')", {"type": "leaf", "value": "nemo_tokenizer", "paths": ["<root>.model.tokenizer.pretrained_model_name"]}], ["Attr(name='trust_remote_code')", {"type": "leaf", "value": true, "paths": ["<root>.model.tokenizer.trust_remote_code"]}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_4"}, "paths": ["<root>.model.tokenizer"]}, "tuple_5": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "config"], ["Index(index=1)", "optim"], ["Index(index=2)", "tokenizer"]], "metadata": null}, "dict_9": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_10": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_5": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.collections.llm.gpt.model.ssm", "name": "MambaModel"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_5"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_9"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_10"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "mamba_model_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='config')", {"type": "ref", "key": "nemotron_hconfig47_b_1"}], ["Attr(name='optim')", {"type": "ref", "key": "megatron_optimizer_module_1"}], ["Attr(name='tokenizer')", {"type": "ref", "key": "auto_tokenizer_1"}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_5"}, "paths": ["<root>.model"]}, "tuple_6": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "always_save_context"]], "metadata": null}, "dict_11": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [["Key(key='always_save_context')", {"type": "leaf", "value": true, "paths": ["<root>.trainer.strategy.kwargs['always_save_context']"]}]], "metadata": {"type": "ref", "key": "tuple_6"}, "paths": ["<root>.trainer.strategy.kwargs"]}, "tuple_7": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "ckpt_save_optimizer"], ["Index(index=1)", "kwargs"]], "metadata": null}, "dict_12": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_13": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_6": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.lightning.pytorch.strategies.megatron_strategy", "name": "MegatronStrategy"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_7"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_12"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_13"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "megatron_strategy_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='ckpt_save_optimizer')", {"type": "leaf", "value": false, "paths": ["<root>.trainer.strategy.ckpt_save_optimizer"]}], ["Attr(name='kwargs')", {"type": "ref", "key": "dict_11"}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_6"}, "paths": ["<root>.trainer.strategy"]}, "tuple_8": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "accelerator"], ["Index(index=1)", "strategy"], ["Index(index=2)", "devices"]], "metadata": null}, "dict_14": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_15": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_7": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.lightning.pytorch.trainer", "name": "Trainer"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_8"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_14"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_15"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "trainer_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='accelerator')", {"type": "leaf", "value": "cpu", "paths": ["<root>.trainer.accelerator"]}], ["Attr(name='strategy')", {"type": "ref", "key": "megatron_strategy_1"}], ["Attr(name='devices')", {"type": "leaf", "value": 1, "paths": ["<root>.trainer.devices"]}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_7"}, "paths": ["<root>.trainer"]}, "dict_16": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}, "paths": ["<root>.extra"]}, "tuple_9": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "model"], ["Index(index=1)", "trainer"], ["Index(index=2)", "extra"]], "metadata": null}, "dict_17": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_18": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_8": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.lightning.io.pl", "name": "TrainerContext"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_9"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_17"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_18"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "trainer_context_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='model')", {"type": "ref", "key": "mamba_model_1"}], ["Attr(name='trainer')", {"type": "ref", "key": "trainer_1"}], ["Attr(name='extra')", {"type": "ref", "key": "dict_16"}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_8"}, "paths": ["<root>"]}}, "refcounts": {"tuple_1": 1, "dict_1": 1, "dict_2": 1, "buildable_traverser_metadata_1": 1, "nemotron_hconfig47_b_1": 1, "tuple_2": 1, "dict_3": 1, "dict_4": 1, "buildable_traverser_metadata_2": 1, "optimizer_config_1": 1, "tuple_3": 1, "dict_5": 1, "dict_6": 1, "buildable_traverser_metadata_3": 1, "megatron_optimizer_module_1": 1, "tuple_4": 1, "dict_7": 1, "dict_8": 1, "buildable_traverser_metadata_4": 1, "auto_tokenizer_1": 1, "tuple_5": 1, "dict_9": 1, "dict_10": 1, "buildable_traverser_metadata_5": 1, "mamba_model_1": 1, "tuple_6": 1, "dict_11": 1, "tuple_7": 1, "dict_12": 1, "dict_13": 1, "buildable_traverser_metadata_6": 1, "megatron_strategy_1": 1, "tuple_8": 1, "dict_14": 1, "dict_15": 1, "buildable_traverser_metadata_7": 1, "trainer_1": 1, "dict_16": 1, "tuple_9": 1, "dict_17": 1, "dict_18": 1, "buildable_traverser_metadata_8": 1, "trainer_context_1": 1}, "version": "0.0.1"}
|
nemo/context/model.yaml
ADDED
|
@@ -0,0 +1,277 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_target_: nemo.collections.llm.gpt.model.ssm.MambaModel
|
| 2 |
+
config:
|
| 3 |
+
_cpu_offloading_context: null
|
| 4 |
+
_target_: nemo.collections.llm.gpt.model.ssm.NemotronHConfig47B
|
| 5 |
+
account_for_embedding_in_pipeline_split: false
|
| 6 |
+
account_for_loss_in_pipeline_split: false
|
| 7 |
+
activation_func:
|
| 8 |
+
_call_: false
|
| 9 |
+
_target_: nemo.collections.llm.gpt.model.ssm.NemotronHConfigBase.<lambda>
|
| 10 |
+
activation_func_fp8_input_store: false
|
| 11 |
+
add_bias_linear: false
|
| 12 |
+
add_qkv_bias: false
|
| 13 |
+
apply_query_key_layer_scaling: false
|
| 14 |
+
apply_residual_connection_post_layernorm: false
|
| 15 |
+
apply_rope_fusion: true
|
| 16 |
+
async_tensor_model_parallel_allreduce: false
|
| 17 |
+
attention_backend:
|
| 18 |
+
_call_: true
|
| 19 |
+
_target_: megatron.core.transformer.enums.AttnBackend
|
| 20 |
+
attention_dropout: 0.0
|
| 21 |
+
attention_softmax_in_fp32: false
|
| 22 |
+
autocast_dtype: null
|
| 23 |
+
barrier_with_L1_time: true
|
| 24 |
+
batch_p2p_comm: true
|
| 25 |
+
batch_p2p_sync: true
|
| 26 |
+
bf16: true
|
| 27 |
+
bias_activation_fusion: false
|
| 28 |
+
bias_dropout_fusion: true
|
| 29 |
+
calculate_per_token_loss: false
|
| 30 |
+
clone_scatter_output_in_embedding: true
|
| 31 |
+
config_logger_dir: ''
|
| 32 |
+
context_parallel_size: 1
|
| 33 |
+
cp_comm_type: null
|
| 34 |
+
cpu_offloading: false
|
| 35 |
+
cpu_offloading_activations: true
|
| 36 |
+
cpu_offloading_num_layers: 0
|
| 37 |
+
cpu_offloading_weights: true
|
| 38 |
+
cross_entropy_fusion_impl: native
|
| 39 |
+
cross_entropy_loss_fusion: true
|
| 40 |
+
cuda_graph_retain_backward_graph: false
|
| 41 |
+
cuda_graph_scope: full
|
| 42 |
+
cuda_graph_use_single_mempool: false
|
| 43 |
+
cuda_graph_warmup_steps: 3
|
| 44 |
+
data_step_fn:
|
| 45 |
+
_call_: false
|
| 46 |
+
_target_: nemo.collections.llm.gpt.model.base.gpt_data_step
|
| 47 |
+
deallocate_pipeline_outputs: true
|
| 48 |
+
defer_embedding_wgrad_compute: false
|
| 49 |
+
deterministic_mode: false
|
| 50 |
+
disable_parameter_transpose_cache: false
|
| 51 |
+
distribute_saved_activations: null
|
| 52 |
+
enable_autocast: false
|
| 53 |
+
enable_cuda_graph: false
|
| 54 |
+
expert_model_parallel_size: 1
|
| 55 |
+
expert_tensor_parallel_size: null
|
| 56 |
+
external_cuda_graph: false
|
| 57 |
+
ffn_hidden_size: 30720
|
| 58 |
+
finalize_model_grads_func: null
|
| 59 |
+
first_last_layers_bf16: true
|
| 60 |
+
flash_decode: false
|
| 61 |
+
forward_step_fn:
|
| 62 |
+
_call_: false
|
| 63 |
+
_target_: nemo.collections.llm.gpt.model.ssm.ssm_forward_step
|
| 64 |
+
fp16: false
|
| 65 |
+
fp16_lm_cross_entropy: false
|
| 66 |
+
fp32_residual_connection: false
|
| 67 |
+
fp8: null
|
| 68 |
+
fp8_amax_compute_algo: most_recent
|
| 69 |
+
fp8_amax_history_len: 1
|
| 70 |
+
fp8_dot_product_attention: false
|
| 71 |
+
fp8_interval: 1
|
| 72 |
+
fp8_margin: 0
|
| 73 |
+
fp8_multi_head_attention: false
|
| 74 |
+
fp8_recipe: delayed
|
| 75 |
+
fp8_wgrad: true
|
| 76 |
+
gated_linear_unit: false
|
| 77 |
+
get_attention_mask_from_fusion: false
|
| 78 |
+
grad_scale_func: null
|
| 79 |
+
grad_sync_func: null
|
| 80 |
+
gradient_accumulation_fusion: false
|
| 81 |
+
hidden_dropout: 0.0
|
| 82 |
+
hidden_size: 8192
|
| 83 |
+
hierarchical_context_parallel_sizes: null
|
| 84 |
+
hybrid_attention_ratio: 0.0
|
| 85 |
+
hybrid_mlp_ratio: 0.0
|
| 86 |
+
hybrid_override_pattern: M-M-M-M-M-M-M-M-M*-M-M-M-M-M-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-M-M---MM---M-M*-M-M-M-M-M-
|
| 87 |
+
inference_rng_tracker: false
|
| 88 |
+
init_method: null
|
| 89 |
+
init_method_std: 0.02
|
| 90 |
+
init_model_with_meta_device: false
|
| 91 |
+
is_hybrid_model: true
|
| 92 |
+
kv_channels: null
|
| 93 |
+
layernorm_epsilon: 1.0e-05
|
| 94 |
+
layernorm_zero_centered_gamma: false
|
| 95 |
+
make_vocab_size_divisible_by: 128
|
| 96 |
+
mamba_head_dim: 64
|
| 97 |
+
mamba_nheads: 256
|
| 98 |
+
mamba_num_groups: 8
|
| 99 |
+
mamba_state_dim: 256
|
| 100 |
+
mapping_type: nvidia-hybrid-nemotronh
|
| 101 |
+
masked_softmax_fusion: true
|
| 102 |
+
memory_efficient_layer_norm: false
|
| 103 |
+
microbatch_group_size_per_vp_stage: 1
|
| 104 |
+
moe_aux_loss_coeff: 0
|
| 105 |
+
moe_enable_deepep: false
|
| 106 |
+
moe_expert_capacity_factor: null
|
| 107 |
+
moe_extended_tp: false
|
| 108 |
+
moe_ffn_hidden_size: null
|
| 109 |
+
moe_grouped_gemm: false
|
| 110 |
+
moe_input_jitter_eps: null
|
| 111 |
+
moe_layer_freq: 1
|
| 112 |
+
moe_layer_recompute: false
|
| 113 |
+
moe_pad_expert_input_to_capacity: false
|
| 114 |
+
moe_per_layer_logging: false
|
| 115 |
+
moe_permute_fusion: false
|
| 116 |
+
moe_router_bias_update_rate: 0.001
|
| 117 |
+
moe_router_dtype: null
|
| 118 |
+
moe_router_enable_expert_bias: false
|
| 119 |
+
moe_router_group_topk: null
|
| 120 |
+
moe_router_load_balancing_type: aux_loss
|
| 121 |
+
moe_router_num_groups: null
|
| 122 |
+
moe_router_pre_softmax: false
|
| 123 |
+
moe_router_score_function: softmax
|
| 124 |
+
moe_router_topk: 2
|
| 125 |
+
moe_router_topk_limited_devices: null
|
| 126 |
+
moe_router_topk_scaling_factor: null
|
| 127 |
+
moe_shared_expert_intermediate_size: null
|
| 128 |
+
moe_shared_expert_overlap: false
|
| 129 |
+
moe_token_dispatcher_type: allgather
|
| 130 |
+
moe_token_drop_policy: probs
|
| 131 |
+
moe_token_dropping: false
|
| 132 |
+
moe_use_legacy_grouped_gemm: false
|
| 133 |
+
moe_z_loss_coeff: null
|
| 134 |
+
mtp_loss_scaling_factor: null
|
| 135 |
+
mtp_num_layers: null
|
| 136 |
+
multi_latent_attention: false
|
| 137 |
+
no_sync_func: null
|
| 138 |
+
normalization: RMSNorm
|
| 139 |
+
num_attention_heads: 64
|
| 140 |
+
num_layers: 98
|
| 141 |
+
num_layers_at_end_in_bf16: 1
|
| 142 |
+
num_layers_at_start_in_bf16: 1
|
| 143 |
+
num_layers_in_first_pipeline_stage: null
|
| 144 |
+
num_layers_in_last_pipeline_stage: null
|
| 145 |
+
num_microbatches_with_partial_activation_checkpoints: null
|
| 146 |
+
num_moe_experts: null
|
| 147 |
+
num_query_groups: 8
|
| 148 |
+
output_layer_init_method: null
|
| 149 |
+
overlap_p2p_comm: false
|
| 150 |
+
overlap_p2p_comm_warmup_flush: false
|
| 151 |
+
parallel_output: true
|
| 152 |
+
param_sync_func: null
|
| 153 |
+
params_dtype:
|
| 154 |
+
_call_: false
|
| 155 |
+
_target_: torch.bfloat16
|
| 156 |
+
perform_initialization: true
|
| 157 |
+
persist_layer_norm: true
|
| 158 |
+
pipeline_dtype: null
|
| 159 |
+
pipeline_model_parallel_comm_backend: null
|
| 160 |
+
pipeline_model_parallel_size: 1
|
| 161 |
+
pipeline_model_parallel_split_rank: null
|
| 162 |
+
position_embedding_type: none
|
| 163 |
+
post_process: true
|
| 164 |
+
pre_process: true
|
| 165 |
+
qk_layernorm: false
|
| 166 |
+
recompute_granularity: null
|
| 167 |
+
recompute_method: null
|
| 168 |
+
recompute_num_layers: null
|
| 169 |
+
rotary_base: 10000
|
| 170 |
+
rotary_interleaved: false
|
| 171 |
+
rotary_percent: 1.0
|
| 172 |
+
seq_len_interpolation_factor: null
|
| 173 |
+
seq_length: 8192
|
| 174 |
+
sequence_parallel: false
|
| 175 |
+
share_embeddings_and_output_weights: false
|
| 176 |
+
softmax_scale: null
|
| 177 |
+
tensor_model_parallel_size: 1
|
| 178 |
+
test_mode: false
|
| 179 |
+
timers: null
|
| 180 |
+
tokenizer_library: tiktoken
|
| 181 |
+
tokenizer_model_path: null
|
| 182 |
+
tokenizer_name: TiktokenTokenizer
|
| 183 |
+
tp_comm_atomic_ag: false
|
| 184 |
+
tp_comm_atomic_rs: false
|
| 185 |
+
tp_comm_bootstrap_backend: nccl
|
| 186 |
+
tp_comm_bulk_dgrad: true
|
| 187 |
+
tp_comm_bulk_wgrad: true
|
| 188 |
+
tp_comm_overlap: false
|
| 189 |
+
tp_comm_overlap_ag: true
|
| 190 |
+
tp_comm_overlap_disable_fc1: false
|
| 191 |
+
tp_comm_overlap_disable_qkv: false
|
| 192 |
+
tp_comm_overlap_rs: true
|
| 193 |
+
tp_comm_overlap_rs_dgrad: false
|
| 194 |
+
tp_comm_split_ag: true
|
| 195 |
+
tp_comm_split_rs: true
|
| 196 |
+
tp_only_amax_red: false
|
| 197 |
+
use_cpu_initialization: false
|
| 198 |
+
use_custom_fsdp: false
|
| 199 |
+
use_ring_exchange_p2p: false
|
| 200 |
+
use_te_rng_tracker: false
|
| 201 |
+
variable_seq_lengths: false
|
| 202 |
+
virtual_pipeline_model_parallel_size: null
|
| 203 |
+
vocab_file: null
|
| 204 |
+
vocab_size: 131072
|
| 205 |
+
wgrad_deferral_limit: 0
|
| 206 |
+
window_size: null
|
| 207 |
+
model_transform: null
|
| 208 |
+
optim:
|
| 209 |
+
_target_: nemo.lightning.pytorch.optim.megatron.MegatronOptimizerModule
|
| 210 |
+
config:
|
| 211 |
+
_target_: megatron.core.optimizer.optimizer_config.OptimizerConfig
|
| 212 |
+
adam_beta1: 0.9
|
| 213 |
+
adam_beta2: 0.999
|
| 214 |
+
adam_eps: 1.0e-08
|
| 215 |
+
barrier_with_L1_time: false
|
| 216 |
+
bf16: false
|
| 217 |
+
clip_grad: 1.0
|
| 218 |
+
config_logger_dir: ''
|
| 219 |
+
decoupled_lr: null
|
| 220 |
+
decoupled_min_lr: null
|
| 221 |
+
exp_avg_dtype:
|
| 222 |
+
_call_: false
|
| 223 |
+
_target_: torch.float32
|
| 224 |
+
exp_avg_sq_dtype:
|
| 225 |
+
_call_: false
|
| 226 |
+
_target_: torch.float32
|
| 227 |
+
fp16: false
|
| 228 |
+
hysteresis: 2
|
| 229 |
+
initial_loss_scale: 4294967296
|
| 230 |
+
log_num_zeros_in_grad: false
|
| 231 |
+
loss_scale: null
|
| 232 |
+
loss_scale_window: 1000
|
| 233 |
+
lr: 0.0001
|
| 234 |
+
main_grads_dtype:
|
| 235 |
+
_call_: false
|
| 236 |
+
_target_: torch.float32
|
| 237 |
+
main_params_dtype:
|
| 238 |
+
_call_: false
|
| 239 |
+
_target_: torch.float32
|
| 240 |
+
min_loss_scale: 1.0
|
| 241 |
+
min_lr: null
|
| 242 |
+
optimizer: adam
|
| 243 |
+
optimizer_cpu_offload: false
|
| 244 |
+
optimizer_offload_fraction: 0.0
|
| 245 |
+
overlap_cpu_optimizer_d2h_h2d: false
|
| 246 |
+
overlap_param_gather_with_optimizer_step: false
|
| 247 |
+
params_dtype:
|
| 248 |
+
_call_: false
|
| 249 |
+
_target_: torch.float32
|
| 250 |
+
pin_cpu_grads: true
|
| 251 |
+
pin_cpu_params: true
|
| 252 |
+
sgd_momentum: 0.9
|
| 253 |
+
timers: null
|
| 254 |
+
use_distributed_optimizer: true
|
| 255 |
+
use_precision_aware_optimizer: false
|
| 256 |
+
use_torch_optimizer_for_cpu_offload: false
|
| 257 |
+
weight_decay: 0.01
|
| 258 |
+
lr_mult: 1.0
|
| 259 |
+
lr_scheduler: null
|
| 260 |
+
no_weight_decay_cond: null
|
| 261 |
+
scale_lr_cond: null
|
| 262 |
+
tokenizer:
|
| 263 |
+
_target_: nemo.collections.common.tokenizers.huggingface.auto_tokenizer.AutoTokenizer
|
| 264 |
+
additional_special_tokens: []
|
| 265 |
+
bos_token: null
|
| 266 |
+
cls_token: null
|
| 267 |
+
eos_token: null
|
| 268 |
+
include_special_tokens: false
|
| 269 |
+
mask_token: null
|
| 270 |
+
merges_file: null
|
| 271 |
+
pad_token: null
|
| 272 |
+
pretrained_model_name: nemo_tokenizer
|
| 273 |
+
sep_token: null
|
| 274 |
+
trust_remote_code: true
|
| 275 |
+
unk_token: null
|
| 276 |
+
use_fast: false
|
| 277 |
+
vocab_file: null
|
nemo/context/nemo_tokenizer/special_tokens_map.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": {
|
| 3 |
+
"content": "<s>",
|
| 4 |
+
"lstrip": false,
|
| 5 |
+
"normalized": false,
|
| 6 |
+
"rstrip": false,
|
| 7 |
+
"single_word": false
|
| 8 |
+
},
|
| 9 |
+
"eos_token": {
|
| 10 |
+
"content": "</s>",
|
| 11 |
+
"lstrip": false,
|
| 12 |
+
"normalized": false,
|
| 13 |
+
"rstrip": false,
|
| 14 |
+
"single_word": false
|
| 15 |
+
},
|
| 16 |
+
"unk_token": {
|
| 17 |
+
"content": "<unk>",
|
| 18 |
+
"lstrip": false,
|
| 19 |
+
"normalized": false,
|
| 20 |
+
"rstrip": false,
|
| 21 |
+
"single_word": false
|
| 22 |
+
}
|
| 23 |
+
}
|
nemo/context/nemo_tokenizer/tokenizer.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3277c00fe5fb3963b3cb7c07b7f183722d2af4d775a4aea7cfb3684d7cccbc2f
|
| 3 |
+
size 17078330
|
nemo/context/nemo_tokenizer/tokenizer_config.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
nemo/weights/.metadata
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5c351cb6da43f06c2cbea80f6b64247dacaee17e4a7b0baee8c664709307b0e7
|
| 3 |
+
size 425519
|
nemo/weights/__0_0.distcp
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:87418abcf047f4dd5aa2924d8de88ccff8225e555249043a8916206ceebb17da
|
| 3 |
+
size 46792303844
|
nemo/weights/__0_1.distcp
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9270a49378b0de2284768e37136f913abffd74caa9cf0684b0455e22d89704f7
|
| 3 |
+
size 46792361482
|
nemo/weights/common.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bf2d72b4f4fef17ea0a99868f378f27ac9989452f3ffdd0691821ae1a3b9f285
|
| 3 |
+
size 3099
|
nemo/weights/metadata.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"sharded_backend": "torch_dist", "sharded_backend_version": 1, "common_backend": "torch", "common_backend_version": 1}
|