Update README.md
Browse files
README.md
CHANGED
|
@@ -26,13 +26,90 @@ Calibration uses natural top-k routing rather than forcing all experts to activa
|
|
| 26 |
|
| 27 |
Samples were drawn from a diverse mix of publicly available datasets spanning code generation, function/tool calling, multi-turn reasoning, math, and multilingual (English + Chinese) instruction following. System prompts were randomly varied across samples. The dataset was designed to broadly exercise the model's capabilities and activate diverse token distributions across expert modules.
|
| 28 |
|
| 29 |
-
###
|
| 30 |
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
-
#### SGLang
|
| 36 |
|
| 37 |
Tested on 2x and 4x RTX Pro 6000 Blackwell.
|
| 38 |
```
|
|
@@ -62,8 +139,4 @@ Tested on 2x and 4x RTX Pro 6000 Blackwell.
|
|
| 62 |
--host 0.0.0.0 --port 5000
|
| 63 |
```
|
| 64 |
|
| 65 |
-
#### vLLM
|
| 66 |
-
|
| 67 |
-
(pending)
|
| 68 |
-
|
| 69 |
```
|
|
|
|
| 26 |
|
| 27 |
Samples were drawn from a diverse mix of publicly available datasets spanning code generation, function/tool calling, multi-turn reasoning, math, and multilingual (English + Chinese) instruction following. System prompts were randomly varied across samples. The dataset was designed to broadly exercise the model's capabilities and activate diverse token distributions across expert modules.
|
| 28 |
|
| 29 |
+
### Running
|
| 30 |
|
| 31 |
+
```
|
| 32 |
+
exec docker run \
|
| 33 |
+
--name sglang-m27a \
|
| 34 |
+
--ipc=host \
|
| 35 |
+
--shm-size=12g \
|
| 36 |
+
--network=host \
|
| 37 |
+
--cpuset-cpus=0-31 \
|
| 38 |
+
--ulimit memlock=-1 \
|
| 39 |
+
--ulimit stack=67108864 \
|
| 40 |
+
--ulimit nofile=1048576:1048576 \
|
| 41 |
+
--restart unless-stopped \
|
| 42 |
+
-e SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=1 \
|
| 43 |
+
-e SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1 \
|
| 44 |
+
-e SGLANG_CUSTOM_ALLREDUCE_ALGO=oneshot \
|
| 45 |
+
-e SGLANG_DISABLE_FA4_WARMUP=1 \
|
| 46 |
+
-e SGLANG_CHUNKED_PREFIX_CACHE_THRESHOLD=4096 \
|
| 47 |
+
-e SGLANG_ENABLE_JIT_DEEPGEMM=0 \
|
| 48 |
+
-e NCCL_IB_DISABLE=1 \
|
| 49 |
+
-e NCCL_P2P_DISABLE=1 \
|
| 50 |
+
-e NCCL_NVLS_ENABLE=0 \
|
| 51 |
+
-e NCCL_CUMEM_ENABLE=0 \
|
| 52 |
+
-e NCCL_P2P_LEVEL=SYS \
|
| 53 |
+
-e B12X_MOE_FORCE_A16=1 \
|
| 54 |
+
-e NCCL_ALLOC_P2P_NET_LL_BUFFERS=1 \
|
| 55 |
+
-e NCCL_MIN_NCHANNELS=8 \
|
| 56 |
+
-e NCCL_SOCKET_NTHREADS=4 \
|
| 57 |
+
-e NCCL_NSOCKS_PERTHREAD=2 \
|
| 58 |
+
-e NCCL_BUFFSIZE=16777216 \
|
| 59 |
+
-e TORCH_NCCL_AVOID_RECORD_STREAMS=1 \
|
| 60 |
+
-e OMP_NUM_THREADS=16 \
|
| 61 |
+
-e MKL_NUM_THREADS=16 \
|
| 62 |
+
-e OPENBLAS_NUM_THREADS=16 \
|
| 63 |
+
-e NUMEXPR_NUM_THREADS=16 \
|
| 64 |
+
-e TOKENIZERS_PARALLELISM=false \
|
| 65 |
+
-e CUDA_DEVICE_MAX_CONNECTIONS=1 \
|
| 66 |
+
-e CUDA_MODULE_LOADING=LAZY \
|
| 67 |
+
-e SAFETENSORS_FAST_GPU=1 \
|
| 68 |
+
-e TRITON_CACHE_DIR=/cache/triton \
|
| 69 |
+
-e TORCH_COMPILE_DEBUG=0 \
|
| 70 |
+
-e HF_HUB_ENABLE_HF_TRANSFER=1 \
|
| 71 |
+
-e HF_HOME=/root/.cache/huggingface \
|
| 72 |
+
-e TRANSFORMERS_OFFLINE=1 \
|
| 73 |
+
-v ~/.cache/huggingface:/root/.cache/huggingface \
|
| 74 |
+
llm-sglang-blackwell:cu130 \
|
| 75 |
+
python -m sglang.launch_server \
|
| 76 |
+
--grammar-backend none \
|
| 77 |
+
--model lukealonso/MiniMax-M2.7-NVFP4 \
|
| 78 |
+
--served-model-name MiniMax-M2.7 \
|
| 79 |
+
--tensor-parallel-size 2 \
|
| 80 |
+
--quantization modelopt_fp4 \
|
| 81 |
+
--kv-cache-dtype bfloat16 \
|
| 82 |
+
--dtype auto \
|
| 83 |
+
--prefill-max-requests 4 \
|
| 84 |
+
--stream-interval 16 \
|
| 85 |
+
--load-format safetensors \
|
| 86 |
+
--trust-remote-code \
|
| 87 |
+
--context-length 196608 \
|
| 88 |
+
--mem-fraction-static 0.93 \
|
| 89 |
+
--chunked-prefill-size 4096 \
|
| 90 |
+
--max-prefill-tokens 4096 \
|
| 91 |
+
--disable-radix-cache \
|
| 92 |
+
--schedule-conservativeness 0.40 \
|
| 93 |
+
--max-running-requests 8 \
|
| 94 |
+
--cuda-graph-max-bs 8 \
|
| 95 |
+
--sampling-backend flashinfer \
|
| 96 |
+
--cuda-graph-bs 1 2 4 6 8 \
|
| 97 |
+
--num-continuous-decode-steps 4 \
|
| 98 |
+
--enable-mixed-chunk \
|
| 99 |
+
--attention-backend flashinfer \
|
| 100 |
+
--moe-runner-backend b12x \
|
| 101 |
+
--fp4-gemm-backend b12x \
|
| 102 |
+
--enable-pcie-oneshot-allreduce \
|
| 103 |
+
--pcie-oneshot-allreduce-max-size 8388608 \
|
| 104 |
+
--tool-call-parser minimax-m2 \
|
| 105 |
+
--reasoning-parser minimax-append-think \
|
| 106 |
+
--host 127.0.0.1 \
|
| 107 |
+
--hicache-size 36 \
|
| 108 |
+
--hicache-io-backend kernel \
|
| 109 |
+
--hicache-mem-layout page_first \
|
| 110 |
+
--hicache-write-policy write_through_selective
|
| 111 |
+
```
|
| 112 |
|
|
|
|
| 113 |
|
| 114 |
Tested on 2x and 4x RTX Pro 6000 Blackwell.
|
| 115 |
```
|
|
|
|
| 139 |
--host 0.0.0.0 --port 5000
|
| 140 |
```
|
| 141 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
```
|