Trying out Apertus 8B in Colab.
I'm trying to run inference with Apertus 8B Instruct in Colab using vLLM, with no success. The following code works perfectly with LLaMA and Qwen. I'm using an A100 GPU. Any help would be highly appreciated!
CODE:
!sudo apt-get install git-lfs
!pip install transformers seqeval[gpu]
!pip install datasets
!pip install --upgrade --force-reinstall --no-cache-dir triton vllm protobuf==3.20.3
import numpy as np
import pandas as pd
import torch
import json
import os
from vllm import LLM, SamplingParams
os.environ['CUDA_VISIBLE_DEVICES']="0"
base_model_name= "swiss-ai/Apertus-8B-Instruct-2509" #"meta-llama/Llama-3.1-8B-Instruct"
merged_peft_model_name= "swiss-ai/Apertus-8B-Instruct-2509" #"meta-llama/Llama-3.1-8B-Instruct" #
llm = LLM(model=merged_peft_model_name, tokenizer=base_model_name, gpu_memory_utilization=0.65, max_model_len=3000)
ERROR:
INFO 09-03 15:45:24 [utils.py:326] non-default args: {'model': 'swiss-ai/Apertus-8B-Instruct-2509', 'max_model_len': 3000, 'gpu_memory_utilization': 0.5, 'disable_log_stats': True}
INFO 09-03 15:45:25 [__init__.py:711] Resolved architecture: TransformersForCausalLM
INFO 09-03 15:45:25 [__init__.py:1750] Using max model len 3000
INFO 09-03 15:45:25 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=8192.
RuntimeError Traceback (most recent call last)
/tmp/ipython-input-503133736.py in <cell line: 0>()
2 base_model_name= "swiss-ai/Apertus-8B-Instruct-2509" #"meta-llama/Llama-3.1-8B-Instruct"
3 merged_peft_model_name= "swiss-ai/Apertus-8B-Instruct-2509" #"meta-llama/Llama-3.1-8B-Instruct" #
----> 4 llm = LLM(model=merged_peft_model_name, gpu_memory_utilization = 0.5, max_model_len = 3000) #, tokenizer=base_model_name , gpu_memory_utilization = 0.65, max_model_len = 3000
9 frames
/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/llm.py in __init__(self, model, runner, convert, tokenizer, tokenizer_mode, skip_tokenizer_init, trust_remote_code, allowed_local_media_path, tensor_parallel_size, dtype, quantization, revision, tokenizer_revision, seed, gpu_memory_utilization, swap_space, cpu_offload_gb, enforce_eager, max_seq_len_to_capture, disable_custom_all_reduce, disable_async_output_proc, hf_token, hf_overrides, mm_processor_kwargs, override_pooler_config, compilation_config, logits_processors, **kwargs)
283
284 # Create the Engine (autoselects V0 vs V1)
--> 285 self.llm_engine = LLMEngine.from_engine_args(
286 engine_args=engine_args, usage_context=UsageContext.LLM_CLASS)
287 self.engine_class = type(self.llm_engine)
/usr/local/lib/python3.12/dist-packages/vllm/engine/llm_engine.py in from_engine_args(cls, engine_args, usage_context, stat_loggers)
488 engine_cls = V1LLMEngine
489
--> 490 return engine_cls.from_vllm_config(
491 vllm_config=vllm_config,
492 usage_context=usage_context,
/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/llm_engine.py in from_vllm_config(cls, vllm_config, usage_context, stat_loggers, disable_log_stats)
125 disable_log_stats: bool = False,
126 ) -> "LLMEngine":
--> 127 return cls(vllm_config=vllm_config,
128 executor_class=Executor.get_class(vllm_config),
129 log_stats=(not disable_log_stats),
/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/llm_engine.py in __init__(self, vllm_config, executor_class, log_stats, usage_context, stat_loggers, mm_registry, use_cached_outputs, multiprocess_mode)
102
103 # EngineCore (gets EngineCoreRequests and gives EngineCoreOutputs)
--> 104 self.engine_core = EngineCoreClient.make_client(
105 multiprocess_mode=multiprocess_mode,
106 asyncio_mode=False,
/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py in make_client(multiprocess_mode, asyncio_mode, vllm_config, executor_class, log_stats)
78
79 if multiprocess_mode and not asyncio_mode:
---> 80 return SyncMPClient(vllm_config, executor_class, log_stats)
81
82 return InprocClient(vllm_config, executor_class, log_stats)
/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py in __init__(self, vllm_config, executor_class, log_stats)
598 def __init__(self, vllm_config: VllmConfig, executor_class: type[Executor],
599 log_stats: bool):
--> 600 super().__init__(
601 asyncio_mode=False,
602 vllm_config=vllm_config,
/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py in __init__(self, asyncio_mode, vllm_config, executor_class, log_stats, client_addresses)
444 else:
445 # Engines are managed by this client.
--> 446 with launch_core_engines(vllm_config, executor_class,
447 log_stats) as (engine_manager,
448 coordinator,
/usr/lib/python3.12/contextlib.py in __exit__(self, typ, value, traceback)
142 if typ is None:
143 try:
--> 144 next(self.gen)
145 except StopIteration:
146 return False
/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/utils.py in launch_core_engines(vllm_config, executor_class, log_stats, num_api_servers)
704
705 # Now wait for engines to start.
--> 706 wait_for_engine_startup(
707 handshake_socket,
708 addresses,
/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/utils.py in wait_for_engine_startup(handshake_socket, addresses, core_engines, parallel_config, cache_config, proc_manager, coord_process)
757 if coord_process is not None and coord_process.exitcode is not None:
758 finished[coord_process.name] = coord_process.exitcode
--> 759 raise RuntimeError("Engine core initialization failed. "
760 "See root cause above. "
761 f"Failed core proc(s): {finished}")
RuntimeError: Engine core initialization failed. See root cause above. Failed core proc(s): {}
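Note: the actual root cause is raised inside the engine core subprocess, so it is not visible in the traceback above. A minimal sketch to surface it directly in the notebook, assuming this vLLM build still honors the VLLM_ENABLE_V1_MULTIPROCESSING environment variable, is to force the in-process engine client:
import os
os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"  # assumption: supported by this vLLM build; runs the engine core in-process
from vllm import LLM
llm = LLM(model="swiss-ai/Apertus-8B-Instruct-2509", gpu_memory_utilization=0.65, max_model_len=3000)  # the real exception now propagates into the notebook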
This is likely not using the latest transformers version. Can you try updating?
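For reference, a minimal sketch of such an update in Colab (assuming the latest PyPI release of transformers already includes the Apertus architecture; otherwise a source install is needed):
!pip install --upgrade transformers
import transformers
print(transformers.__version__)  # restart the Colab runtime if an older version is still loaded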
@mjaggi Running this code is taking too much time:
!pip install "git+https://github.com/vllm-project/vllm.git@main"
!pip install "git+https://github.com/huggingface/transformers.git@main"
!pip install datasets seqeval[gpu] protobuf==3.20.3
import os
from vllm import LLM, SamplingParams
os.environ['CUDA_VISIBLE_DEVICES']="0"
base_model_name = "swiss-ai/Apertus-8B-Instruct-2509"
llm = LLM(
model=base_model_name,
gpu_memory_utilization=0.65,
max_model_len=3000,
trust_remote_code=True  # <- allow loading custom model code from the Hub
)
sampling_params = SamplingParams(temperature=0.7, top_p=0.9, max_tokens=128)
output = llm.generate("Hello! Can you explain what Apertus is about?", sampling_params)
print(output[0].outputs[0].text)
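If the source builds are what's slow, one option (a sketch, not from the thread) is to first check whether the already-installed versions are recent enough before building from git:
import transformers, vllm
print("transformers:", transformers.__version__)
print("vllm:", vllm.__version__)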