| import os, torch, gradio as gr |
| from threading import Thread |
| from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer |
|
|
# Opt in to hf_transfer-accelerated downloads unless the environment already
# set a value for this variable.
# NOTE(review): this runs after `transformers` and `gradio` were imported;
# confirm huggingface_hub still picks the variable up when it is set this late.
if "HF_HUB_ENABLE_HF_TRANSFER" not in os.environ:
    os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

# Hub repository that provides both the tokenizer and the model weights.
MODEL_ID = "TildeAI/TildeOpen-30b"

# `use_fast=False` asks for the non-"fast" tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=False)

# Load the weights as bfloat16 and let `device_map="auto"` decide placement.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

# Allow TF32 for CUDA matmuls.
torch.backends.cuda.matmul.allow_tf32 = True

# Preamble placed at the very top of every prompt built by `build_prompt`.
SYS = (
    "You are a helpful multilingual assistant. "
    + "This is a *base* model (not instruction tuned); follow the user's request precisely."
)
|
|
def build_prompt(history, user_msg, system=None):
    """Render the conversation as a plain-text prompt for a base model.

    The prompt is the system preamble, a blank line, one ``User: ...`` /
    ``Assistant: ...`` line per past turn, the new user message, and a
    trailing ``Assistant:`` cue for the model to continue from.

    Args:
        history: Earlier turns. Each item is either a ``(user, assistant)``
            pair or a ``{"role": ..., "content": ...}`` dict. ``None`` or an
            empty sequence means there are no earlier turns.
        user_msg: The newest user message.
        system: Preamble text. Defaults to the module-level ``SYS``.

    Returns:
        The prompt as a single newline-joined string.
    """
    if system is None:
        system = SYS

    role_labels = {"user": "User", "assistant": "Assistant"}
    parts = [system, ""]
    for turn in history or ():
        if isinstance(turn, dict):
            # Messages-style entry: a single utterance tagged with its role.
            # Roles other than user/assistant have no label and are skipped.
            utterances = [(role_labels.get(turn.get("role")), turn.get("content"))]
        else:
            # Pair-style entry: one user message plus the matching reply.
            user_text, assistant_text = turn
            utterances = [("User", user_text), ("Assistant", assistant_text)]
        for label, text in utterances:
            # A missing side (e.g. a reply that does not exist yet) must not
            # be rendered as the literal string "None".
            if label is not None and text is not None:
                parts.append(f"{label}: {text}")

    parts += [f"User: {user_msg}", "Assistant:"]
    return "\n".join(parts)
|
|
def chat_fn(message, history):
    """Stream the model's reply to ``message`` for the chat UI.

    Builds the text prompt with ``build_prompt``, runs ``model.generate`` on
    a worker thread, and yields the reply accumulated so far every time the
    streamer delivers a new chunk.

    Args:
        message: The newest user message.
        history: Earlier turns, forwarded unchanged to ``build_prompt``.

    Yields:
        The reply text generated so far; it grows with each chunk.

    Raises:
        Exception: Whatever ``model.generate`` raised on the worker thread,
            re-raised here after the stream has been drained.
    """
    prompt = build_prompt(history, message)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # skip_prompt=True keeps the echoed prompt out of the streamed text.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    # NOTE(review): no stop sequence is configured, so generation only ends at
    # EOS or max_new_tokens; a base model may keep writing further "User:"
    # turns -- consider adding a stop condition.
    gen_kwargs = dict(
        **inputs,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.1,
        streamer=streamer,
    )

    failures = []

    def _generate():
        # Run generation; on failure remember the error and close the stream
        # so the consuming loop below cannot block forever.
        try:
            model.generate(**gen_kwargs)
        except Exception as exc:
            failures.append(exc)
            streamer.end()

    worker = Thread(target=_generate)
    worker.start()

    partial = ""
    for chunk in streamer:
        partial += chunk
        yield partial

    worker.join()
    if failures:
        # Surface the worker's error to the caller instead of losing it.
        raise failures[0]
|
|
# Chat UI wired to the streaming `chat_fn` generator.
_interface_options = {
    "fn": chat_fn,
    "title": "TildeOpen-30B (Transformers, BF16)",
    "description": "Base model; multilingual. If build fails with OOM, switch to Option B (GGUF).",
}
demo = gr.ChatInterface(**_interface_options)

# Enable the request queue, then start serving.
queued_demo = demo.queue()
queued_demo.launch()
|
|