WildOjisan committed
Commit 0af9b33 · 1 Parent(s): fcdd40a
Files changed (5)
  1. .dockerignore +10 -0
  2. Dockerfile +26 -0
  3. main.py +155 -0
  4. requirements.txt +15 -0
  5. simplerequest.txt +13 -0
.dockerignore ADDED
@@ -0,0 +1,10 @@
+ .venv
+ __pycache__/
+ *.pyc
+ *.pyo
+ *.pyd
+ *.log
+ .git
+ .gitignore
+ .cache
+ /root/.cache
Dockerfile ADDED
@@ -0,0 +1,26 @@
+ # Dockerfile
+ FROM python:3.11-slim
+
+ RUN apt-get update && apt-get install -y --no-install-recommends \
+     libglib2.0-0 libgl1 && \
+     rm -rf /var/lib/apt/lists/*
+
+ # ✅ Force the cache/token paths to /data
+ ENV HF_HOME=/data \
+     TRANSFORMERS_CACHE=/data/transformers \
+     HF_HUB_CACHE=/data/hub \
+     HF_HUB_DISABLE_TELEMETRY=1 \
+     TOKENIZERS_PARALLELISM=false \
+     PYTHONUNBUFFERED=1 \
+     PYTHONDONTWRITEBYTECODE=1
+
+ # ✅ Create the cache directories and make them writable
+ RUN mkdir -p /data/transformers /data/hub && chmod -R 777 /data
+
+ WORKDIR /app
+ COPY requirements.txt /app/requirements.txt
+ RUN pip install --no-cache-dir -r /app/requirements.txt
+ COPY . /app
+
+ EXPOSE 7860
+ CMD ["sh", "-c", "uvicorn main:app --host 0.0.0.0 --port ${PORT:-7860} --workers 1"]
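The ENV block above only helps if the process can actually write to those paths at runtime. A minimal sanity-check sketch (hypothetical, not part of this commit) that can be run inside the container to confirm the cache directories are writable:

import os

# Verify each Hugging Face cache variable points at a writable directory.
for var in ("HF_HOME", "TRANSFORMERS_CACHE", "HF_HUB_CACHE"):
    path = os.environ.get(var)
    writable = bool(path) and os.path.isdir(path) and os.access(path, os.W_OK)
    print(f"{var}={path} writable={writable}")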
main.py ADDED
@@ -0,0 +1,155 @@
+ import os
+ import json
+ import threading
+ from typing import List, Dict, Iterator
+
+ import torch
+ from fastapi import FastAPI, Body
+ from fastapi.responses import StreamingResponse
+ from pydantic import BaseModel, Field
+ from transformers import (
+     AutoTokenizer,
+     AutoModelForCausalLM,
+     TextIteratorStreamer,
+ )
+ # from peft import PeftModel  # removed: LoRA is not used
+
+ # ----------------- Environment defaults -----------------
+ # Point the Hugging Face cache/token paths at a writable location (/data is safe on Spaces)
+ os.environ["HF_HOME"] = "/data"
+ os.environ["TRANSFORMERS_CACHE"] = "/data/transformers"
+ os.environ["HF_HUB_CACHE"] = "/data/hub"
+ os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
+ os.environ.setdefault("PYTORCH_ENABLE_MPS_FALLBACK", "1")
+
+ # CPU-only default: 4-bit disabled, float32 (granite-1b has no documented 4-bit support, so use float32 instead of float16/bfloat16)
+ USE_4BIT = False
+ COMPUTE_DTYPE = torch.float32
+
+ # Base model (switched to a granite model)
+ MODEL_ID = "ibm-granite/granite-4.0-h-1b"
+ # ADAPTER_ID is not used
+
+ # Device selection: check whether a GPU is available
+ if torch.cuda.is_available():
+     device_map = "cuda"
+     COMPUTE_DTYPE = torch.bfloat16  # bfloat16 on GPU saves memory and improves throughput (float32 also works)
+ elif torch.backends.mps.is_available():
+     device_map = "mps"
+     COMPUTE_DTYPE = torch.float32
+ else:
+     device_map = "cpu"
+     COMPUTE_DTYPE = torch.float32
+
+ # Thread count
+ try:
+     torch.set_num_threads(max(1, os.cpu_count() or 1))
+ except Exception:
+     pass
+
+ # ----------------- Load -----------------
+ print(f"[BOOT] Base: {MODEL_ID}")
+ print(f"[BOOT] Device: {device_map}")
+ print(f"[BOOT] DType: {COMPUTE_DTYPE}")
+
+ # Load the tokenizer
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=False, trust_remote_code=True)
+ print("[BOOT] Tokenizer loaded.")
+
+ # Ensure a pad token exists (granite models may need this)
+ if tokenizer.pad_token is None:
+     tokenizer.pad_token = tokenizer.eos_token
+
+ # Load the base model
+ model = AutoModelForCausalLM.from_pretrained(
+     MODEL_ID,
+     device_map=device_map,
+     trust_remote_code=True,
+     torch_dtype=COMPUTE_DTYPE,
+     low_cpu_mem_usage=True if device_map == "cpu" else False,
+ )
+ model.eval()
+
+ # ----------------- API schema / app -----------------
+ class ChatMessage(BaseModel):
+     role: str = Field(..., description="system | user | assistant")
+     content: str
+
+ class ChatRequest(BaseModel):
+     messages: List[ChatMessage]
+     max_new_tokens: int = 128
+     temperature: float = 0.7
+     top_p: float = 0.9
+     repetition_penalty: float = 1.1
+
+ class ChatResponse(BaseModel):
+     text: str
+
+ app = FastAPI(title="IBM Granite 4.0 H 1B API")
+
+ @app.get("/")
+ def health():
+     return {"status": "ok", "base": MODEL_ID, "device": device_map, "use_4bit": USE_4BIT}
+
+ def build_prompt(messages: List[Dict[str, str]]) -> str:
+     # apply_chat_template accepts the list of {"role", "content"} dicts directly;
+     # the Conversation wrapper was removed from recent transformers releases
+     return tokenizer.apply_chat_template(
+         messages,
+         tokenize=False,
+         add_generation_prompt=True,
+     )
+
+ @app.post("/v1/chat", response_model=ChatResponse)
+ def chat(req: ChatRequest):
+     prompt = build_prompt([m.model_dump() for m in req.messages])
+     inputs = tokenizer(prompt, return_tensors="pt")
+     # Move the inputs to the model's device
+     inputs = {k: v.to(model.device) for k, v in inputs.items()}
+
+     with torch.no_grad():
+         output_ids = model.generate(
+             **inputs,
+             max_new_tokens=req.max_new_tokens,
+             do_sample=True,
+             temperature=req.temperature,
+             top_p=req.top_p,
+             repetition_penalty=req.repetition_penalty,
+             eos_token_id=tokenizer.eos_token_id,
+             pad_token_id=tokenizer.eos_token_id,
+         )
+
+     # Decode only the generated text, excluding the input tokens
+     text = tokenizer.decode(output_ids[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
+     return ChatResponse(text=text)
+
+ def stream_generate(req: ChatRequest) -> Iterator[str]:
+     prompt = build_prompt([m.model_dump() for m in req.messages])
+     inputs = tokenizer(prompt, return_tensors="pt")
+     inputs = {k: v.to(model.device) for k, v in inputs.items()}
+     # skip_prompt=True keeps the prompt out of the stream
+     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+
+     gen_kwargs = dict(
+         **inputs,
+         max_new_tokens=req.max_new_tokens,
+         do_sample=True,
+         temperature=req.temperature,
+         top_p=req.top_p,
+         repetition_penalty=req.repetition_penalty,
+         eos_token_id=tokenizer.eos_token_id,
+         pad_token_id=tokenizer.eos_token_id,
+         streamer=streamer,
+     )
+
+     # Run generation in a background thread so tokens can be consumed as they arrive
+     thread = threading.Thread(target=model.generate, kwargs=gen_kwargs)
+     thread.start()
+
+     for token_text in streamer:
+         # Emit one JSON object per line (NDJSON); json.dumps guarantees valid JSON escaping
+         yield json.dumps({"delta": token_text}) + "\n"
+
+ @app.post("/v1/chat/stream")
+ def chat_stream(req: ChatRequest = Body(...)):
+     return StreamingResponse(stream_generate(req), media_type="application/x-ndjson")
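Since /v1/chat/stream returns application/x-ndjson with one {"delta": ...} object per line, a client can print tokens as they arrive. A minimal consumer sketch using the requests library (the localhost URL and port are assumptions, not part of this commit):

import json
import requests

payload = {
    "messages": [{"role": "user", "content": "Briefly introduce yourself"}],
    "max_new_tokens": 128,
}
# stream=True keeps the connection open; iter_lines yields each NDJSON record.
with requests.post("http://localhost:7860/v1/chat/stream", json=payload, stream=True) as resp:
    for line in resp.iter_lines():
        if line:
            print(json.loads(line)["delta"], end="", flush=True)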
requirements.txt ADDED
@@ -0,0 +1,15 @@
+ torch==2.2.2
+ transformers>=4.45.2
+ tokenizers>=0.20.1
+ accelerate>=0.34.2
+ safetensors>=0.4.5
+ huggingface_hub>=0.24.6
+ einops>=0.8.0
+ sentencepiece>=0.1.99
+ protobuf>=4.25.3
+
+ fastapi>=0.112
+ uvicorn[standard]>=0.30
+ peft>=0.11.1
+ unsloth
+ bitsandbytes==0.43.3
simplerequest.txt ADDED
@@ -0,0 +1,13 @@
+ $body = @{
+     messages = @(
+         @{ role = "system"; content = "" },
+         @{ role = "user"; content = "Briefly introduce yourself" }
+     )
+     max_new_tokens = 128
+     temperature = 0.7
+ } | ConvertTo-Json -Depth 3
+
+ Invoke-RestMethod -Uri https://wildojisan-qwen2-5-1-5b-instruct-basic-test.hf.space/v1/chat `
+     -Method POST `
+     -ContentType "application/json" `
+     -Body $body
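For environments without PowerShell, a roughly equivalent request in Python (a sketch assuming the requests package is installed; same Space URL as above):

import requests

body = {
    "messages": [
        {"role": "system", "content": ""},
        {"role": "user", "content": "Briefly introduce yourself"},
    ],
    "max_new_tokens": 128,
    "temperature": 0.7,
}
resp = requests.post(
    "https://wildojisan-qwen2-5-1-5b-instruct-basic-test.hf.space/v1/chat",
    json=body,
)
# The non-streaming endpoint returns {"text": ...}
print(resp.json()["text"])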