Files changed (1)
  1. app.py +75 -72
app.py CHANGED
@@ -1,73 +1,76 @@
- import gradio as gr
- from peft import PeftModel
- from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
- import torch
- import os
- os.environ["CUDA_VISIBLE_DEVICES"] = "" # Disable CUDA for Gradio
-
- # Model and tokenizer loading
- model_name = "microsoft/phi-2" # Replace with your base model name
- adapter_path = "./checkpoint-500" # Path to your adapter directory (relative to app.py)
-
- bnb_config = BitsAndBytesConfig(
-     load_in_4bit=True,
-     bnb_4bit_quant_type="nf4",
-     bnb_4bit_compute_dtype=torch.float16,
- )
-
- tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
- tokenizer.pad_token = tokenizer.eos_token
-
- model = AutoModelForCausalLM.from_pretrained(
-     model_name,
-     device_map="auto",
-     quantization_config=bnb_config,
-     trust_remote_code=True,
-     torch_dtype=torch.float16, # Use float16 for faster inference
- )
- model = PeftModel.from_pretrained(model, adapter_path)
- model.eval()
-
- # Inference function
- def generate_text(prompt, max_length=200, temperature=0.7, top_p=0.9):
-     input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
-     attention_mask = tokenizer(prompt, return_tensors="pt").attention_mask.to(model.device)
-
-     with torch.no_grad():
-         outputs = model.generate(
-             input_ids=input_ids,
-             attention_mask=attention_mask,
-             max_length=max_length,
-             temperature=temperature,
-             top_p=top_p,
-             do_sample=True,
-             pad_token_id=tokenizer.pad_token_id,
-         )
-
-     generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
-     return generated_text
-
- # Sample questions
- sample_questions = [
-     "Write a short story about a dog who becomes a detective.",
-     "What is 2+2?",
-     "Write a Flask App in python to say 'Hello World!'",
-     "Give me a short 200-word essay on 'monospony'.",
- ]
-
- # Gradio interface
- iface = gr.Interface(
-     fn=generate_text,
-     inputs=[
-         gr.Textbox(lines=5, label="Prompt"),
-         gr.Slider(minimum=50, maximum=500, value=250, label="Max Length"),
-         gr.Slider(minimum=0.1, maximum=1.0, value=0.1, label="Temperature"),
-         gr.Slider(minimum=0.1, maximum=1.0, value=0.9, label="Top P"),
-     ],
-     outputs=gr.Textbox(label="Generated Text"),
-     title="Phi-2 OASST Fine-Tuning Demo",
-     description="Generate text using a fine-tuned Phi-2 model with PEFT adapters. Click a sample question below to get started!",
-     examples=[[q, 250, 0.1, 0.9] for q in sample_questions], # Add examples
- )
-
+ import gradio as gr
+ from peft import PeftModel
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+ import torch
+ import os
+ os.environ["CUDA_VISIBLE_DEVICES"] = "" # Disable CUDA for Gradio
+ os.environ["TRANSFORMERS_CACHE"] = "/tmp/huggingface_cache"
+ os.environ["HF_HOME"] = "/tmp/huggingface_home"
+
+
+ # Model and tokenizer loading
+ model_name = "microsoft/phi-2" # Replace with your base model name
+ adapter_path = "./checkpoint-500" # Path to your adapter directory (relative to app.py)
+
+ bnb_config = BitsAndBytesConfig(
+     load_in_4bit=True,
+     bnb_4bit_quant_type="nf4",
+     bnb_4bit_compute_dtype=torch.float16,
+ )
+
+ tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+ tokenizer.pad_token = tokenizer.eos_token
+
+ model = AutoModelForCausalLM.from_pretrained(
+     model_name,
+     device_map="auto",
+     quantization_config=bnb_config,
+     trust_remote_code=True,
+     torch_dtype=torch.float16, # Use float16 for faster inference
+ )
+ model = PeftModel.from_pretrained(model, adapter_path)
+ model.eval()
+
+ # Inference function
+ def generate_text(prompt, max_length=200, temperature=0.7, top_p=0.9):
+     input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
+     attention_mask = tokenizer(prompt, return_tensors="pt").attention_mask.to(model.device)
+
+     with torch.no_grad():
+         outputs = model.generate(
+             input_ids=input_ids,
+             attention_mask=attention_mask,
+             max_length=max_length,
+             temperature=temperature,
+             top_p=top_p,
+             do_sample=True,
+             pad_token_id=tokenizer.pad_token_id,
+         )
+
+     generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+     return generated_text
+
+ # Sample questions
+ sample_questions = [
+     "Write a short story about a dog who becomes a detective.",
+     "What is 2+2?",
+     "Write a Flask App in python to say 'Hello World!'",
+     "Give me a short 200-word essay on 'monospony'.",
+ ]
+
+ # Gradio interface
+ iface = gr.Interface(
+     fn=generate_text,
+     inputs=[
+         gr.Textbox(lines=5, label="Prompt"),
+         gr.Slider(minimum=50, maximum=500, value=250, label="Max Length"),
+         gr.Slider(minimum=0.1, maximum=1.0, value=0.1, label="Temperature"),
+         gr.Slider(minimum=0.1, maximum=1.0, value=0.9, label="Top P"),
+     ],
+     outputs=gr.Textbox(label="Generated Text"),
+     title="Phi-2 OASST Fine-Tuning Demo",
+     description="Generate text using a fine-tuned Phi-2 model with PEFT adapters. Click a sample question below to get started!",
+     examples=[[q, 250, 0.1, 0.9] for q in sample_questions], # Add examples
+ )
+
  iface.launch()
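
The only substantive change above is the pair of cache variables on new lines 7-8. On hosts where the default Hugging Face cache under the home directory is not writable (a common situation for Hugging Face Spaces and similar sandboxed runtimes), model downloads fail unless the cache is redirected to a writable path, and the variables need to be set before transformers is first imported so the new location is picked up when the library initializes. A minimal standalone sketch of the same idea, assuming any writable directory (the /tmp paths below are only illustrative):

import os

# Redirect Hugging Face caches to a writable location *before* importing transformers.
os.environ["HF_HOME"] = "/tmp/huggingface_home"              # root for hub downloads, tokens, etc.
os.environ["TRANSFORMERS_CACHE"] = "/tmp/huggingface_cache"  # older variable, still read by transformers 4.x

from transformers import AutoTokenizer

# Downloads now land under the /tmp paths instead of ~/.cache/huggingface.
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2", trust_remote_code=True)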