# inference.py
# Install dependencies (the "!" prefix runs shell commands in a Jupyter/Colab cell)
!pip install pip3-autoremove
!pip install torch torchvision torchaudio xformers --index-url https://download.pytorch.org/whl/cu128
!pip install unsloth
!pip install transformers==4.56.2
!pip install -q huggingface_hub

# Load model
from unsloth import FastLanguageModel
from transformers import TextStreamer
from huggingface_hub import login

login()  # prompts for a Hugging Face access token

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="username/modelname",  # replace with your fine-tuned model repo
    max_seq_length=2048,
    dtype=None,  # None auto-detects the dtype (float16 or bfloat16)
    load_in_4bit=False,
)
FastLanguageModel.for_inference(model)  # switch to Unsloth's optimized inference mode
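
# Optional sanity check (an added suggestion, not in the original file): confirm
# the tokenizer ships a chat template before apply_chat_template is called below.
assert tokenizer.chat_template is not None, "model repo must provide a chat template"
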
# Inference
def chat(
    user_message: str,
    system_message: str | None = None,
    max_new_tokens: int = 128,
    temperature: float = 1.5,
    min_p: float = 0.1,
):
    """
    Streams a response from your fine-tuned model.
    Prints tokens to stdout as they are generated.
    """
    messages = []
    if system_message is not None:
        messages.append({"role": "system", "content": system_message})
    messages.append({"role": "user", "content": user_message})

    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(model.device)

    streamer = TextStreamer(tokenizer, skip_prompt=True)
    _ = model.generate(
        input_ids=inputs,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        use_cache=True,
        temperature=temperature,
        min_p=min_p,
    )
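
# Non-streaming variant (a minimal sketch added here, not in the original file):
# returns the reply as a string using standard transformers generate/decode calls.
def chat_text(user_message: str, max_new_tokens: int = 128) -> str:
    inputs = tokenizer.apply_chat_template(
        [{"role": "user", "content": user_message}],
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(model.device)
    outputs = model.generate(input_ids=inputs, max_new_tokens=max_new_tokens, use_cache=True)
    # Slice off the prompt tokens so only the newly generated reply is decoded.
    return tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True)
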
chat("Ask model a question")
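
# Example call with a system prompt (illustrative strings, not from the original):
chat(
    "Explain what LoRA fine-tuning is in one paragraph.",
    system_message="You are a helpful, concise assistant.",
    max_new_tokens=256,
    temperature=0.7,  # lower than the default above, for steadier output
)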