Crystalcareai committed on
Commit
705c81f
1 Parent(s): dc09b15

Upload inference.py

Browse files
Files changed (1) hide show
  1. inference.py +36 -0
inference.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Streamed single-prompt inference with GemMoE-Medium-v0.4.

Loads the model in fp16 with FlashAttention-2 and automatic device
placement, tokenizes one [INST]-wrapped prompt, and streams up to 512
generated tokens to stdout via TextStreamer.

Requires a CUDA device (tokens are moved with ``.cuda()``) and the
``flash-attn`` package for the FlashAttention-2 backend.
"""

import torch
from transformers import AutoTokenizer, TextStreamer, AutoModelForCausalLM

model_path = "Crystalcareai/GemMoE-Medium-v0.4"

# Load model.
# trust_remote_code=True executes custom modeling code shipped with the
# checkpoint — only appropriate because the repo is the author's own.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    attn_implementation="flash_attention_2",  # fixed: comma was missing here (SyntaxError)
    trust_remote_code=True,
)

tokenizer = AutoTokenizer.from_pretrained(model_path)
# skip_prompt avoids echoing the input; skip_special_tokens hides [INST]/EOS markers.
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# Convert prompt to tokens using the model's instruction template.
prompt_template = "[INST] {prompt} [/INST]"

prompt = "You're standing on the surface of the Earth. "\
    "You walk one mile south, one mile west and one mile north. "\
    "You end up exactly where you started. Where are you?"

tokens = tokenizer(
    prompt_template.format(prompt=prompt),
    return_tensors='pt'
).input_ids.cuda()  # assumes a CUDA device is available — TODO confirm before CPU-only use

# Generate output; the streamer prints tokens as they are produced,
# so the return value is kept only for optional post-processing.
generation_output = model.generate(
    tokens,
    streamer=streamer,
    max_new_tokens=512
)