nisten committed
Commit 0ff1cd2
Parent: 3802faf

Update app.py

Files changed (1)
  1. app.py +34 -25
app.py CHANGED
@@ -1,6 +1,6 @@
 import gradio as gr
 import spaces
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import OlmoeForCausalLM, AutoTokenizer
 import torch
 import subprocess
 import sys
@@ -8,12 +8,13 @@ import sys
 # Force install the specific transformers version from the GitHub PR
 subprocess.check_call([sys.executable, "-m", "pip", "install", "--force-reinstall", "--no-deps", "git+https://github.com/Muennighoff/transformers.git@olmoe"])
 
-model_name = "allenai/OLMoE-1B-7B-0924-Instruct"
+model_name = "allenai/OLMoE-1B-7B-0924"
 
 # Wrap model loading in a try-except block to handle potential errors
 try:
-    model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).cuda().eval()
-    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+    model = OlmoeForCausalLM.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to(DEVICE)
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
 except Exception as e:
     print(f"Error loading model: {e}")
     model = None
@@ -24,48 +25,56 @@ system_prompt = ("Adopt the persona of hilariously pissed off Andrej Karpathy "
     "while always answering questions in full first principles analysis type of thinking "
     "without using any analogies and always showing full working code or output in his answers.")
 
-user_prompt = '<|user|>\n'
-assistant_prompt = '<|assistant|>\n'
-prompt_suffix = "<|end|>\n"
-
 @spaces.GPU
-def generate_response(message, history):
+def generate_response(message, history, temperature, max_new_tokens):
     if model is None or tokenizer is None:
         return "Model or tokenizer not loaded properly. Please check the logs."
 
-    full_prompt = f"{system_prompt}\n{user_prompt}{message}{prompt_suffix}{assistant_prompt}"
+    full_prompt = f"{system_prompt}\n\nHuman: {message}\n\nAssistant:"
+
+    inputs = tokenizer(full_prompt, return_tensors="pt")
+    inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
 
-    inputs = tokenizer(full_prompt, return_tensors="pt").to("cuda:0")
     with torch.no_grad():
         generate_ids = model.generate(
             **inputs,
-            max_new_tokens=1000,
+            max_length=inputs['input_ids'].shape[1] + max_new_tokens,
             do_sample=True,
-            temperature=0.7,
-            eos_token_id=tokenizer.eos_token_id,
+            temperature=temperature,
         )
-    response = tokenizer.batch_decode(generate_ids[:, inputs['input_ids'].shape[1]:],
-                                      skip_special_tokens=True,
-                                      clean_up_tokenization_spaces=False)[0]
-    return response.strip()
+    response = tokenizer.decode(generate_ids[0], skip_special_tokens=True)
+    # Extract only the assistant's response
+    assistant_response = response.split("Assistant:")[-1].strip()
+    return assistant_response
+
+css = """
+#output {
+    height: 500px;
+    overflow: auto;
+    border: 1px solid #ccc;
+}
+"""
 
-with gr.Blocks() as demo:
-    gr.Markdown("# Pissed Off Karpathy Chatbot")
-    chatbot = gr.Chatbot()
-    msg = gr.Textbox()
+with gr.Blocks(css=css) as demo:
+    gr.Markdown("# Nisten's Karpathy Chatbot with OSS olMoE")
+    chatbot = gr.Chatbot(elem_id="output")
+    msg = gr.Textbox(label="Your message")
+    with gr.Row():
+        temperature = gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.1, label="Temperature")
+        max_new_tokens = gr.Slider(minimum=50, maximum=4000, value=1000, step=50, label="Max New Tokens")
     clear = gr.Button("Clear")
 
     def user(user_message, history):
         return "", history + [[user_message, None]]
 
-    def bot(history):
+    def bot(history, temp, max_tokens):
         user_message = history[-1][0]
-        bot_message = generate_response(user_message, history)
+        bot_message = generate_response(user_message, history, temp, max_tokens)
         history[-1][1] = bot_message
         return history
 
     msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
-        bot, chatbot, chatbot
+        bot, [chatbot, temperature, max_new_tokens], chatbot
     )
     clear.click(lambda: None, None, chatbot, queue=False)
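
For quick sanity-checking outside the Space, the updated generation path can be exercised directly. The snippet below is a minimal sketch, not part of the commit: it reuses the model name, "Human:/Assistant:" prompt framing, and decoding logic from the new app.py, and it assumes the PR branch of transformers (git+https://github.com/Muennighoff/transformers.git@olmoe) is already installed so that OlmoeForCausalLM is importable; the placeholder system prompt and message are illustrative only.

import torch
from transformers import OlmoeForCausalLM, AutoTokenizer

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "allenai/OLMoE-1B-7B-0924"

# Same loading path as the updated app.py (fp16, moved to the available device).
model = OlmoeForCausalLM.from_pretrained(
    model_name, trust_remote_code=True, torch_dtype=torch.float16
).to(DEVICE)
tokenizer = AutoTokenizer.from_pretrained(model_name)

system_prompt = "You are a helpful assistant."  # stand-in; app.py uses the Karpathy persona prompt
message = "Explain what a mixture-of-experts layer does."

# Plain-text chat framing the commit switches to (no <|user|>/<|assistant|> special tokens).
full_prompt = f"{system_prompt}\n\nHuman: {message}\n\nAssistant:"
inputs = {k: v.to(DEVICE) for k, v in tokenizer(full_prompt, return_tensors="pt").items()}

with torch.no_grad():
    generate_ids = model.generate(
        **inputs,
        # max_length counts prompt tokens plus the new-token budget,
        # mirroring how app.py derives it from the "Max New Tokens" slider.
        max_length=inputs["input_ids"].shape[1] + 256,
        do_sample=True,
        temperature=0.7,
    )

response = tokenizer.decode(generate_ids[0], skip_special_tokens=True)
print(response.split("Assistant:")[-1].strip())

Because the full decoded sequence includes the prompt, splitting on "Assistant:" is what isolates the model's reply, exactly as the new generate_response does.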