vmuchinov committed
Commit a7cccd7
1 Parent(s): 3fe0eef

Upload app.py

Files changed (1):
app.py (+5, -9)
app.py CHANGED
@@ -7,26 +7,22 @@ import spaces
 import torch
 
 
-from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, BitsAndBytesConfig
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 ACCESS_TOKEN = os.getenv("HF_TOKEN", "")
 
-model_id = "Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8"
-
-quantization_config = BitsAndBytesConfig(load_in_8bit=True)
+model_id = "Qwen/Qwen2.5-7B-Instruct"
 
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
     torch_dtype=torch.float16,
-    device_map="cuda",
+    device_map="auto",
     trust_remote_code=True,
     low_cpu_mem_usage=True,
-    quantize_config=quantization_config,
-    token=ACCESS_TOKEN).to("cuda")
-
+    token=ACCESS_TOKEN)
 tokenizer = AutoTokenizer.from_pretrained(
     model_id,
     trust_remote_code=True,
@@ -47,7 +43,7 @@ def generate(
     conversation.append({"role": "system", "content": system_prompt})
     conversation.append({"role": "user", "content": message})
 
-    input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt").to("cuda")
+    input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")
     if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
         input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
         gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
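Context on why the removed code was broken: Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8 is already GPTQ-quantized, while BitsAndBytesConfig(load_in_8bit=True) configures the separate bitsandbytes path; from_pretrained also has no quantize_config keyword (the bitsandbytes option is named quantization_config), and transformers rejects .to("cuda") on 8-bit bitsandbytes models. The new revision avoids all of this by loading the unquantized 7B checkpoint and letting accelerate place the weights through device_map="auto". A minimal sketch of the resulting load path, using only names visible in the diff (the tokenizer call is truncated there, so its token argument below is an assumption):

import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

ACCESS_TOKEN = os.getenv("HF_TOKEN", "")
model_id = "Qwen/Qwen2.5-7B-Instruct"

# device_map="auto" hands weight placement to accelerate, so no manual
# .to("cuda") is needed; float16 halves memory versus float32 weights.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
    low_cpu_mem_usage=True,
    token=ACCESS_TOKEN)

tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    trust_remote_code=True,
    token=ACCESS_TOKEN)  # assumption: mirrors the model call; the diff cuts off here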
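One consequence of dropping .to("cuda") from the apply_chat_template call: with device_map="auto" the weights may sit on a GPU while the returned input_ids stay on CPU, so generation code usually moves the tensor to the model's device before calling model.generate. A hedged sketch of the trimming block with that device hop added (the rest of generate() falls outside the diff, so this is not the commit's code):

# Inside generate(): tokenize the chat, trim to the token budget, then move
# the prompt to wherever the model's first parameters were dispatched.
input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")
if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
    input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
    gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
input_ids = input_ids.to(model.device)  # model.device reports the first device in the dispatch map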