vilarin committed on
Commit
677d853
1 Parent(s): 6b67af9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +4 -3
app.py CHANGED
@@ -2,7 +2,7 @@ import os
2
  import time
3
  import spaces
4
  import torch
5
- from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
6
  import gradio as gr
7
  from threading import Thread
8
 
@@ -33,13 +33,14 @@ h3 {
33
 
34
  device = "cuda" # for GPU usage or "cpu" for CPU usage
35
 
 
 
36
  tokenizer = AutoTokenizer.from_pretrained(MODEL)
37
  model = AutoModelForCausalLM.from_pretrained(
38
  MODEL,
39
  torch_dtype=torch.bfloat16,
40
- low_cpu_mem_usage=True,
41
  device_map="auto",
42
- trust_remote_code=True)
43
 
44
  @spaces.GPU()
45
  def stream_chat(
 
2
  import time
3
  import spaces
4
  import torch
5
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, BitsAndBytesConfig
6
  import gradio as gr
7
  from threading import Thread
8
 
 
33
 
34
  device = "cuda" # for GPU usage or "cpu" for CPU usage
35
 
36
+ quantization_config = BitsAndBytesConfig(load_in_8bit=True)
37
+
38
  tokenizer = AutoTokenizer.from_pretrained(MODEL)
39
  model = AutoModelForCausalLM.from_pretrained(
40
  MODEL,
41
  torch_dtype=torch.bfloat16,
 
42
  device_map="auto",
43
+ quantization_config=quantization_config)
44
 
45
  @spaces.GPU()
46
  def stream_chat(