TuringsSolutions committed on
Commit
5e0126f
1 Parent(s): 40652ca

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +4 -4
app.py CHANGED
@@ -10,13 +10,13 @@ model_id = "xtuner/llava-llama-3-8b-v1_1-transformers"
10
 
11
  print("Loading model...")
12
  processor = AutoProcessor.from_pretrained(model_id)
 
 
13
  model = LlavaForConditionalGeneration.from_pretrained(
14
  model_id,
15
  torch_dtype=torch.float16,
16
- low_cpu_mem_usage=True
17
  )
18
- model.to("cuda" if torch.cuda.is_available() else "cpu")
19
- model.generation_config.eos_token_id = 128009
20
  print("Model loaded successfully!")
21
 
22
  PLACEHOLDER = """
@@ -50,7 +50,7 @@ def bot_streaming(message, history):
50
  # Prepare inputs
51
  image = Image.open(image)
52
  prompt = f"<|start_header_id|>user<|end_header_id|>\n\n<image>\n{message['text']}<|eot_id|>"
53
- inputs = processor(prompt, image, return_tensors="pt").to(device=model.device, dtype=torch.float16)
54
 
55
  # Stream text generation
56
  streamer = TextIteratorStreamer(processor, skip_special_tokens=True, skip_prompt=True)
 
10
 
11
  print("Loading model...")
12
  processor = AutoProcessor.from_pretrained(model_id)
13
+
14
+ # Adjusted model loading to use Accelerate's `device_map`
15
  model = LlavaForConditionalGeneration.from_pretrained(
16
  model_id,
17
  torch_dtype=torch.float16,
18
+ device_map="auto" # Uses the Accelerate library for efficient memory usage
19
  )
 
 
20
  print("Model loaded successfully!")
21
 
22
  PLACEHOLDER = """
 
50
  # Prepare inputs
51
  image = Image.open(image)
52
  prompt = f"<|start_header_id|>user<|end_header_id|>\n\n<image>\n{message['text']}<|eot_id|>"
53
+ inputs = processor(prompt, image, return_tensors="pt").to(model.device, dtype=torch.float16)
54
 
55
  # Stream text generation
56
  streamer = TextIteratorStreamer(processor, skip_special_tokens=True, skip_prompt=True)