gsh33 committed
Commit 54a4ed6
1 Parent(s): d5be120

Update README.md

Files changed (1): README.md (+62 -0)
README.md CHANGED
@@ -65,6 +65,68 @@ We evaluated the model using the [VLMEvalKit](https://github.com/open-compass/VL
 
 For comparison models, evaluations were conducted in a local environment, so the scores may differ slightly from those reported in papers or on the official VLMEvalKit leaderboard.
 
+ # How to use
+
+ ```python
+ # pip install git+https://github.com/LLaVA-VL/LLaVA-NeXT.git
+ from llava.model.builder import load_pretrained_model
+ from llava.mm_utils import process_images, tokenizer_image_token
+ from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
+ from llava.conversation import conv_templates
+ from PIL import Image
+ import requests
+ import copy
+ import torch
+ import warnings
+
+ warnings.filterwarnings("ignore")
+
+ pretrained = "BAAI/Aquila-VL-2B-llava-qwen"
+
+ model_name = "llava_qwen"
+ device = "cuda"
+ device_map = "auto"
+ # Pass any additional options through llava_model_args if needed.
+ tokenizer, model, image_processor, max_length = load_pretrained_model(pretrained, None, model_name, device_map=device_map)
+
+ model.eval()
+
+ # Load an example image from a URL
+ url = "https://github.com/haotian-liu/LLaVA/blob/1a91fc274d7c35a9b50b3cb29c4247ae5837ce39/images/llava_v1_5_radar.jpg?raw=true"
+ image = Image.open(requests.get(url, stream=True).raw)
+
+ # Or load an image from a local path
+ # url = "./local_image.jpg"
+ # image = Image.open(url)
+
+ image_tensor = process_images([image], image_processor, model.config)
+ image_tensor = [_image.to(dtype=torch.float16, device=device) for _image in image_tensor]
+
+ conv_template = "qwen_1_5"  # Make sure to use the correct chat template for your model
+ question = DEFAULT_IMAGE_TOKEN + "\nWhat is shown in this image?"
+ conv = copy.deepcopy(conv_templates[conv_template])
+ conv.append_message(conv.roles[0], question)
+ conv.append_message(conv.roles[1], None)
+ prompt_question = conv.get_prompt()
+
+ input_ids = tokenizer_image_token(prompt_question, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(device)
+ image_sizes = [image.size]
+
+ cont = model.generate(
+     input_ids,
+     images=image_tensor,
+     image_sizes=image_sizes,
+     do_sample=False,
+     temperature=0,
+     max_new_tokens=4096,
+ )
+
+ text_outputs = tokenizer.batch_decode(cont, skip_special_tokens=True)
+
+ print(text_outputs)
+ ```
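+
+ If you want to query the model repeatedly, the generation steps above can be wrapped in a small helper. The sketch below is only a convenience wrapper around the exact calls shown in the example and assumes the setup code above has already been run; the function name `ask_image` is ours, not part of the LLaVA-NeXT API.
+
+ ```python
+ # Convenience wrapper around the example above (assumes model, tokenizer,
+ # image_processor and device are already defined as in the snippet).
+ # `ask_image` is a hypothetical helper name, not a LLaVA-NeXT API.
+ def ask_image(image, question, conv_template="qwen_1_5", max_new_tokens=512):
+     # Preprocess the image exactly as in the example.
+     image_tensor = process_images([image], image_processor, model.config)
+     image_tensor = [t.to(dtype=torch.float16, device=device) for t in image_tensor]
+
+     # Build the chat prompt with the image placeholder token.
+     conv = copy.deepcopy(conv_templates[conv_template])
+     conv.append_message(conv.roles[0], DEFAULT_IMAGE_TOKEN + "\n" + question)
+     conv.append_message(conv.roles[1], None)
+     input_ids = tokenizer_image_token(conv.get_prompt(), tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(device)
+
+     # Greedy decoding with the same settings as above.
+     output_ids = model.generate(
+         input_ids,
+         images=image_tensor,
+         image_sizes=[image.size],
+         do_sample=False,
+         temperature=0,
+         max_new_tokens=max_new_tokens,
+     )
+     return tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]
+
+ print(ask_image(image, "Describe this chart in one sentence."))
+ ```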
+
 # Future Plan
 
 * We plan to train models of various sizes.