Update README.md
README.md (CHANGED)
@@ -55,74 +55,31 @@ tags:

## 🤖 Inference with VideoLLaMA2

Before:

```python
import torch
import transformers
import sys
sys.path.append('./')
# NOTE: the exact import lines are truncated in the source; targets reconstructed from the symbols used below.
from videollama2.conversation import conv_templates, SeparatorStyle
from videollama2.constants import DEFAULT_MMODAL_TOKEN, MMODAL_TOKEN_INDEX
from videollama2.mm_utils import tokenizer_MMODAL_token, KeywordsStoppingCriteria, process_video, process_image


def inference():
    # Video Inference (paths/questions reconstructed from the updated snippet; reply comment elided in the source)
    paths = ['assets/cat_and_chicken.mp4']
    questions = ['What animals are in the video, what are they doing, and how does the video feel?']
    # Reply:
    # ...
    modal_list = ['video']

    # Video Inference
    paths = ['assets/sora.mp4']
    questions = ['Please describe this video.']
    # Reply:
    # The video features a series of colorful kites flying in the sky. The kites are first seen flying over trees, and then they are shown flying in the sky. The kites come in various shapes and colors, including red, green, blue, and yellow. The video captures the kites soaring gracefully through the air, with some kites flying higher than others. The sky is clear and blue, and the trees below are lush and green. The kites are the main focus of the video, and their vibrant colors and intricate designs are highlighted against the backdrop of the sky and trees. Overall, the video showcases the beauty and artistry of kite-flying, and it is a delight to watch the kites dance and glide through the air.
    modal_list = ['video']

    # Image Inference (paths/questions reconstructed from the updated snippet; reply comment elided in the source)
    paths = ['assets/sora.png']
    questions = ['What is the woman wearing, what is she doing, and how does the image feel?']
    # Reply:
    # ...
    modal_list = ['image']

    # 1. Initialize the model.
    model_path = 'DAMO-NLP-SG/VideoLLaMA2-72B-Base'
    # ... (model initialization elided in the source: sets up tokenizer, model, processor, and conv_mode)

    # 2. Visual preprocess (load & transform image or video).
    if modal_list[0] == 'video':
        tensor = process_video(paths[0], processor, model.config.image_aspect_ratio).to(dtype=torch.float16, device='cuda', non_blocking=True)
        default_mm_token = DEFAULT_MMODAL_TOKEN["VIDEO"]
        modal_token_index = MMODAL_TOKEN_INDEX["VIDEO"]
    else:
        tensor = process_image(paths[0], processor, model.config.image_aspect_ratio)[0].to(dtype=torch.float16, device='cuda', non_blocking=True)
        default_mm_token = DEFAULT_MMODAL_TOKEN["IMAGE"]
        modal_token_index = MMODAL_TOKEN_INDEX["IMAGE"]
    tensor = [tensor]

    # 3. Text preprocess (tag process & generate prompt).
    question = default_mm_token + "\n" + questions[0]
    conv = conv_templates[conv_mode].copy()
    conv.append_message(conv.roles[0], question)
    conv.append_message(conv.roles[1], None)
    prompt = conv.get_prompt()
    input_ids = tokenizer_MMODAL_token(prompt, tokenizer, modal_token_index, return_tensors='pt').unsqueeze(0).to('cuda:0')

    # 4. Generate a response according to visual signals and prompts.
    stop_str = conv.sep if conv.sep_style in [SeparatorStyle.SINGLE] else conv.sep2
    # keywords = ["<s>", "</s>"]
    keywords = [stop_str]
    stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
    with torch.inference_mode():
        output_ids = model.generate(
            input_ids,
            images_or_videos=tensor,
            modal_list=modal_list,
            do_sample=True,
            temperature=0.2,
            max_new_tokens=1024,
            use_cache=True,
            stopping_criteria=[stopping_criteria],
        )
    outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    print(outputs[0])


if __name__ == "__main__":
    inference()
```
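The updated snippet replaces this manual pipeline (conversation templates, multimodal-token preprocessing, stopping criteria, and a hand-rolled `model.generate` call) with two library entry points, `model_init` and `mm_infer`.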
After:

```python
import sys
sys.path.append('./')
from videollama2 import model_init, mm_infer
from videollama2.utils import disable_torch_init


def inference():
    disable_torch_init()

    # Video Inference
    modal = 'video'
    modal_path = 'assets/cat_and_chicken.mp4'
    instruct = 'What animals are in the video, what are they doing, and how does the video feel?'

    # Image Inference
    # (as written, this second block overrides the video example above; comment one of the two out)
    modal = 'image'
    modal_path = 'assets/sora.png'
    instruct = 'What is the woman wearing, what is she doing, and how does the image feel?'

    model_path = 'DAMO-NLP-SG/VideoLLaMA2-72B-Base'
    model, processor, tokenizer = model_init(model_path)
    output = mm_infer(processor[modal](modal_path), instruct, model=model, tokenizer=tokenizer, do_sample=False, modal=modal)

    print(output)


if __name__ == "__main__":
    inference()
```
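Since `model_init` returns a modality-keyed `processor` (note the `processor[modal](modal_path)` lookup above), the two examples can also be driven from a single loop instead of overwriting one with the other. A minimal sketch under that reading; `run_examples` is our name, and every call, path, and prompt is taken from the snippet above:

```python
import sys
sys.path.append('./')

from videollama2 import model_init, mm_infer
from videollama2.utils import disable_torch_init


def run_examples():
    disable_torch_init()
    # Load the model once and reuse it for both examples.
    model, processor, tokenizer = model_init('DAMO-NLP-SG/VideoLLaMA2-72B-Base')

    examples = [
        ('video', 'assets/cat_and_chicken.mp4',
         'What animals are in the video, what are they doing, and how does the video feel?'),
        ('image', 'assets/sora.png',
         'What is the woman wearing, what is she doing, and how does the image feel?'),
    ]
    for modal, modal_path, instruct in examples:
        # processor is keyed by modality, exactly as in the README snippet.
        output = mm_infer(processor[modal](modal_path), instruct,
                          model=model, tokenizer=tokenizer, do_sample=False, modal=modal)
        print(f'[{modal}] {output}')


if __name__ == '__main__':
    run_examples()
```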