---
license: apache-2.0
language:
- en
base_model:
- Qwen/Qwen2-VL-7B-Instruct
---

# Usage

First, follow the instructions in [our repo](https://github.com/VectorSpaceLab/Video-XL) to install the required packages.

```python
from videoxl.model.builder import load_pretrained_model
from videoxl.mm_utils import tokenizer_image_token, process_images, transform_input_id
from videoxl.constants import IMAGE_TOKEN_INDEX, TOKEN_PERFRAME
from PIL import Image
from decord import VideoReader, cpu
import torch
import numpy as np

# Fix the seed: generation below samples (do_sample=True), so seed the RNG
# to make runs repeatable.
torch.manual_seed(0)

model_path = "assets/VideoXL_weight_8"
video_path = "assets/ad2_watch_15min.mp4"

# Number of frames sampled uniformly from the video.
max_frames_num = 900
gen_kwargs = {"do_sample": True, "temperature": 1, "top_p": None, "num_beams": 1, "use_cache": True, "max_new_tokens": 1024}
tokenizer, model, image_processor, _ = load_pretrained_model(model_path, None, "llava_qwen", device_map="cuda:0")

model.config.beacon_ratio = [8]  # you can delete this line to realize random compression of {2,4,8} ratio

# Video input: the prompt carries a single <image> placeholder for the video.
prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<image>\nDoes this video contain any inserted advertisement? If yes, which is the content of the ad?<|im_end|>\n<|im_start|>assistant\n"
# unsqueeze(0) adds the batch dimension, so input_ids is 2-D.
input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(model.device)

# Sample max_frames_num frame indices evenly spaced over the whole video.
vr = VideoReader(video_path, ctx=cpu(0))
total_frame_num = len(vr)
uniform_sampled_frames = np.linspace(0, total_frame_num - 1, max_frames_num, dtype=int)
frame_idx = uniform_sampled_frames.tolist()
frames = vr.get_batch(frame_idx).asnumpy()
video_tensor = image_processor.preprocess(frames, return_tensors="pt")["pixel_values"].to(model.device, dtype=torch.float16)

# Column index of the image placeholder in the prompt. `.item()` requires
# exactly one match, so the prompt must contain exactly one <image> token.
beacon_skip_first = (input_ids == IMAGE_TOKEN_INDEX).nonzero(as_tuple=True)[1].item()
# Presumably TOKEN_PERFRAME visual tokens per sampled frame -- confirm in
# videoxl.constants.
num_tokens = TOKEN_PERFRAME * max_frames_num
beacon_skip_last = beacon_skip_first + num_tokens

with torch.inference_mode():
    output_ids = model.generate(
        input_ids,
        images=[video_tensor],
        modalities=["video"],
        beacon_skip_first=beacon_skip_first,
        beacon_skip_last=beacon_skip_last,
        **gen_kwargs,
    )

# The image token is guaranteed to be present here (the `.item()` call above
# would have raised otherwise), so the transform is applied unconditionally.
# Its result is used only for its length, to strip the prompt part from the
# generated ids (presumably it expands the placeholder to num_tokens ids --
# confirm in videoxl.mm_utils).
transform_input_ids = transform_input_id(input_ids, num_tokens, model.config.vocab_size - 1)

output_ids = output_ids[:, transform_input_ids.shape[1]:]
outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
print(outputs)
```

## License

This project utilizes certain datasets and checkpoints that are subject to their respective original licenses. Users must comply with all terms and conditions of these original licenses, including but not limited to the OpenAI Terms of Use for the dataset and the specific licenses for the base language models (Qwen2 license). This project does not impose any additional constraints beyond those stipulated in the original licenses. Furthermore, users are reminded to ensure that their use of the dataset and checkpoints complies with all applicable laws and regulations.