---
language:
- en
base_model:
- Qwen/Qwen2-VL-7B-Instruct
---

# Usage
First, follow the instructions in [our repo](https://github.com/VectorSpaceLab/Video-XL) to install the required packages.
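
A typical setup might look like the following; this is only a sketch that assumes the repository supports a standard editable `pip` install, so defer to the repo's own README for the authoritative steps:

```bash
# Assumed install flow; check the Video-XL repo README for exact instructions
git clone https://github.com/VectorSpaceLab/Video-XL
cd Video-XL
pip install -e .
```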
```python
from videoxl.model.builder import load_pretrained_model
from videoxl.mm_utils import tokenizer_image_token, transform_input_id
from videoxl.constants import IMAGE_TOKEN_INDEX, TOKEN_PERFRAME
from decord import VideoReader, cpu
import torch
import numpy as np

# Fix the seed so that sampled generations are reproducible
torch.manual_seed(0)

model_path = "assets/VideoXL_weight_8"
video_path = "assets/ad2_watch_15min.mp4"

max_frames_num = 900
gen_kwargs = {"do_sample": True, "temperature": 1, "top_p": None, "num_beams": 1, "use_cache": True, "max_new_tokens": 1024}
tokenizer, model, image_processor, _ = load_pretrained_model(model_path, None, "llava_qwen", device_map="cuda:0")

model.config.beacon_ratio = [8]  # delete this line to use a random compression ratio from {2, 4, 8}

# Video input: build the chat prompt; <image> marks where the visual tokens go
prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<image>\nDoes this video contain any inserted advertisement? If yes, what is the content of the ad?<|im_end|>\n<|im_start|>assistant\n"
input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(model.device)

# Uniformly sample max_frames_num frames from the video and preprocess them
vr = VideoReader(video_path, ctx=cpu(0))
total_frame_num = len(vr)
uniform_sampled_frames = np.linspace(0, total_frame_num - 1, max_frames_num, dtype=int)
frame_idx = uniform_sampled_frames.tolist()
frames = vr.get_batch(frame_idx).asnumpy()
video_tensor = image_processor.preprocess(frames, return_tensors="pt")["pixel_values"].to(model.device, dtype=torch.float16)

# The compression (beacon) window starts at the <image> placeholder and spans
# TOKEN_PERFRAME visual tokens per sampled frame
beacon_skip_first = (input_ids == IMAGE_TOKEN_INDEX).nonzero(as_tuple=True)[1].item()
num_tokens = TOKEN_PERFRAME * max_frames_num
beacon_skip_last = beacon_skip_first + num_tokens

with torch.inference_mode():
    output_ids = model.generate(input_ids, images=[video_tensor], modalities=["video"], beacon_skip_first=beacon_skip_first, beacon_skip_last=beacon_skip_last, **gen_kwargs)

# Strip the (image-token-expanded) prompt from the output before decoding
if IMAGE_TOKEN_INDEX in input_ids:
    transform_input_ids = transform_input_id(input_ids, num_tokens, model.config.vocab_size - 1)

output_ids = output_ids[:, transform_input_ids.shape[1]:]
outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
print(outputs)
```
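
To ask a different question, rebuild the prompt with the same Qwen-style chat template used above; only the user turn changes. A minimal sketch (the `question` value is illustrative):

```python
# Hypothetical example question; keep the chat template and <image> placeholder intact
question = "Summarize the main events of this video."
prompt = (
    "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
    f"<|im_start|>user\n<image>\n{question}<|im_end|>\n"
    "<|im_start|>assistant\n"
)
# Then re-run the tokenization and generation steps above with this prompt
```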
## License
This project uses datasets and checkpoints that are subject to their respective original licenses. Users must comply with all terms and conditions of those licenses, including but not limited to the OpenAI Terms of Use for the dataset and the specific licenses of the base language models (e.g., the Qwen2 license). This project imposes no additional constraints beyond those stipulated in the original licenses. Users must also ensure that their use of the datasets and checkpoints complies with all applicable laws and regulations.