sy1998 committed on
Commit
8748fa1
1 Parent(s): 33f59f3

Upload README.md

Browse files
Files changed (1) hide show
  1. README.md +57 -1
README.md CHANGED
@@ -4,4 +4,60 @@ language:
4
  - en
5
  base_model:
6
  - Qwen/Qwen2-VL-7B-Instruct
7
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  - en
5
  base_model:
6
  - Qwen/Qwen2-VL-7B-Instruct
7
+ ---
8
+
9
+
10
+ # Usage
11
+
12
+ First, follow the instructions in [our repo](https://github.com/VectorSpaceLab/Video-XL) to install the required packages.
13
+
14
+ ```python
15
+ from videoxl.model.builder import load_pretrained_model
16
+ from videoxl.mm_utils import tokenizer_image_token, process_images,transform_input_id
17
+ from videoxl.constants import IMAGE_TOKEN_INDEX,TOKEN_PERFRAME
18
+ from PIL import Image
19
+ from decord import VideoReader, cpu
20
+ import torch
21
+ import numpy as np
22
+ # fix seed
23
+ torch.manual_seed(0)
24
+
25
+
26
+ model_path = "assets/VideoXL_weight_8"
27
+ video_path="assets/ad2_watch_15min.mp4"
28
+
29
+ max_frames_num =900
30
+ gen_kwargs = {"do_sample": True, "temperature": 1, "top_p": None, "num_beams": 1, "use_cache": True, "max_new_tokens": 1024}
31
+ tokenizer, model, image_processor, _ = load_pretrained_model(model_path, None, "llava_qwen", device_map="cuda:0")
32
+
33
+ model.config.beacon_ratio=[8] # you can delete this line to realize random compression of {2,4,8} ratio
34
+
35
+
36
+ #video input
37
+ prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<image>\nDoes this video contain any inserted advertisement? If yes, which is the content of the ad?<|im_end|>\n<|im_start|>assistant\n"
38
+ input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(model.device)
39
+ vr = VideoReader(video_path, ctx=cpu(0))
40
+ total_frame_num = len(vr)
41
+ uniform_sampled_frames = np.linspace(0, total_frame_num - 1, max_frames_num, dtype=int)
42
+ frame_idx = uniform_sampled_frames.tolist()
43
+ frames = vr.get_batch(frame_idx).asnumpy()
44
+ video_tensor = image_processor.preprocess(frames, return_tensors="pt")["pixel_values"].to(model.device, dtype=torch.float16)
45
+
46
+ beacon_skip_first = (input_ids == IMAGE_TOKEN_INDEX).nonzero(as_tuple=True)[1].item()
47
+ num_tokens=TOKEN_PERFRAME *max_frames_num
48
+ beacon_skip_last = beacon_skip_first + num_tokens
49
+
50
+ with torch.inference_mode():
51
+ output_ids = model.generate(input_ids, images=[video_tensor], modalities=["video"],beacon_skip_first=beacon_skip_first,beacon_skip_last=beacon_skip_last, **gen_kwargs)
52
+
53
+ if IMAGE_TOKEN_INDEX in input_ids:
54
+ transform_input_ids=transform_input_id(input_ids,num_tokens,model.config.vocab_size-1)
55
+
56
+ output_ids=output_ids[:,transform_input_ids.shape[1]:]
57
+ outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
58
+ print(outputs)
59
+ ```
60
+
61
+ ## License
62
+
63
+ This project utilizes certain datasets and checkpoints that are subject to their respective original licenses. Users must comply with all terms and conditions of these original licenses, including but not limited to the OpenAI Terms of Use for the dataset and the specific licenses for base language models (Qwen2 license). This project does not impose any additional constraints beyond those stipulated in the original licenses. Furthermore, users are reminded to ensure that their use of the dataset and checkpoints is in compliance with all applicable laws and regulations.