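# Video-LLaMA stage-2 finetuning config (vision branch; the audio branch is disabled below).
# A minimal launch sketch, assuming the repo's LAVIS-style train.py entry point and that
# this file lives under train_configs/ (both are assumptions; check the repo README):
#   torchrun --nproc_per_node=1 train.py --cfg-path train_configs/visionbranch_stage2_finetune.yaml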
model:
  arch: video_instruction_llama
  model_type: pretrain_vicuna
  freeze_vit: True
  freeze_qformer: True

  # Q-Former
  num_query_token: 32

  # If you want to train models based on LLaMA-2-chat,
  # some checkpoints can be downloaded from our Hugging Face repo,
  # i.e. https://huggingface.co/DAMO-NLP-SG/Video-LLaMA-2-13B-Finetuned
  #llama_model: "/projectnb/ivc-ml/rxtan/llama-2-7b-chat-hf/"
  llama_model: "meta-llama/Llama-2-7b-chat-hf"
  imagebind_ckpt_path: "ckpt/imagebind_path/"

  # The checkpoint of the vision branch after stage-1 pretraining;
  # you can use our pretrained ckpt from https://huggingface.co/DAMO-NLP-SG/Video-LLaMA-2-13B-Pretrained/
  ckpt: 'ckpt/VL_LLaMA_2_7B_Finetuned.pth'

  # only train the vision branch
  equip_audio_branch: False  # whether to equip the audio branch
  frozen_llama_proj: False
  frozen_video_Qformer: True
  frozen_audio_Qformer: True

  fusion_head_layers: 2
  max_frame_pos: 32
  fusion_header_type: "seqTransf"

  max_txt_len: 320

  # for llama_2_chat:
  end_sym: "</s>"
  prompt_path: "prompts/alignment_image.txt"
  prompt_template: '[INST] <<SYS>>\n \n<</SYS>>\n\n{} [/INST] '
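  # Note on the template above: the '{}' placeholder is filled at run time with the
  # user query (including the video-token segment), and end_sym "</s>" closes each
  # assistant turn in the LLaMA-2-chat format. (A descriptive note, not from the
  # original file; verify against the repo's prompt-handling code.)
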
datasets:
  webvid:
    vis_processor:
      train:
        name: "alpro_video_eval"
        n_frms: 8
        image_size: 224
    text_processor:
      train:
        name: "blip_caption"
run:
  task: video_text_pretrain
  # optimizer
  lr_sched: "linear_warmup_cosine_lr"
  init_lr: 3e-5
  min_lr: 1e-5
  warmup_lr: 1e-6

  weight_decay: 0.05
  max_epoch: 3
  iters_per_epoch: 1000
  batch_size_train: 4
  batch_size_eval: 4
  num_workers: 4
  warmup_steps: 1000

  seed: 42
  output_dir: "output/videollama_stage2_finetune"

  amp: True
  resume_ckpt_path: null

  evaluate: False
  train_splits: ["train"]

  device: "cuda"
  world_size: 1
  dist_url: "env://"
  distributed: True
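
# Schedule note: with iters_per_epoch: 1000 and max_epoch: 3, training runs ~3,000
# optimizer steps; warmup_steps: 1000 ramps the LR from warmup_lr to init_lr over the
# first epoch before cosine decay toward min_lr. With dist_url: "env://", the launcher
# (e.g. torchrun) is expected to set the process-group environment variables, and
# world_size: 1 is a single-process default. (Interpretation notes, assuming standard
# PyTorch DDP conventions.)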