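# Video-LLaMA stage-2 finetuning config (vision branch; the audio branch is disabled below).
# A minimal launch sketch, assuming the repo's LAVIS-style train.py entry point and that
# this file lives under train_configs/ (both are assumptions; check the repo README):
#   torchrun --nproc_per_node=1 train.py --cfg-path train_configs/visionbranch_stage2_finetune.yaml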
model:
  arch: video_instruction_llama
  model_type: pretrain_vicuna
  freeze_vit: True
  freeze_qformer: True

  # Q-Former
  num_query_token: 32

  # If you want to train models based on LLaMA-2-chat,
  # some checkpoints can be downloaded from our Hugging Face repo,
  # i.e. https://huggingface.co/DAMO-NLP-SG/Video-LLaMA-2-13B-Finetuned
  #llama_model: "/projectnb/ivc-ml/rxtan/llama-2-7b-chat-hf/"
  llama_model: "meta-llama/Llama-2-7b-chat-hf"
  imagebind_ckpt_path: "ckpt/imagebind_path/"

  # The checkpoint of the vision branch after stage-1 pretraining;
  # you can use our pretrained ckpt from https://huggingface.co/DAMO-NLP-SG/Video-LLaMA-2-13B-Pretrained/
  ckpt: 'ckpt/VL_LLaMA_2_7B_Finetuned.pth'

  # only train the vision branch
  equip_audio_branch: False  # whether to equip the audio branch
  frozen_llama_proj: False
  frozen_video_Qformer: True
  frozen_audio_Qformer: True

  fusion_head_layers: 2
  max_frame_pos: 32
  fusion_header_type: "seqTransf"

  max_txt_len: 320

  # for llama_2_chat:
  end_sym: "</s>"
  prompt_path: "prompts/alignment_image.txt"
  prompt_template: '[INST] <<SYS>>\n \n<</SYS>>\n\n{} [/INST] '
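  # Note on the template above: the '{}' placeholder is filled at run time with the
  # user query (including the video-token segment), and end_sym "</s>" closes each
  # assistant turn in the LLaMA-2-chat format. (A descriptive note, not from the
  # original file; verify against the repo's prompt-handling code.)
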
datasets:
  webvid:
    vis_processor:
      train:
        name: "alpro_video_eval"
        n_frms: 8
        image_size: 224
    text_processor:
      train:
        name: "blip_caption"
run:
  task: video_text_pretrain
  # optimizer
  lr_sched: "linear_warmup_cosine_lr"
  init_lr: 3e-5
  min_lr: 1e-5
  warmup_lr: 1e-6

  weight_decay: 0.05
  max_epoch: 3
  iters_per_epoch: 1000
  batch_size_train: 4
  batch_size_eval: 4
  num_workers: 4
  warmup_steps: 1000

  seed: 42
  output_dir: "output/videollama_stage2_finetune"

  amp: True
  resume_ckpt_path: null

  evaluate: False
  train_splits: ["train"]

  device: "cuda"
  world_size: 1
  dist_url: "env://"
  distributed: True
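
# Schedule note: with iters_per_epoch: 1000 and max_epoch: 3, training runs ~3,000
# optimizer steps; warmup_steps: 1000 ramps the LR from warmup_lr to init_lr over the
# first epoch before cosine decay toward min_lr. With dist_url: "env://", the launcher
# (e.g. torchrun) is expected to set the process-group environment variables, and
# world_size: 1 is a single-process default. (Interpretation notes, assuming standard
# PyTorch DDP conventions.)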