python clip_aoi_train.py \
    --output_dir ./aoi_clip_high_resolution_crossAttenttionFusion_fusin_gpt_random_sampler \
    --model_name_or_path OFA-Sys/chinese-clip-vit-base-patch16 \
    --image_processor_name OFA-Sys/chinese-clip-vit-base-patch16 \
    --tokenizer_name OFA-Sys/chinese-clip-vit-base-patch16 \
    --train_file ./data_csv/AOI/train_high_resolution_gpt.csv \
    --validation_file ./data_csv/AOI/valid_high_resolution_gpt.csv \
    --image_column image_path --caption_column text --aoi_caption_column aoi_text \
    --remove_unused_columns=False --do_eval --do_train \
    --per_device_train_batch_size=40 --per_device_eval_batch_size=20 \
    --gradient_accumulation_steps=10 --overwrite_output_dir \
    --num_train_epochs=200 \
    --learning_rate=1e-5 --warmup_steps=0 --weight_decay 0.1 \
    --save_total_limit 5 \
    --eval_strategy steps --logging_steps 0.1 --report_to wandb --fp16 --push_to_hub --batch_eval_metrics \
    --max_seq_length 153 --aoi_max_seq_length 384
    # alternative output_dir (concatenation-fusion variant):
    # aoi_clip_high_resolution_concate_fusin_gpt_random_sampler
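The --image_column, --caption_column, and --aoi_caption_column flags name the headers that clip_aoi_train.py reads from the train/validation CSVs. The snippet below is only a minimal sketch of that layout under that assumption; the output path, image paths, and captions are hypothetical and not taken from the repository.

import pandas as pd

# Hypothetical rows illustrating the assumed CSV layout: one image path,
# its full-image caption, and its area-of-interest (AOI) caption per row.
rows = [
    {
        "image_path": "./images/sample_0001.jpg",                    # --image_column
        "text": "full caption describing the whole image",           # --caption_column
        "aoi_text": "caption describing only the area of interest",  # --aoi_caption_column
    },
]
pd.DataFrame(rows).to_csv("train_example.csv", index=False)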