Add training scripts and initial model trained on 1% of the data.

Browse files

Files changed (10) hide show

.gitattributes +1 -0
clip_spanish_1_percent/config.json +157 -0
clip_spanish_1_percent/flax_model.msgpack +3 -0
configuration_hybrid_clip.py +1 -0
discard_incorrect_files.py +23 -0
modeling_hybrid_clip.py +1 -0
prepare_wit.py +68 -0
run-clip.sh +18 -0
run_hybrid_clip.py +1 -0
test_on_image.py +34 -0

.gitattributes CHANGED Viewed

@@ -14,3 +14,4 @@
 *.pb filter=lfs diff=lfs merge=lfs -text
 *.pt filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text

 *.pb filter=lfs diff=lfs merge=lfs -text
 *.pt filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

clip_spanish_1_percent/config.json ADDED Viewed

	@@ -0,0 +1,157 @@

+{
+  "architectures": [
+    "HybridCLIP"
+  ],
+  "initializer_factor": 1.0,
+  "model_type": "hybrid-clip",
+  "projection_dim": 512,
+  "seed": 42,
+  "text_config": {
+    "_name_or_path": "dccuchile/bert-base-spanish-wwm-cased",
+    "add_cross_attention": false,
+    "architectures": [
+      "BertForMaskedLM"
+    ],
+    "attention_probs_dropout_prob": 0.1,
+    "bad_words_ids": null,
+    "bos_token_id": null,
+    "chunk_size_feed_forward": 0,
+    "decoder_start_token_id": null,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "early_stopping": false,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": null,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "gradient_checkpointing": false,
+    "hidden_act": "gelu",
+    "hidden_dropout_prob": 0.1,
+    "hidden_size": 768,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "initializer_range": 0.02,
+    "intermediate_size": 3072,
+    "is_decoder": false,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "layer_norm_eps": 1e-12,
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "max_position_embeddings": 512,
+    "min_length": 0,
+    "model_type": "bert",
+    "no_repeat_ngram_size": 0,
+    "num_attention_heads": 12,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_hidden_layers": 12,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_past": true,
+    "output_scores": false,
+    "pad_token_id": 1,
+    "position_embedding_type": "absolute",
+    "prefix": null,
+    "problem_type": null,
+    "pruned_heads": {},
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "sep_token_id": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": null,
+    "torchscript": false,
+    "transformers_version": "4.9.0.dev0",
+    "type_vocab_size": 2,
+    "use_bfloat16": false,
+    "use_cache": true,
+    "vocab_size": 31002
+  },
+  "transformers_version": null,
+  "vision_config": {
+    "_name_or_path": "",
+    "add_cross_attention": false,
+    "architectures": null,
+    "attention_dropout": 0.0,
+    "bad_words_ids": null,
+    "bos_token_id": null,
+    "chunk_size_feed_forward": 0,
+    "decoder_start_token_id": null,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "dropout": 0.0,
+    "early_stopping": false,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": null,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "gradient_checkpointing": false,
+    "hidden_act": "quick_gelu",
+    "hidden_size": 768,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "image_size": 224,
+    "initializer_factor": 1.0,
+    "initializer_range": 0.02,
+    "intermediate_size": 3072,
+    "is_decoder": false,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "layer_norm_eps": 1e-05,
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "min_length": 0,
+    "model_type": "clip_vision_model",
+    "no_repeat_ngram_size": 0,
+    "num_attention_heads": 12,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_hidden_layers": 12,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": null,
+    "patch_size": 32,
+    "prefix": null,
+    "problem_type": null,
+    "pruned_heads": {},
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "sep_token_id": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": null,
+    "torchscript": false,
+    "transformers_version": "4.9.0.dev0",
+    "use_bfloat16": false
+  }
+}

clip_spanish_1_percent/flax_model.msgpack ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:29e4478aa3195ba626a7051a3d2a8d17bb540b4e68d8d75cca2d549104e586c2
+size 792387416

configuration_hybrid_clip.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ /home/eduardogonzalezponferrada/transformers/examples/research_projects/jax-projects/hybrid_clip/configuration_hybrid_clip.py

discard_incorrect_files.py ADDED Viewed

	@@ -0,0 +1,23 @@

+import json
+import os
+import torch
+from torchvision.io import ImageReadMode, read_image
+# SUPPORTED_EXTENSIONS = {'PNG', 'JPG', 'png', 'JPEG', 'jpg', 'jpeg'}
+for split in ["train", "valid", "test"]:
+    with open(f"/home/{os.environ['USER']}/data/wit/prepared_dataset/{split}_dataset.json") as f:
+        examples = [json.loads(line) for line in f.readlines()]
+    supported_examples = []
+    for example in examples:
+        try:
+            image = read_image(example["image_path"], mode=ImageReadMode.RGB)
+            supported_examples.append(json.dumps(example, ensure_ascii=False))
+        except Exception as e:
+            print(f"Excluding file: {example['image_path']} due to error: {e}")
+    print(f"Total {split} examples: {len(supported_examples)}")
+    with open(f"/home/{os.environ['USER']}/data/wit/prepared_dataset/{split}_dataset_filtered.json", "w") as f:
+        f.write("\n".join(supported_examples))

modeling_hybrid_clip.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ /home/eduardogonzalezponferrada/transformers/examples/research_projects/jax-projects/hybrid_clip/modeling_hybrid_clip.py

prepare_wit.py ADDED Viewed

	@@ -0,0 +1,68 @@

+import argparse
+import json
+import logging
+import os
+import time
+import urllib.request
+import urllib.error
+import pandas as pd
+from tqdm import tqdm
+logger = logging.getLogger(__name__)
+def prepare_wit(tsv: str, language: str, output_dir: str, seed: int, train_proportion: float, valid_proportion: float, language_col: str="language", caption_col: str="caption_reference_description", url_col: str="image_url", pause=1.0, retries: int=5):
+    os.makedirs(output_dir, exist_ok=True)
+    df = pd.read_csv(tsv, sep="\t", engine="python")
+    df = df[(df["language"] == language) & (~df["caption_reference_description"].isnull())]
+    # Shuffle
+    df = df.sample(frac=1.0, random_state=seed)
+    lines = []
+    try:
+        with tqdm(total=len(df)) as pbar:
+            for i, row in tqdm(df.iterrows()):
+                url = row[url_col]
+                caption = row[caption_col]
+                # Trim image file names so that they are no longer than 100 characters
+                image_filename = url.split('/')[-1][-100:]
+                image_path = f"{output_dir}/{image_filename}"
+                for retry in range(retries):
+                    try:
+                        # Download file
+                        urllib.request.urlretrieve(url, image_path)
+                        lines.append(json.dumps({"image_path": image_path, "captions": [caption]}, ensure_ascii=False))
+                        break
+                    except urllib.error.HTTPError as e:
+                        time.sleep(pause)
+                if retry == retries:
+                    raise ValueError("Rate limit achieved:", e)
+                pbar.update(1)
+    # Save existing dataset, even upon failure
+    finally:
+        total_lines = len(lines)
+        train_lines = lines[:int(total_lines * train_proportion)]
+        valid_lines = lines[int(total_lines * train_proportion):int(total_lines * (train_proportion + valid_proportion))]
+        test_lines = lines[int(total_lines * (train_proportion + valid_proportion)):]
+        with open(f"{output_dir}/train_dataset.json", "w") as f:
+            f.write("\n".join(train_lines))
+        with open(f"{output_dir}/valid_dataset.json", "w") as f:
+            f.write("\n".join(valid_lines))
+        with open(f"{output_dir}/test_dataset.json", "w") as f:
+            f.write("\n".join(test_lines))
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description = "Download and prepare the WIT dataset")
+    parser.add_argument("--tsv", type=str, default=f"/home/{os.environ['USER']}/data/wit/wit_v1.train.all-1percent_sample.tsv")
+    parser.add_argument("--language", type=str, default="es")
+    parser.add_argument("--output_dir", type=str, default=f"/home/{os.environ['USER']}/data/wit/prepared_dataset")
+    parser.add_argument("--random_seed", type=int, default=0)
+    parser.add_argument("--train_proportion", type=float, default=0.8)
+    parser.add_argument("--valid_proportion", type=float, default=0.1)
+    args = parser.parse_args()
+    assert args.train_proportion + args.valid_proportion < 1.0, "The sum of train_proportion and valid_proportion has to be < 1.0"
+    prepare_wit(args.tsv, args.language, args.output_dir, args.random_seed, args.train_proportion, args.valid_proportion)

run-clip.sh ADDED Viewed

	@@ -0,0 +1,18 @@

+HUB_TOKEN=`cat $HOME/.huggingface/token`
+python run_hybrid_clip.py \
+    --output_dir "./output_dir" \
+    --text_model_name_or_path="dccuchile/bert-base-spanish-wwm-cased" \
+    --vision_model_name_or_path="openai/clip-vit-base-patch32" \
+    --tokenizer_name="dccuchile/bert-base-spanish-wwm-cased" \
+    --train_file="/home/${USER}/data/wit/prepared_dataset/train_dataset_filtered.json" \
+    --validation_file="/home/${USER}/data/wit/prepared_dataset/valid_dataset_filtered.json" \
+    --do_train --do_eval \
+    --num_train_epochs="40" \
+    --max_seq_length 96 \
+    --per_device_train_batch_size="64" \
+    --per_device_eval_batch_size="64" \
+    --learning_rate="5e-5" --warmup_steps="0" --weight_decay 0.1 \
+    --overwrite_output_dir \
+    --preprocessing_num_workers 32
+    #--push_to_hub

run_hybrid_clip.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ /home/eduardogonzalezponferrada/transformers/examples/research_projects/jax-projects/hybrid_clip/run_hybrid_clip.py

test_on_image.py ADDED Viewed

	@@ -0,0 +1,34 @@

+import jax
+import torch
+from torchvision.io import ImageReadMode, read_image
+from transformers import AutoTokenizer
+from modeling_hybrid_clip import FlaxHybridCLIP
+from run_hybrid_clip import Transform
+# Module-level globals shared by the helper functions in this script:
+# the hybrid CLIP weights loaded from "clip_spanish_1_percent" and the
+# tokenizer of the Spanish BERT checkpoint.
+model = FlaxHybridCLIP.from_pretrained("clip_spanish_1_percent")
+tokenizer = AutoTokenizer.from_pretrained("dccuchile/bert-base-spanish-wwm-cased")
+def prepare_image(image_path):
+    image = read_image(image_path, mode=ImageReadMode.RGB)
+    preprocess = Transform(model.config.vision_config.image_size)
+    preprocess = torch.jit.script(preprocess)
+    preprocessed_image = preprocess(image)
+    pixel_values = torch.stack([preprocessed_image]).permute(0, 2, 3, 1).numpy()
+    return pixel_values
+def prepare_text(text):
+    return tokenizer(text, return_tensors="np")
+def run_inference(image_path, text):
+    pixel_values = prepare_image(image_path)
+    input_text = prepare_text(text)
+    model_output = model(input_text["input_ids"], pixel_values, attention_mask=input_text["attention_mask"], token_type_ids=input_text["token_type_ids"], train=False, return_dict=True)
+    logits = model_output["logits_per_image"]
+    score = jax.nn.sigmoid(logits)
+    return score
+# Smoke test: score one hard-coded image/caption pair and print the result.
+# The caption is Spanish for "Interior courtyard of a building".
+# NOTE(review): the image path is absolute and points into one user's home
+# directory, so the script only runs unmodified where that file exists.
+image_path = "/home/eduardogonzalezponferrada/data/wit/full_dataset/Casa_de_Cultura_%284%29.JPG"
+text = "Patio interior de un edificio"
+print(run_inference(image_path, text))