diff --git "a/results_1 (1).html" "b/results_1 (1).html" new file mode 100644--- /dev/null +++ "b/results_1 (1).html" @@ -0,0 +1,16765 @@ + + +
+ + +import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
+pd.options.display.float_format = '{:.3f}'.format
# Maps every benchmark dataset name to its category: "natural", "specialized",
# "structured" or "retrieval".
# Insertion order is significant: the plotting cells use
# list(dataset_type.keys()) as the x-axis order.
dataset_type = {
    "imagenet1k": "natural",
    "imagenetv2": "natural",
    "imagenet-r": "natural",
    "imagenet_sketch": "specialized",
    "objectnet": "natural",
    "imagenet-a": "natural",
    "imagenet-o": "natural",

    "vtab/cifar10": "natural",
    "vtab/cifar100": "natural",
    "mnist": "specialized",

    "vtab/flowers": "natural",
    "cars": "natural",
    "vtab/svhn": "natural",
    "fer2013": "natural",
    "renderedsst2": "specialized",
    "vtab/pets": "natural",
    "vtab/caltech101": "natural",
    "voc2007_multilabel": "natural",
    "voc2007": "natural",
    "sun397": "natural",
    "fgvc_aircraft": "natural",
    "country211": "natural",
    "vtab/dtd": "natural",
    "gtsrb": "natural",
    "stl10": "natural",

    # Datasets tagged "specialized"
    "vtab/diabetic_retinopathy": "specialized",
    "vtab/eurosat": "specialized",
    "vtab/resisc45": "specialized",
    "vtab/pcam": "specialized",

    # Datasets tagged "structured"
    "vtab/clevr_count_all": "structured",
    "vtab/clevr_closest_object_distance": "structured",

    "vtab/dsprites_label_orientation": "structured",
    "vtab/dsprites_label_x_position": "structured",

    "vtab/smallnorb_label_elevation": "structured",
    "vtab/smallnorb_label_azimuth": "structured",

    "vtab/dmlab": "structured",
    "vtab/kitti_closest_vehicle_distance": "structured",

    # Datasets tagged "retrieval"
    "mscoco_captions": "retrieval",
    "flickr8k": "retrieval",
    "flickr30k": "retrieval",
}
+
def extract_arch(model):
    """Return the architecture prefix of a dash-separated model name.

    The architecture is the first three dash-separated fields of ``model``
    joined back with dashes, e.g. ``"ViT-B-32-quickgelu"`` -> ``"ViT-B-32"``.
    Names with fewer than three fields (e.g. ``"RN50"``) are returned
    unchanged rather than raising ``ValueError``.

    Note: fields are taken purely by position, so a prefixed name such as
    ``"roberta-ViT-B-32"`` yields ``"roberta-ViT-B"``.
    """
    return "-".join(model.split("-")[:3])
+
# Load the raw benchmark results.
df = pd.read_csv("benchmark.csv")

# Each stripped line of datasets.txt is treated as a dataset name to keep.
# The context manager guarantees the file handle is closed.
with open("datasets.txt") as datasets_file:
    vtab_plus = [line.strip() for line in datasets_file]

# .copy() makes the filtered frame independent of the original, so the column
# assignments below do not write into a slice (SettingWithCopyWarning).
df = df[df.dataset.isin(vtab_plus)].copy()
df["dataset_type"] = df.dataset.apply(lambda d: dataset_type[d])
df["model_arch"] = df.model.apply(extract_arch)
+
# Replace known checkpoint directory prefixes with the short tag "openclip ".
# regex=False forces a literal match, so the result does not depend on the
# pandas version (the default of `regex` differs between pandas 1.x and 2.x).
for _column in ("model_fullname", "pretrained"):
    for _prefix in ("/fsx/rom1504/open_clip/good_models/", "/fsx/rwightman/"):
        df[_column] = df[_column].str.replace(_prefix, "openclip ", regex=False)
+
# Keep retrieval rows in their own frame; everything else stays in df.
df_retrieval = df.loc[df["dataset_type"] == "retrieval"]
df = df.loc[df["dataset_type"] != "retrieval"]
# The recall@5 columns are only read from df_retrieval later on.
df = df.drop(columns=["image_retrieval_recall@5", "text_retrieval_recall@5"])

# From here on dataset_type only lists the non-retrieval datasets.
dataset_type = {
    name: kind
    for name, kind in dataset_type.items()
    if kind != "retrieval"
}
+
+
# acc1 per dataset, with bars grouped by model_fullname.
fig = plt.figure(figsize=(12, 8))
# x-axis follows the insertion order of dataset_type.
order = list(dataset_type)
ax = sns.barplot(
    data=df,
    x="dataset",
    y="acc1",
    hue="model_fullname",
    order=order,
)
# Rotate the dataset names so the tick labels are vertical.
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
ax
+
<AxesSubplot:xlabel='dataset', ylabel='acc1'>+
# acc1 per dataset, restricted to rows whose model_arch is "ViT-B-32".
fig = plt.figure(figsize=(12, 8))
order = list(dataset_type)
d = df[df.model_arch == "ViT-B-32"]
ax = sns.barplot(
    data=d,
    x="dataset",
    y="acc1",
    hue="model_fullname",
    order=order,
)
# Rotate the dataset names so the tick labels are vertical.
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
ax
+
<AxesSubplot:xlabel='dataset', ylabel='acc1'>+
# acc1 per dataset, aggregated over every row of df (no hue).
# NOTE(review): the "Mean of empty slice" warning this cell emits presumably
# comes from datasets in `order` that have no acc1 values - confirm.
fig = plt.figure(figsize=(12, 8))
order = list(dataset_type)
ax = sns.barplot(data=df, x="dataset", y="acc1", order=order)
# Rotate the dataset names so the tick labels are vertical.
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
ax
+
/home/rom1504/CLIP_benchmark/.env/lib/python3.8/site-packages/seaborn/algorithms.py:98: RuntimeWarning: Mean of empty slice + boot_dist.append(f(*sample, **func_kwargs)) +/home/rom1504/CLIP_benchmark/.env/lib/python3.8/site-packages/numpy/lib/nanfunctions.py:1559: RuntimeWarning: All-NaN slice encountered + r, k = function_base._ureduce(a, ++
<AxesSubplot:xlabel='dataset', ylabel='acc1'>+
# acc1 per dataset, with bars grouped by model_arch.
fig = plt.figure(figsize=(12, 8))
order = list(dataset_type)
ax = sns.barplot(
    data=df,
    x="dataset",
    y="acc1",
    hue="model_arch",
    order=order,
)
# Rotate the dataset names so the tick labels are vertical.
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
ax
+
<AxesSubplot:xlabel='dataset', ylabel='acc1'>+
# acc1 per dataset, with bars grouped by the `pretrained` column.
fig = plt.figure(figsize=(12, 8))
order = list(dataset_type)
d = df.copy()
ax = sns.barplot(
    data=d,
    x="dataset",
    y="acc1",
    hue="pretrained",
    order=order,
)
# Rotate the dataset names so the tick labels are vertical.
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
ax
+
<AxesSubplot:xlabel='dataset', ylabel='acc1'>+
# Best (maximum) acc1 per dataset for each value of `pretrained`,
# drawn without error bars.
fig = plt.figure(figsize=(12, 8))
order = list(dataset_type.keys())
d = df.copy()
ax = sns.barplot(
    x="dataset", y="acc1",
    data=d,
    order=order,
    hue="pretrained",
    estimator=np.max,
    # `ci=None` is deprecated in seaborn; `errorbar=None` has the same effect.
    errorbar=None,
)
# Rotate the dataset names so the tick labels are vertical.
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
ax
+
/tmp/ipykernel_114262/2264146503.py:4: FutureWarning: + +The `ci` parameter is deprecated. Use `errorbar=None` for the same effect. + + ax = sns.barplot( ++
<AxesSubplot:xlabel='dataset', ylabel='acc1'>+
metric = "acc1"
# Rows are datasets, columns are models; any dataset with a missing value
# for at least one model is dropped.
df_metric = (
    df.pivot(index="model_fullname", columns="dataset", values=metric)
    .T
    .dropna()
)
df_metric
+
model_fullname | +ViT-B-32-quickgelu laion400m_e32 | +roberta-ViT-B-32 /fsx/rom1504/open_clip/roberta_B_32/checkpoints/epoch_90.pt | +
---|---|---|
dataset | ++ | + |
cars | +0.792 | +0.832 | +
country211 | +0.147 | +0.147 | +
fer2013 | +0.427 | +0.421 | +
fgvc_aircraft | +0.168 | +0.174 | +
gtsrb | +0.420 | +0.409 | +
imagenet-a | +0.217 | +0.212 | +
imagenet-r | +0.734 | +0.722 | +
imagenet1k | +0.629 | +0.617 | +
imagenet_sketch | +0.493 | +0.491 | +
imagenetv2 | +0.551 | +0.533 | +
mnist | +0.374 | +0.663 | +
objectnet | +0.439 | +0.451 | +
renderedsst2 | +0.526 | +0.544 | +
stl10 | +0.955 | +0.956 | +
sun397 | +0.670 | +0.663 | +
voc2007 | +0.757 | +0.780 | +
vtab/caltech101 | +0.833 | +0.826 | +
vtab/cifar10 | +0.908 | +0.932 | +
vtab/cifar100 | +0.702 | +0.750 | +
vtab/clevr_closest_object_distance | +0.159 | +0.201 | +
vtab/clevr_count_all | +0.163 | +0.147 | +
vtab/diabetic_retinopathy | +0.338 | +0.502 | +
vtab/dmlab | +0.172 | +0.129 | +
vtab/dsprites_label_orientation | +0.019 | +0.025 | +
vtab/dsprites_label_x_position | +0.029 | +0.028 | +
vtab/dtd | +0.543 | +0.591 | +
vtab/eurosat | +0.516 | +0.521 | +
vtab/flowers | +0.683 | +0.621 | +
vtab/kitti_closest_vehicle_distance | +0.288 | +0.387 | +
vtab/pcam | +0.546 | +0.498 | +
vtab/pets | +0.868 | +0.868 | +
vtab/resisc45 | +0.546 | +0.612 | +
vtab/smallnorb_label_azimuth | +0.045 | +0.060 | +
vtab/smallnorb_label_elevation | +0.097 | +0.102 | +
vtab/svhn | +0.279 | +0.442 | +
metric = "mean_per_class_recall"
# Rows are datasets, columns are models; any dataset with a missing value
# for at least one model is dropped.
df_metric = (
    df.pivot(index="model_fullname", columns="dataset", values=metric)
    .T
    .dropna()
)
df_metric
+
model_fullname | +ViT-B-32-quickgelu laion400m_e32 | +roberta-ViT-B-32 /fsx/rom1504/open_clip/roberta_B_32/checkpoints/epoch_90.pt | +
---|---|---|
dataset | ++ | + |
cars | +0.793 | +0.830 | +
country211 | +0.147 | +0.147 | +
fer2013 | +0.399 | +0.401 | +
fgvc_aircraft | +0.166 | +0.174 | +
gtsrb | +0.393 | +0.383 | +
imagenet-a | +0.235 | +0.242 | +
imagenet-r | +0.721 | +0.708 | +
imagenet1k | +0.629 | +0.617 | +
imagenet_sketch | +0.494 | +0.491 | +
imagenetv2 | +0.551 | +0.533 | +
mnist | +0.371 | +0.659 | +
objectnet | +0.427 | +0.440 | +
renderedsst2 | +0.526 | +0.545 | +
stl10 | +0.955 | +0.957 | +
sun397 | +0.661 | +0.664 | +
voc2007 | +0.791 | +0.809 | +
vtab/caltech101 | +0.909 | +0.905 | +
vtab/cifar10 | +0.908 | +0.933 | +
vtab/cifar100 | +0.703 | +0.750 | +
vtab/clevr_closest_object_distance | +0.167 | +0.167 | +
vtab/clevr_count_all | +0.158 | +0.144 | +
vtab/diabetic_retinopathy | +0.259 | +0.202 | +
vtab/dmlab | +0.158 | +0.160 | +
vtab/dsprites_label_orientation | +0.020 | +0.026 | +
vtab/dsprites_label_x_position | +0.031 | +0.028 | +
vtab/dtd | +0.547 | +0.593 | +
vtab/eurosat | +0.526 | +0.534 | +
vtab/flowers | +0.663 | +0.590 | +
vtab/kitti_closest_vehicle_distance | +0.365 | +0.404 | +
vtab/pcam | +0.546 | +0.498 | +
vtab/pets | +0.866 | +0.867 | +
vtab/resisc45 | +0.554 | +0.616 | +
vtab/smallnorb_label_azimuth | +0.045 | +0.060 | +
vtab/smallnorb_label_elevation | +0.097 | +0.102 | +
vtab/svhn | +0.280 | +0.393 | +
# Imagenet robustness results
metric = "acc1"
df_metric = (
    df.pivot(index="model_fullname", columns="dataset", values=metric)
    .T
    .dropna()
)
# Keep only the datasets whose name starts with "imagenet", plus "objectnet".
df_metric.loc[
    df_metric.index.str.startswith("imagenet") | (df_metric.index == "objectnet")
]
+
model_fullname | +ViT-B-32-quickgelu laion400m_e32 | +roberta-ViT-B-32 /fsx/rom1504/open_clip/roberta_B_32/checkpoints/epoch_90.pt | +
---|---|---|
dataset | ++ | + |
imagenet-a | +0.217 | +0.212 | +
imagenet-r | +0.734 | +0.722 | +
imagenet1k | +0.629 | +0.617 | +
imagenet_sketch | +0.493 | +0.491 | +
imagenetv2 | +0.551 | +0.533 | +
objectnet | +0.439 | +0.451 | +
Here, following "Measuring Robustness to Natural Distribution Shifts in Image Classification" (https://arxiv.org/pdf/2007.00644.pdf, https://share.streamlit.io/modestyachts/imagenet-testbed-website/main/website.py), we show the deviation from the line fit of (x=imagenet1k accuracy, y=imagenetv2/imagenet-r/imagenet_sketch accuracy), which was used to measure robustness improvements separately from accuracy improvements on imagenet1k, as the two are correlated.
In the plot below, deviations from the line are improvements in robustness.
# Scatter each model's imagenet1k accuracy against its accuracy on `dataset`
# and overlay the reference linear fit for that dataset.
df_metric = pd.pivot(df, index="model_fullname", columns="dataset", values="acc1").T.dropna()
dataset = "imagenetv2"
line_fits_data = {
    # (slope, intercept) pairs from
    # https://share.streamlit.io/modestyachts/imagenet-testbed-website/main/website.py
    "imagenetv2": (1.112, -20.433),
    "imagenet-r": (1.549, -104.556),
    "imagenet_sketch": (0.931, -45.373),
}
x = np.linspace(0, 100, 100)
slope, intercept = line_fits_data[dataset]
y = x * slope + intercept
plt.xlim(55, 90)
plt.ylim(40, 90)
# Scale by 100 so the values match the percentage axes.
d = df_metric.T[["imagenet1k", dataset]] * 100
# Both artists carry a label so plt.legend() has entries to show; without
# labels it warns "No artists with labels found" and draws an empty legend.
plt.scatter(d["imagenet1k"], d[dataset], color="green", label="evaluated models")
plt.plot(x, y, color="red", label="reference line fit")
plt.xlabel("imagenet1k top-1 accuracy (%)")
plt.ylabel(f"{dataset} top-1 accuracy (%)")
plt.legend()
+
No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. ++
<matplotlib.legend.Legend at 0x7f0ca8673520>+
metric = "mean_per_class_recall"
# Rows are datasets, columns are models; any dataset with a missing value
# for at least one model is dropped.
(
    df.pivot(index="model_fullname", columns="dataset", values=metric)
    .T
    .dropna()
)
+
model_fullname | +ViT-B-32-quickgelu laion400m_e32 | +roberta-ViT-B-32 /fsx/rom1504/open_clip/roberta_B_32/checkpoints/epoch_90.pt | +
---|---|---|
dataset | ++ | + |
cars | +0.793 | +0.830 | +
country211 | +0.147 | +0.147 | +
fer2013 | +0.399 | +0.401 | +
fgvc_aircraft | +0.166 | +0.174 | +
gtsrb | +0.393 | +0.383 | +
imagenet-a | +0.235 | +0.242 | +
imagenet-r | +0.721 | +0.708 | +
imagenet1k | +0.629 | +0.617 | +
imagenet_sketch | +0.494 | +0.491 | +
imagenetv2 | +0.551 | +0.533 | +
mnist | +0.371 | +0.659 | +
objectnet | +0.427 | +0.440 | +
renderedsst2 | +0.526 | +0.545 | +
stl10 | +0.955 | +0.957 | +
sun397 | +0.661 | +0.664 | +
voc2007 | +0.791 | +0.809 | +
vtab/caltech101 | +0.909 | +0.905 | +
vtab/cifar10 | +0.908 | +0.933 | +
vtab/cifar100 | +0.703 | +0.750 | +
vtab/clevr_closest_object_distance | +0.167 | +0.167 | +
vtab/clevr_count_all | +0.158 | +0.144 | +
vtab/diabetic_retinopathy | +0.259 | +0.202 | +
vtab/dmlab | +0.158 | +0.160 | +
vtab/dsprites_label_orientation | +0.020 | +0.026 | +
vtab/dsprites_label_x_position | +0.031 | +0.028 | +
vtab/dtd | +0.547 | +0.593 | +
vtab/eurosat | +0.526 | +0.534 | +
vtab/flowers | +0.663 | +0.590 | +
vtab/kitti_closest_vehicle_distance | +0.365 | +0.404 | +
vtab/pcam | +0.546 | +0.498 | +
vtab/pets | +0.866 | +0.867 | +
vtab/resisc45 | +0.554 | +0.616 | +
vtab/smallnorb_label_azimuth | +0.045 | +0.060 | +
vtab/smallnorb_label_elevation | +0.097 | +0.102 | +
vtab/svhn | +0.280 | +0.393 | +
# For multi-label classification tasks
metric = "mean_average_precision"
# dropna() leaves only the datasets that report this metric for every model.
(
    df.pivot(index="model_fullname", columns="dataset", values=metric)
    .T
    .dropna()
)
+
model_fullname | +ViT-B-32-quickgelu laion400m_e32 | +roberta-ViT-B-32 /fsx/rom1504/open_clip/roberta_B_32/checkpoints/epoch_90.pt | +
---|---|---|
dataset | ++ | + |
voc2007_multilabel | +0.762 | +0.766 | +
metric = "image_retrieval_recall@5"
# Retrieval metrics are read from df_retrieval, which still has the recall@5 columns.
(
    df_retrieval.pivot(index="model_fullname", columns="dataset", values=metric)
    .T
    .dropna()
)
+
model_fullname | +ViT-B-32-quickgelu laion400m_e32 | +roberta-ViT-B-32 /fsx/rom1504/open_clip/roberta_B_32/checkpoints/epoch_90.pt | +
---|---|---|
dataset | ++ | + |
flickr30k | +0.855 | +0.868 | +
flickr8k | +0.579 | +0.595 | +
mscoco_captions | +0.608 | +0.631 | +
metric = "text_retrieval_recall@5"
# Retrieval metrics are read from df_retrieval, which still has the recall@5 columns.
(
    df_retrieval.pivot(index="model_fullname", columns="dataset", values=metric)
    .T
    .dropna()
)
+
model_fullname | +ViT-B-32-quickgelu laion400m_e32 | +roberta-ViT-B-32 /fsx/rom1504/open_clip/roberta_B_32/checkpoints/epoch_90.pt | +
---|---|---|
dataset | ++ | + |
flickr30k | +0.941 | +0.948 | +
flickr8k | +0.739 | +0.751 | +
mscoco_captions | +0.768 | +0.778 | +
See VTAB (https://arxiv.org/pdf/1910.04867.pdf, Section E) for a discussion about different aggregation strategies and how much they correlate. They find that all aggregation strategies have a high Kendall score with the simple top-1 mean accuracy over datasets.
# Mean / std / median of every numeric metric per model, best mean acc1 first.
# Only numeric columns are aggregated: string columns such as 'dataset' or
# 'pretrained' cannot be averaged, trigger a FutureWarning in pandas 1.x and
# are documented to raise in later versions.
(
    df.select_dtypes(include="number")
    .groupby(df["model_fullname"])
    .agg(["mean", "std", "median"])
    .sort_values(by=("acc1", "mean"), ascending=False)
)
+
/tmp/ipykernel_114262/453967910.py:1: FutureWarning: ['dataset', 'model', 'pretrained', 'task', 'dataset_type', 'model_arch'] did not aggregate successfully. If any error is raised this will raise in a future version of pandas. Drop these columns/ops to avoid this warning. + df.groupby("model_fullname").agg(['mean', 'std', 'median']).sort_values(by=("acc1", "mean"), ascending=False) ++
+ | acc1 | +acc5 | +mean_per_class_recall | +mean_average_precision | +||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|
+ | mean | +std | +median | +mean | +std | +median | +mean | +std | +median | +mean | +std | +median | +
model_fullname | ++ | + | + | + | + | + | + | + | + | + | + | + |
roberta-ViT-B-32 /fsx/rom1504/open_clip/roberta_B_32/checkpoints/epoch_90.pt | +0.482 | +0.273 | +0.502 | +0.768 | +0.255 | +0.880 | +0.474 | +0.280 | +0.498 | +0.766 | +NaN | +0.766 | +
ViT-B-32-quickgelu laion400m_e32 | +0.458 | +0.272 | +0.493 | +0.757 | +0.254 | +0.858 | +0.459 | +0.276 | +0.494 | +0.762 | +NaN | +0.762 | +
metric = "acc1"
df_metric = (
    df.pivot(index="model_fullname", columns="dataset", values=metric)
    .T
    .dropna()
)
# Rank the models within each dataset (rank 1 = highest value), then report
# the mean and std of each model's rank across datasets, best mean rank first.
(
    df_metric.rank(axis=1, ascending=False)
    .agg(["mean", "std"])
    .T
    .sort_values(by="mean", ascending=True)
)
+
+ | mean | +std | +
---|---|---|
model_fullname | ++ | + |
roberta-ViT-B-32 /fsx/rom1504/open_clip/roberta_B_32/checkpoints/epoch_90.pt | +1.429 | +0.502 | +
ViT-B-32-quickgelu laion400m_e32 | +1.571 | +0.502 | +
+