Spaces:

Geonmo
/

socratic-models-image-captioning-with-BLOOM

Runtime error

App Files Files Community

geonmo.gu committed on Jul 26, 2022

Commit

fba8607

•

1 Parent(s): 6019f50

initial commit

Browse files

Files changed (10) hide show

README.md +6 -4
app.py +277 -0
k21-1.jpg +0 -0
prompts/categories_places365.txt +365 -0
prompts/extract_text_features.py +154 -0
prompts/openimage-classnames.csv +0 -0
prompts/place365-classnames.txt +365 -0
prompts/tencent-ml-classnames.txt +0 -0
prompts/tencent-ml-images.txt +0 -0
requirements.txt +8 -0

README.md CHANGED Viewed

@@ -1,12 +1,14 @@
 ---
-title: Socratic Models Image Captioning With BLOOM
-emoji: 🔥
-colorFrom: green
-colorTo: yellow
 sdk: gradio
 sdk_version: 3.1.1
 app_file: app.py
 pinned: false
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Socratic Models Image Captioning
+emoji: 👀
+colorFrom: blue
+colorTo: blue
 sdk: gradio
 sdk_version: 3.1.1
 app_file: app.py
 pinned: false
+models:
+- bigscience/bloom
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,277 @@

+import os
+import torch
+import gradio as gr
+import time
+import clip
+#from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
+#from flores200_codes import flores_codes
+import requests
+import csv
+import json
+import wget
+url_dict = {'clip_ViTL14_openimage_classifier_weights.pt': 'https://raw.githubusercontent.com/geonm/socratic-models-demo/master/prompts/clip_ViTL14_openimage_classifier_weights.pt',
+            'clip_ViTL14_place365_classifier_weights.pt': 'https://raw.githubusercontent.com/geonm/socratic-models-demo/master/prompts/clip_ViTL14_place365_classifier_weights.pt',
+            'clip_ViTL14_tencentml_classifier_weights.pt': 'https://raw.githubusercontent.com/geonm/socratic-models-demo/master/prompts/clip_ViTL14_tencentml_classifier_weights.pt'}
+os.makedirs('./prompts', exist_ok=True)
+for k, v in url_dict.items():
+        wget.download(v, out='./prompts')
+os.environ['CUDA_VISIBLE_DEVICES'] = ''
+API_URL = "https://api-inference.huggingface.co/models/bigscience/bloom"
+HF_TOKEN = os.environ["HF_TOKEN"]
+headers = {"Authorization": f"Bearer {HF_TOKEN}"}
+def load_openimage_classnames(csv_path):
+    csv_data = open(csv_path)
+    csv_reader = csv.reader(csv_data)
+    classnames = {idx: row[-1] for idx, row in enumerate(csv_reader)}
+    return classnames
+def load_tencentml_classnames(txt_path):
+    txt_data = open(txt_path)
+    lines = txt_data.readlines()
+    classnames = {idx: line.strip() for idx, line in enumerate(lines)}
+    return classnames
+def build_simple_classifier(clip_model, text_list, template, device):
+    with torch.no_grad():
+        texts = [template(text) for text in text_list]
+        text_inputs = clip.tokenize(texts).to(device)
+        text_features = clip_model.encode_text(text_inputs)
+        text_features /= text_features.norm(dim=-1, keepdim=True)
+    return text_features, {idx: text for idx, text in enumerate(text_list)}
+def load_models():
+    # build model and tokenizer
+    model_dict = {}
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    print('\tLoading CLIP ViT-L/14')
+    clip_model, clip_preprocess = clip.load("ViT-L/14", device=device)
+    print('\tLoading precomputed zeroshot classifier')
+    openimage_classifier_weights = torch.load('./prompts/clip_ViTL14_openimage_classifier_weights.pt', map_location=device).type(torch.FloatTensor)
+    openimage_classnames = load_openimage_classnames('./prompts/openimage-classnames.csv')
+    tencentml_classifier_weights = torch.load('./prompts/clip_ViTL14_tencentml_classifier_weights.pt', map_location=device).type(torch.FloatTensor)
+    tencentml_classnames = load_tencentml_classnames('./prompts/tencent-ml-classnames.txt')
+    place365_classifier_weights = torch.load('./prompts/clip_ViTL14_place365_classifier_weights.pt', map_location=device).type(torch.FloatTensor)
+    place365_classnames = load_tencentml_classnames('./prompts/place365-classnames.txt')
+    print('\tBuilding simple zeroshot classifier')
+    img_types = ['photo', 'cartoon', 'sketch', 'painting']
+    ppl_texts = ['no people', 'people']
+    ifppl_texts = ['is one person', 'are two people', 'are three people', 'are several people', 'are many people']
+    imgtype_classifier_weights, imgtype_classnames = build_simple_classifier(clip_model, img_types, lambda c: f'This is a {c}.', device)
+    ppl_classifier_weights, ppl_classnames = build_simple_classifier(clip_model, ppl_texts, lambda c: f'There are {c} in this photo.', device)
+    ifppl_classifier_weights, ifppl_classnames = build_simple_classifier(clip_model, ifppl_texts, lambda c: f'There {c} in this photo.', device)
+    model_dict['clip_model'] = clip_model
+    model_dict['clip_preprocess'] = clip_preprocess
+    model_dict['openimage_classifier_weights'] = openimage_classifier_weights
+    model_dict['openimage_classnames'] = openimage_classnames
+    model_dict['tencentml_classifier_weights'] = tencentml_classifier_weights
+    model_dict['tencentml_classnames'] = tencentml_classnames
+    model_dict['place365_classifier_weights'] = place365_classifier_weights
+    model_dict['place365_classnames'] = place365_classnames
+    model_dict['imgtype_classifier_weights'] = imgtype_classifier_weights
+    model_dict['imgtype_classnames'] = imgtype_classnames
+    model_dict['ppl_classifier_weights'] = ppl_classifier_weights
+    model_dict['ppl_classnames'] = ppl_classnames
+    model_dict['ifppl_classifier_weights'] = ifppl_classifier_weights
+    model_dict['ifppl_classnames'] = ifppl_classnames
+    model_dict['device'] = device
+    return model_dict
+def drop_gpu(tensor):
+    if torch.cuda.is_available():
+        return tensor.cpu().numpy()
+    else:
+        return tensor.numpy()
+def zeroshot_classifier(image):
+    image_input = model_dict['clip_preprocess'](image).unsqueeze(0).to(model_dict['device'])
+    with torch.no_grad():
+        image_features = model_dict['clip_model'].encode_image(image_input)
+        image_features /= image_features.norm(dim=-1, keepdim=True)
+        sim = (100.0 * image_features @ model_dict['openimage_classifier_weights'].T).softmax(dim=-1)
+        openimage_scores, indices = [drop_gpu(tensor) for tensor in sim[0].topk(10)]
+        openimage_classes = [model_dict['openimage_classnames'][idx] for idx in indices]
+        sim = (100.0 * image_features @ model_dict['tencentml_classifier_weights'].T).softmax(dim=-1)
+        tencentml_scores, indices = [drop_gpu(tensor) for tensor in sim[0].topk(10)]
+        tencentml_classes = [model_dict['tencentml_classnames'][idx] for idx in indices]
+        sim = (100.0 * image_features @ model_dict['place365_classifier_weights'].T).softmax(dim=-1)
+        place365_scores, indices = [drop_gpu(tensor) for tensor in sim[0].topk(10)]
+        place365_classes = [model_dict['place365_classnames'][idx] for idx in indices]
+        sim = (100.0 * image_features @ model_dict['imgtype_classifier_weights'].T).softmax(dim=-1)
+        imgtype_scores, indices = [drop_gpu(tensor) for tensor in sim[0].topk(len(model_dict['imgtype_classnames']))]
+        imgtype_classes = [model_dict['imgtype_classnames'][idx] for idx in indices]
+        sim = (100.0 * image_features @ model_dict['ppl_classifier_weights'].T).softmax(dim=-1)
+        ppl_scores, indices = [drop_gpu(tensor) for tensor in sim[0].topk(len(model_dict['ppl_classnames']))]
+        ppl_classes = [model_dict['ppl_classnames'][idx] for idx in indices]
+        sim = (100.0 * image_features @ model_dict['ifppl_classifier_weights'].T).softmax(dim=-1)
+        ifppl_scores, indices = [drop_gpu(tensor) for tensor in sim[0].topk(len(model_dict['ifppl_classnames']))]
+        ifppl_classes = [model_dict['ifppl_classnames'][idx] for idx in indices]
+    return image_features, openimage_scores, openimage_classes, tencentml_scores, tencentml_classes,\
+           place365_scores, place365_classes, imgtype_scores, imgtype_classes,\
+           ppl_scores, ppl_classes, ifppl_scores, ifppl_classes
+def generate_prompt(openimage_classes, tencentml_classes, place365_classes, imgtype_classes, ppl_classes, ifppl_classes):
+    img_type = imgtype_classes[0]
+    ppl_result = ppl_classes[0]
+    if ppl_result == 'people':
+        ppl_result = ifppl_classes[0]
+    else:
+        ppl_result = 'are %s' % ppl_result
+    sorted_places = place365_classes
+    object_list = ''
+    for cls in tencentml_classes:
+        object_list += f'{cls}, '
+    for cls in openimage_classes[:2]:
+        object_list += f'{cls}, '
+    object_list = object_list[:-2]
+    prompt_caption = f'''I am an intelligent image captioning bot.
+    This image is a {img_type}. There {ppl_result}.
+    I think this photo was taken at a {sorted_places[0]}, {sorted_places[1]}, or {sorted_places[2]}.
+    I think there might be a {object_list} in this {img_type}.
+    A creative short caption I can generate to describe this image is:'''
+    #prompt_search = f'''Let's list keywords that include the following description.
+    #This image is a {img_type}. There {ppl_result}.
+    #I think this photo was taken at a {sorted_places[0]}, {sorted_places[1]}, or {sorted_places[2]}.
+    #I think there might be a {object_list} in this {img_type}.
+    #Relevant keywords which we can list and are seperated with comma are:'''
+    return prompt_caption
+def generate_captions(prompt, num_captions=3):
+    headers = {"Authorization": f"Bearer {HF_TOKEN}"}
+    max_length = 16
+    seed = 42
+    sample_or_greedy = 'Greedy'
+    input_sentence = prompt
+    if sample_or_greedy == "Sample":
+        parameters = {
+            "max_new_tokens": max_length,
+            "top_p": 0.7,
+            "do_sample": True,
+            "seed": seed,
+            "early_stopping": False,
+            "length_penalty": 0.0,
+            "eos_token_id": None,
+        }
+    else:
+        parameters = {
+            "max_new_tokens": max_length,
+            "do_sample": False,
+            "seed": seed,
+            "early_stopping": False,
+            "length_penalty": 0.0,
+            "eos_token_id": None,
+        }
+    payload = {"inputs": input_sentence, "parameters": parameters,"options" : {"use_cache": False}}
+    bloom_results = []
+    for _ in range(num_captions):
+        response = requests.post(API_URL, headers=headers, json=payload)
+        output = response.json()
+        generated_text = output[0]['generated_text'].replace(prompt, '')
+        bloom_results.append(generated_text)
+    return bloom_results
+def sorting_texts(image_features, captions):
+    with torch.no_grad():
+        text_inputs = clip.tokenize(captions).to(model_dict['device'])
+        text_features = model_dict['clip_model'].encode_text(text_inputs)
+        text_features /= text_features.norm(dim=-1, keepdim=True)
+        sim = (100.0 * image_features @ text_features.T).softmax(dim=-1)
+        scores, indices = [drop_gpu(tensor) for tensor in sim[0].topk(len(captions))]
+        sorted_captions = [captions[idx] for idx in indices]
+    return scores, sorted_captions
+def postprocess_results(scores, classes):
+    scores = [float('%.4f' % float(val)) for val in scores]
+    outputs = []
+    for score, cls in zip(scores, classes):
+        outputs.append({'score': score, 'output': cls})
+    return outputs
+def image_captioning(image):
+    start_time = time.time()
+    image_features, openimage_scores, openimage_classes, tencentml_scores, tencentml_classes, place365_scores, place365_classes, imgtype_scores, imgtype_classes, ppl_scores, ppl_classes, ifppl_scores, ifppl_classes = zeroshot_classifier(image)
+    end_zeroshot = time.time()
+    prompt_caption = generate_prompt(openimage_classes, tencentml_classes, place365_classes, imgtype_classes, ppl_classes, ifppl_classes)
+    generated_captions = generate_captions(prompt_caption, num_captions=1)
+    end_bloom = time.time()
+    caption_scores, sorted_captions = sorting_texts(image_features, generated_captions)
+    output_dict = {}
+    output_dict['inference_time'] = {'CLIP inference': end_zeroshot - start_time,
+                                     'BLOOM request': end_bloom - end_zeroshot}
+    output_dict['generated_captions'] = postprocess_results(caption_scores, sorted_captions)
+    output_dict['reasoning'] = {'openimage_results': postprocess_results(openimage_scores, openimage_classes),
+                                'tencentml_results': postprocess_results(tencentml_scores, tencentml_classes),
+                                'place365_results': postprocess_results(place365_scores, place365_classes),
+                                'imgtype_results': postprocess_results(imgtype_scores, imgtype_classes),
+                                'ppl_results': postprocess_results(ppl_scores, ppl_classes),
+                                'ifppl_results': postprocess_results(ifppl_scores, ifppl_classes)}
+    return output_dict
+if __name__ == '__main__':
+    print('\tinit models')
+    global model_dict
+    model_dict = load_models()
+    # define gradio demo
+    inputs = [gr.inputs.Image(type="pil", label="Image")
+              ]
+    outputs = gr.outputs.JSON()
+    title = "Socratic models for image captioning with BLOOM"
+    demo_status = "Demo is running on CPU"
+    description = f"Details: https://github.com/geonm/socratic-models-demo. {demo_status}"
+    article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2204.00598'>Socratic Models: Composing Zero-Shot Multimodal Reasoning with Language</a></p>"
+    examples = ['k21-1.jpg']
+    gr.Interface(image_captioning,
+                 inputs,
+                 outputs,
+                 title=title,
+                 description=description,
+                 article=article,
+                 examples=examples,
+                 #examples_per_page=50,
+                 ).launch()

k21-1.jpg ADDED Viewed

prompts/categories_places365.txt ADDED Viewed

	@@ -0,0 +1,365 @@

+/a/airfield 0
+/a/airplane_cabin 1
+/a/airport_terminal 2
+/a/alcove 3
+/a/alley 4
+/a/amphitheater 5
+/a/amusement_arcade 6
+/a/amusement_park 7
+/a/apartment_building/outdoor 8
+/a/aquarium 9
+/a/aqueduct 10
+/a/arcade 11
+/a/arch 12
+/a/archaelogical_excavation 13
+/a/archive 14
+/a/arena/hockey 15
+/a/arena/performance 16
+/a/arena/rodeo 17
+/a/army_base 18
+/a/art_gallery 19
+/a/art_school 20
+/a/art_studio 21
+/a/artists_loft 22
+/a/assembly_line 23
+/a/athletic_field/outdoor 24
+/a/atrium/public 25
+/a/attic 26
+/a/auditorium 27
+/a/auto_factory 28
+/a/auto_showroom 29
+/b/badlands 30
+/b/bakery/shop 31
+/b/balcony/exterior 32
+/b/balcony/interior 33
+/b/ball_pit 34
+/b/ballroom 35
+/b/bamboo_forest 36
+/b/bank_vault 37
+/b/banquet_hall 38
+/b/bar 39
+/b/barn 40
+/b/barndoor 41
+/b/baseball_field 42
+/b/basement 43
+/b/basketball_court/indoor 44
+/b/bathroom 45
+/b/bazaar/indoor 46
+/b/bazaar/outdoor 47
+/b/beach 48
+/b/beach_house 49
+/b/beauty_salon 50
+/b/bedchamber 51
+/b/bedroom 52
+/b/beer_garden 53
+/b/beer_hall 54
+/b/berth 55
+/b/biology_laboratory 56
+/b/boardwalk 57
+/b/boat_deck 58
+/b/boathouse 59
+/b/bookstore 60
+/b/booth/indoor 61
+/b/botanical_garden 62
+/b/bow_window/indoor 63
+/b/bowling_alley 64
+/b/boxing_ring 65
+/b/bridge 66
+/b/building_facade 67
+/b/bullring 68
+/b/burial_chamber 69
+/b/bus_interior 70
+/b/bus_station/indoor 71
+/b/butchers_shop 72
+/b/butte 73
+/c/cabin/outdoor 74
+/c/cafeteria 75
+/c/campsite 76
+/c/campus 77
+/c/canal/natural 78
+/c/canal/urban 79
+/c/candy_store 80
+/c/canyon 81
+/c/car_interior 82
+/c/carrousel 83
+/c/castle 84
+/c/catacomb 85
+/c/cemetery 86
+/c/chalet 87
+/c/chemistry_lab 88
+/c/childs_room 89
+/c/church/indoor 90
+/c/church/outdoor 91
+/c/classroom 92
+/c/clean_room 93
+/c/cliff 94
+/c/closet 95
+/c/clothing_store 96
+/c/coast 97
+/c/cockpit 98
+/c/coffee_shop 99
+/c/computer_room 100
+/c/conference_center 101
+/c/conference_room 102
+/c/construction_site 103
+/c/corn_field 104
+/c/corral 105
+/c/corridor 106
+/c/cottage 107
+/c/courthouse 108
+/c/courtyard 109
+/c/creek 110
+/c/crevasse 111
+/c/crosswalk 112
+/d/dam 113
+/d/delicatessen 114
+/d/department_store 115
+/d/desert/sand 116
+/d/desert/vegetation 117
+/d/desert_road 118
+/d/diner/outdoor 119
+/d/dining_hall 120
+/d/dining_room 121
+/d/discotheque 122
+/d/doorway/outdoor 123
+/d/dorm_room 124
+/d/downtown 125
+/d/dressing_room 126
+/d/driveway 127
+/d/drugstore 128
+/e/elevator/door 129
+/e/elevator_lobby 130
+/e/elevator_shaft 131
+/e/embassy 132
+/e/engine_room 133
+/e/entrance_hall 134
+/e/escalator/indoor 135
+/e/excavation 136
+/f/fabric_store 137
+/f/farm 138
+/f/fastfood_restaurant 139
+/f/field/cultivated 140
+/f/field/wild 141
+/f/field_road 142
+/f/fire_escape 143
+/f/fire_station 144
+/f/fishpond 145
+/f/flea_market/indoor 146
+/f/florist_shop/indoor 147
+/f/food_court 148
+/f/football_field 149
+/f/forest/broadleaf 150
+/f/forest_path 151
+/f/forest_road 152
+/f/formal_garden 153
+/f/fountain 154
+/g/galley 155
+/g/garage/indoor 156
+/g/garage/outdoor 157
+/g/gas_station 158
+/g/gazebo/exterior 159
+/g/general_store/indoor 160
+/g/general_store/outdoor 161
+/g/gift_shop 162
+/g/glacier 163
+/g/golf_course 164
+/g/greenhouse/indoor 165
+/g/greenhouse/outdoor 166
+/g/grotto 167
+/g/gymnasium/indoor 168
+/h/hangar/indoor 169
+/h/hangar/outdoor 170
+/h/harbor 171
+/h/hardware_store 172
+/h/hayfield 173
+/h/heliport 174
+/h/highway 175
+/h/home_office 176
+/h/home_theater 177
+/h/hospital 178
+/h/hospital_room 179
+/h/hot_spring 180
+/h/hotel/outdoor 181
+/h/hotel_room 182
+/h/house 183
+/h/hunting_lodge/outdoor 184
+/i/ice_cream_parlor 185
+/i/ice_floe 186
+/i/ice_shelf 187
+/i/ice_skating_rink/indoor 188
+/i/ice_skating_rink/outdoor 189
+/i/iceberg 190
+/i/igloo 191
+/i/industrial_area 192
+/i/inn/outdoor 193
+/i/islet 194
+/j/jacuzzi/indoor 195
+/j/jail_cell 196
+/j/japanese_garden 197
+/j/jewelry_shop 198
+/j/junkyard 199
+/k/kasbah 200
+/k/kennel/outdoor 201
+/k/kindergarden_classroom 202
+/k/kitchen 203
+/l/lagoon 204
+/l/lake/natural 205
+/l/landfill 206
+/l/landing_deck 207
+/l/laundromat 208
+/l/lawn 209
+/l/lecture_room 210
+/l/legislative_chamber 211
+/l/library/indoor 212
+/l/library/outdoor 213
+/l/lighthouse 214
+/l/living_room 215
+/l/loading_dock 216
+/l/lobby 217
+/l/lock_chamber 218
+/l/locker_room 219
+/m/mansion 220
+/m/manufactured_home 221
+/m/market/indoor 222
+/m/market/outdoor 223
+/m/marsh 224
+/m/martial_arts_gym 225
+/m/mausoleum 226
+/m/medina 227
+/m/mezzanine 228
+/m/moat/water 229
+/m/mosque/outdoor 230
+/m/motel 231
+/m/mountain 232
+/m/mountain_path 233
+/m/mountain_snowy 234
+/m/movie_theater/indoor 235
+/m/museum/indoor 236
+/m/museum/outdoor 237
+/m/music_studio 238
+/n/natural_history_museum 239
+/n/nursery 240
+/n/nursing_home 241
+/o/oast_house 242
+/o/ocean 243
+/o/office 244
+/o/office_building 245
+/o/office_cubicles 246
+/o/oilrig 247
+/o/operating_room 248
+/o/orchard 249
+/o/orchestra_pit 250
+/p/pagoda 251
+/p/palace 252
+/p/pantry 253
+/p/park 254
+/p/parking_garage/indoor 255
+/p/parking_garage/outdoor 256
+/p/parking_lot 257
+/p/pasture 258
+/p/patio 259
+/p/pavilion 260
+/p/pet_shop 261
+/p/pharmacy 262
+/p/phone_booth 263
+/p/physics_laboratory 264
+/p/picnic_area 265
+/p/pier 266
+/p/pizzeria 267
+/p/playground 268
+/p/playroom 269
+/p/plaza 270
+/p/pond 271
+/p/porch 272
+/p/promenade 273
+/p/pub/indoor 274
+/r/racecourse 275
+/r/raceway 276
+/r/raft 277
+/r/railroad_track 278
+/r/rainforest 279
+/r/reception 280
+/r/recreation_room 281
+/r/repair_shop 282
+/r/residential_neighborhood 283
+/r/restaurant 284
+/r/restaurant_kitchen 285
+/r/restaurant_patio 286
+/r/rice_paddy 287
+/r/river 288
+/r/rock_arch 289
+/r/roof_garden 290
+/r/rope_bridge 291
+/r/ruin 292
+/r/runway 293
+/s/sandbox 294
+/s/sauna 295
+/s/schoolhouse 296
+/s/science_museum 297
+/s/server_room 298
+/s/shed 299
+/s/shoe_shop 300
+/s/shopfront 301
+/s/shopping_mall/indoor 302
+/s/shower 303
+/s/ski_resort 304
+/s/ski_slope 305
+/s/sky 306
+/s/skyscraper 307
+/s/slum 308
+/s/snowfield 309
+/s/soccer_field 310
+/s/stable 311
+/s/stadium/baseball 312
+/s/stadium/football 313
+/s/stadium/soccer 314
+/s/stage/indoor 315
+/s/stage/outdoor 316
+/s/staircase 317
+/s/storage_room 318
+/s/street 319
+/s/subway_station/platform 320
+/s/supermarket 321
+/s/sushi_bar 322
+/s/swamp 323
+/s/swimming_hole 324
+/s/swimming_pool/indoor 325
+/s/swimming_pool/outdoor 326
+/s/synagogue/outdoor 327
+/t/television_room 328
+/t/television_studio 329
+/t/temple/asia 330
+/t/throne_room 331
+/t/ticket_booth 332
+/t/topiary_garden 333
+/t/tower 334
+/t/toyshop 335
+/t/train_interior 336
+/t/train_station/platform 337
+/t/tree_farm 338
+/t/tree_house 339
+/t/trench 340
+/t/tundra 341
+/u/underwater/ocean_deep 342
+/u/utility_room 343
+/v/valley 344
+/v/vegetable_garden 345
+/v/veterinarians_office 346
+/v/viaduct 347
+/v/village 348
+/v/vineyard 349
+/v/volcano 350
+/v/volleyball_court/outdoor 351
+/w/waiting_room 352
+/w/water_park 353
+/w/water_tower 354
+/w/waterfall 355
+/w/watering_hole 356
+/w/wave 357
+/w/wet_bar 358
+/w/wheat_field 359
+/w/wind_farm 360
+/w/windmill 361
+/y/yard 362
+/y/youth_hostel 363
+/z/zen_garden 364

prompts/extract_text_features.py ADDED Viewed

	@@ -0,0 +1,154 @@

+import os
+import numpy as np
+import torch
+import clip
+import csv
+import tqdm
+from profanity_filter import ProfanityFilter
+# Prompt templates: each lambda wraps a class name ``c`` into one short
+# caption-like sentence. The encoding loop at the end of this script renders
+# every class name through all of them and averages the resulting CLIP text
+# features into a single classifier weight vector per class.
+templates = [
+    lambda c: f'a bad photo of a {c}.',
+    lambda c: f'a photo of many {c}.',
+    lambda c: f'a sculpture of a {c}.',
+    lambda c: f'a photo of the hard to see {c}.',
+    lambda c: f'a low resolution photo of the {c}.',
+    lambda c: f'a rendering of a {c}.',
+    lambda c: f'graffiti of a {c}.',
+    lambda c: f'a bad photo of the {c}.',
+    lambda c: f'a cropped photo of the {c}.',
+    lambda c: f'a tattoo of a {c}.',
+    lambda c: f'the embroidered {c}.',
+    lambda c: f'a photo of a hard to see {c}.',
+    lambda c: f'a bright photo of a {c}.',
+    lambda c: f'a photo of a clean {c}.',
+    lambda c: f'a photo of a dirty {c}.',
+    lambda c: f'a dark photo of the {c}.',
+    lambda c: f'a drawing of a {c}.',
+    lambda c: f'a photo of my {c}.',
+    lambda c: f'the plastic {c}.',
+    lambda c: f'a photo of the cool {c}.',
+    lambda c: f'a close-up photo of a {c}.',
+    lambda c: f'a black and white photo of the {c}.',
+    lambda c: f'a painting of the {c}.',
+    lambda c: f'a painting of a {c}.',
+    lambda c: f'a pixelated photo of the {c}.',
+    lambda c: f'a sculpture of the {c}.',
+    lambda c: f'a bright photo of the {c}.',
+    lambda c: f'a cropped photo of a {c}.',
+    lambda c: f'a plastic {c}.',
+    lambda c: f'a photo of the dirty {c}.',
+    lambda c: f'a jpeg corrupted photo of a {c}.',
+    lambda c: f'a blurry photo of the {c}.',
+    lambda c: f'a photo of the {c}.',
+    lambda c: f'a good photo of the {c}.',
+    lambda c: f'a rendering of the {c}.',
+    lambda c: f'a {c} in a video game.',
+    lambda c: f'a photo of one {c}.',
+    lambda c: f'a doodle of a {c}.',
+    lambda c: f'a close-up photo of the {c}.',
+    lambda c: f'a photo of a {c}.',
+    lambda c: f'the origami {c}.',
+    lambda c: f'the {c} in a video game.',
+    lambda c: f'a sketch of a {c}.',
+    lambda c: f'a doodle of the {c}.',
+    lambda c: f'a origami {c}.',
+    lambda c: f'a low resolution photo of a {c}.',
+    lambda c: f'the toy {c}.',
+    lambda c: f'a rendition of the {c}.',
+    lambda c: f'a photo of the clean {c}.',
+    lambda c: f'a photo of a large {c}.',
+    lambda c: f'a rendition of a {c}.',
+    lambda c: f'a photo of a nice {c}.',
+    lambda c: f'a photo of a weird {c}.',
+    lambda c: f'a blurry photo of a {c}.',
+    lambda c: f'a cartoon {c}.',
+    lambda c: f'art of a {c}.',
+    lambda c: f'a sketch of the {c}.',
+    lambda c: f'a embroidered {c}.',
+    lambda c: f'a pixelated photo of a {c}.',
+    lambda c: f'itap of the {c}.',
+    lambda c: f'a jpeg corrupted photo of the {c}.',
+    lambda c: f'a good photo of a {c}.',
+    lambda c: f'a plushie {c}.',
+    lambda c: f'a photo of the nice {c}.',
+    lambda c: f'a photo of the small {c}.',
+    lambda c: f'a photo of the weird {c}.',
+    lambda c: f'the cartoon {c}.',
+    lambda c: f'art of the {c}.',
+    lambda c: f'a drawing of the {c}.',
+    lambda c: f'a photo of the large {c}.',
+    lambda c: f'a black and white photo of a {c}.',
+    lambda c: f'the plushie {c}.',
+    lambda c: f'a dark photo of a {c}.',
+    lambda c: f'itap of a {c}.',
+    lambda c: f'graffiti of the {c}.',
+    lambda c: f'a toy {c}.',
+    lambda c: f'itap of my {c}.',
+    lambda c: f'a photo of a cool {c}.',
+    lambda c: f'a photo of a small {c}.',
+    lambda c: f'a tattoo of the {c}.',
+]
+os.environ['CUDA_VISIBLE_DEVICES'] = '0'
+device = "cuda" if torch.cuda.is_available() else "cpu"
+clip_model, clip_preprocess = clip.load("ViT-L/14", device=device)
+'''
+csv_data = open('openimage-classnames.csv')
+csv_reader = csv.reader(csv_data)
+class_names = []
+for row in csv_reader:
+    class_names.append(row[-1])
+'''
+'''
+txt_data = open('tencent-ml-images.txt')
+pf = ProfanityFilter()
+lines = txt_data.readlines()
+class_names = []
+for line in lines[4:]:
+    class_name_precook = line.strip().split('\t')[-1]
+    safe_list = ''
+    for class_name in class_name_precook.split(', '):
+        if pf.is_clean(class_name):
+            safe_list += '%s, ' % class_name
+    safe_list = safe_list[:-2]
+    if len(safe_list) > 0:
+        class_names.append(safe_list)
+f_w = open('tencent-ml-classnames.txt', 'w')
+for cln in class_names:
+    f_w.write('%s\n' % cln)
+f_w.close()
+'''
+place_categories = np.loadtxt('categories_places365.txt', dtype=str)
+place_texts = []
+for place in place_categories[:, 0]:
+    place = place.split('/')[2:]
+    if len(place) > 1:
+        place = place[1] + ' ' + place[0]
+    else:
+        place = place[0]
+    place = place.replace('_', ' ')
+    place_texts.append(place)
+class_names = place_texts
+f_w = open('place365-classnames.txt', 'w')
+for cln in class_names:
+    f_w.write('%s\n' % cln)
+f_w.close()
+print(class_names)
+class_weights = []
+with torch.no_grad():
+    for classname in tqdm.tqdm(class_names, desc='encoding text'):
+        texts = [template(classname) for template in templates]
+        text_inputs = clip.tokenize(texts).to(device)
+        text_features = clip_model.encode_text(text_inputs)
+        text_features /= text_features.norm(dim=-1, keepdim=True)
+        text_features = text_features.mean(dim=0)
+        text_features /= text_features.norm()
+        class_weights.append(text_features)
+class_weights = torch.stack(class_weights)
+print(class_weights.shape)
+#torch.save(class_weights, 'clip_ViTL14_openimage_classifier_weights.pt')
+torch.save(class_weights, 'clip_ViTL14_place365_classifier_weights.pt')

prompts/openimage-classnames.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

prompts/place365-classnames.txt ADDED Viewed

	@@ -0,0 +1,365 @@

+airfield
+airplane cabin
+airport terminal
+alcove
+alley
+amphitheater
+amusement arcade
+amusement park
+outdoor apartment building
+aquarium
+aqueduct
+arcade
+arch
+archaelogical excavation
+archive
+hockey arena
+performance arena
+rodeo arena
+army base
+art gallery
+art school
+art studio
+artists loft
+assembly line
+outdoor athletic field
+public atrium
+attic
+auditorium
+auto factory
+auto showroom
+badlands
+shop bakery
+exterior balcony
+interior balcony
+ball pit
+ballroom
+bamboo forest
+bank vault
+banquet hall
+bar
+barn
+barndoor
+baseball field
+basement
+indoor basketball court
+bathroom
+indoor bazaar
+outdoor bazaar
+beach
+beach house
+beauty salon
+bedchamber
+bedroom
+beer garden
+beer hall
+berth
+biology laboratory
+boardwalk
+boat deck
+boathouse
+bookstore
+indoor booth
+botanical garden
+indoor bow window
+bowling alley
+boxing ring
+bridge
+building facade
+bullring
+burial chamber
+bus interior
+indoor bus station
+butchers shop
+butte
+outdoor cabin
+cafeteria
+campsite
+campus
+natural canal
+urban canal
+candy store
+canyon
+car interior
+carrousel
+castle
+catacomb
+cemetery
+chalet
+chemistry lab
+childs room
+indoor church
+outdoor church
+classroom
+clean room
+cliff
+closet
+clothing store
+coast
+cockpit
+coffee shop
+computer room
+conference center
+conference room
+construction site
+corn field
+corral
+corridor
+cottage
+courthouse
+courtyard
+creek
+crevasse
+crosswalk
+dam
+delicatessen
+department store
+sand desert
+vegetation desert
+desert road
+outdoor diner
+dining hall
+dining room
+discotheque
+outdoor doorway
+dorm room
+downtown
+dressing room
+driveway
+drugstore
+door elevator
+elevator lobby
+elevator shaft
+embassy
+engine room
+entrance hall
+indoor escalator
+excavation
+fabric store
+farm
+fastfood restaurant
+cultivated field
+wild field
+field road
+fire escape
+fire station
+fishpond
+indoor flea market
+indoor florist shop
+food court
+football field
+broadleaf forest
+forest path
+forest road
+formal garden
+fountain
+galley
+indoor garage
+outdoor garage
+gas station
+exterior gazebo
+indoor general store
+outdoor general store
+gift shop
+glacier
+golf course
+indoor greenhouse
+outdoor greenhouse
+grotto
+indoor gymnasium
+indoor hangar
+outdoor hangar
+harbor
+hardware store
+hayfield
+heliport
+highway
+home office
+home theater
+hospital
+hospital room
+hot spring
+outdoor hotel
+hotel room
+house
+outdoor hunting lodge
+ice cream parlor
+ice floe
+ice shelf
+indoor ice skating rink
+outdoor ice skating rink
+iceberg
+igloo
+industrial area
+outdoor inn
+islet
+indoor jacuzzi
+jail cell
+japanese garden
+jewelry shop
+junkyard
+kasbah
+outdoor kennel
+kindergarden classroom
+kitchen
+lagoon
+natural lake
+landfill
+landing deck
+laundromat
+lawn
+lecture room
+legislative chamber
+indoor library
+outdoor library
+lighthouse
+living room
+loading dock
+lobby
+lock chamber
+locker room
+mansion
+manufactured home
+indoor market
+outdoor market
+marsh
+martial arts gym
+mausoleum
+medina
+mezzanine
+water moat
+outdoor mosque
+motel
+mountain
+mountain path
+mountain snowy
+indoor movie theater
+indoor museum
+outdoor museum
+music studio
+natural history museum
+nursery
+nursing home
+oast house
+ocean
+office
+office building
+office cubicles
+oilrig
+operating room
+orchard
+orchestra pit
+pagoda
+palace
+pantry
+park
+indoor parking garage
+outdoor parking garage
+parking lot
+pasture
+patio
+pavilion
+pet shop
+pharmacy
+phone booth
+physics laboratory
+picnic area
+pier
+pizzeria
+playground
+playroom
+plaza
+pond
+porch
+promenade
+indoor pub
+racecourse
+raceway
+raft
+railroad track
+rainforest
+reception
+recreation room
+repair shop
+residential neighborhood
+restaurant
+restaurant kitchen
+restaurant patio
+rice paddy
+river
+rock arch
+roof garden
+rope bridge
+ruin
+runway
+sandbox
+sauna
+schoolhouse
+science museum
+server room
+shed
+shoe shop
+shopfront
+indoor shopping mall
+shower
+ski resort
+ski slope
+sky
+skyscraper
+slum
+snowfield
+soccer field
+stable
+baseball stadium
+football stadium
+soccer stadium
+indoor stage
+outdoor stage
+staircase
+storage room
+street
+platform subway station
+supermarket
+sushi bar
+swamp
+swimming hole
+indoor swimming pool
+outdoor swimming pool
+outdoor synagogue
+television room
+television studio
+asia temple
+throne room
+ticket booth
+topiary garden
+tower
+toyshop
+train interior
+platform train station
+tree farm
+tree house
+trench
+tundra
+ocean deep underwater
+utility room
+valley
+vegetable garden
+veterinarians office
+viaduct
+village
+vineyard
+volcano
+outdoor volleyball court
+waiting room
+water park
+water tower
+waterfall
+watering hole
+wave
+wet bar
+wheat field
+wind farm
+windmill
+yard
+youth hostel
+zen garden

prompts/tencent-ml-classnames.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

prompts/tencent-ml-images.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

requirements.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+git+https://github.com/huggingface/transformers
+ftfy
+regex
+tqdm
+git+https://github.com/openai/CLIP.git
+gradio
+torch
+wget