diff --git "a/Modelo_colab_bueno" "b/Modelo_colab_bueno" deleted file mode 100644--- "a/Modelo_colab_bueno" +++ /dev/null @@ -1 +0,0 @@ -{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[{"file_id":"1NtcTgRbSBKN7pYD3Vdx1j9m8pt3fhFDB","timestamp":1714752649597}],"gpuType":"A100","machine_shape":"hm"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"},"accelerator":"GPU"},"cells":[
\n"," \n","
\n","\n","This tutorial showcases how one can fine-tune Idefics2 on their own use-case.\n","\n","Idefics2 is an open multimodal model that accepts arbitrary sequences of image and text inputs and produces text outputs. The model can answer questions about images, describe visual content, create stories grounded on multiple images, or simply behave as a pure language model without visual inputs. It improves upon Idefics1, significantly enhancing capabilities around OCR, document understanding and visual reasoning.\n","\n","Read more about Idefics2 this [blogpost](https://huggingface.co/blog/idefics2) and the [model card](https://huggingface.co/HuggingFaceM4/idefics2-8b)."],"metadata":{"id":"UiDl78dSKKHK"}},{"cell_type":"markdown","source":["# Setup\n","\n","We first setup the environment with the primary necessary libraries and login into Hugging Face."],"metadata":{"id":"LEn_UMRFKMua"}},{"cell_type":"code","source":["!pip install -q git+https://github.com/huggingface/transformers.git\n","!pip install -q accelerate datasets peft bitsandbytes\n"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"RsPQHbF3KM97","executionInfo":{"status":"ok","timestamp":1715533611368,"user_tz":-120,"elapsed":105228,"user":{"displayName":"Qilu Diana Wu","userId":"15924277351051054428"}},"outputId":"68bdfeac-87d8-4a74-d0c0-5e8238b75c72"},"execution_count":1,"outputs":[{"output_type":"stream","name":"stdout","text":[" Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n"," Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n"," Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m401.2/401.2 kB\u001b[0m \u001b[31m3.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25h Building wheel for transformers (pyproject.toml) ... 
\u001b[?25l\u001b[?25hdone\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m302.6/302.6 kB\u001b[0m \u001b[31m2.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m542.0/542.0 kB\u001b[0m \u001b[31m9.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m199.1/199.1 kB\u001b[0m \u001b[31m23.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m119.8/119.8 MB\u001b[0m \u001b[31m10.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.3/116.3 kB\u001b[0m \u001b[31m13.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m22.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m16.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25h"]}]},{"cell_type":"code","source":["from huggingface_hub import notebook_login\n","\n","notebook_login()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":145,"referenced_widgets":["03d462afc44d4e0aa1bce8467b70de43","53b7d4b493114063b41977820628a2e2","77bdad6f014b484f98bdd6e0547c1dfe","3e17e3e11c084a47affa36807e6d4295","25329a3c73cd48d9972bae2df3b59423","629c658e285f4b6a97e6db149e745508","23de99060d0047c38e3ef1597923f3b9","3ba29e1a318a4cf6b3a065f7ee4bdeb1","0cbec2a1a4c04798a8ff87cba5a8c628","3c9980f7c9d7425cbedcc34370766fb3","9a7ed016aa1b4af7887a8479ac890928","4261664530b047bf88adc2b5521eb9d6","c46def5b24054fb796991618642202e3","b83eece01c204e1db67f191ac4ebda20","c6b95e0a0ab5470babfd171bd0b63dc2","f523ba2d68844c83b98e6f994fac21a2","d1b856b16ea84158a662bc2a9bad897d","a9d9632cfbff4c3f9b86a59fd85129ab","88550470171d4c1fb0b2f6b255057079","754a44a709db439bb309f3cbfd773b12","5dd63f1a539e4a9d8659265b80c8a9ed","e3315558182943aaac4fa3d119c62715","f7dcb89241d54038a1d1894ec2ccd7ab","913bd895dc294b57b848b9a762ad9150","b299fac787a44659b3c6053eb30ce8bb","1a68570cfb884089b59a8b9ef0cae2d2","98dd1ac1d6da42e8a2c72d53d8025086","c441089422e74db495decab01014ce22","6f8c13a65a8348ce83147e456f9c196a","af1e6ffa86dc4a2e925d14b763311cd7","c89e84fad3614b1c9f2d84ca5fcaf83e","e1c907b33f9f485091a5aa29784f2736","ec21e69ba81e4375ba1dc36ba043cd9f","c8375e2ec80a4f0c877d5b0383358934","0453cf39148e40ab95be370687169c42","45917c19555a4bc0b16cab0dcf89815f","da863a35e1e54aa7a92dbff431d51dfd","dadf6af249b64d6fb0594bf8f5c75faf","1025bc80a5da4d98a1cce8a7b3782756","1cf5b4fbb2d14d6eba48bd08de1b97fc","2d3289d84e414d12ba037a60c44e4133","b74aba2f61ea4e36af9c56689d8ba510","eb7e15add7f34ab887d0d20dff0b1d26","51d38992c2ac404981dfa6c5cd8d6e46","b07b045e4eb64a74b487c4d897c6ba33","0d199bf4ad004b2ab38039034212e80d","85b2aed18fdf4168a14b97cb65104c6a"]},"id":"xHGtfnyBKXEb","executionInfo":{"status":"ok","timestamp":1715533611737,"user_tz":-120,"elapsed":377,"user":{"displayName":"Qilu Diana Wu","userId":"15924277351051054428"}},"outputId":"aca3d43b-828d-4c72-96b4-80dc7c3b40b3"},"execution_count":2,"outputs":[{"output_type":"display_data","data":{"text/plain":["VBox(children=(HTML(value='\n"," | Unnamed: 0 | \n","PARTNUMBER | \n","COLOR | \n","SUBSECCION | \n","FAMILY | \n","BASE_COLOR | \n","COLOR_HEX | \n","PHOTO_PLAIN | 
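The fine-tuning data is a fashion-product catalogue of roughly 23k items, previewed below as a pandas DataFrame (the `Unnamed: 0` column is an index left over from a previous export). A minimal sketch of the loading step, assuming the catalogue is stored as a CSV file; the file name `products.csv` is a hypothetical placeholder:

```python
import pandas as pd

# Load the product catalogue; the stray "Unnamed: 0" column in the preview
# below comes from an index column saved in the CSV export.
df = pd.read_csv("products.csv")  # hypothetical file name
df
```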
\n","PHOTO_MODEL | \n","DESCRIPTION | \n","NAME | \n","
---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n","1 | \n","13352110800-I2023 | \n","800 | \n","Trf | \n","['shoes', 'sandals'] | \n","['black'] | \n","['#19191b'] | \n","/2023/I/1/1/p/3352/110/800/2/w/400/3352110800_... | \n","/2023/I/1/1/p/3352/110/800/2/w/400/3352110800_... | \n","Fabric high-heel sandals. Embellished floral f... | \n","HIGH-HEEL SANDALS WITH FLORAL DETAILS ... | \n","
1 | \n","39 | \n","04548287817-I2023 | \n","817 | \n","Basic | \n","['jewelry', 'earrings', 'ear cuffs'] | \n","['grey'] | \n","['#d7d4d2'] | \n","/2023/I/0/1/p/4548/287/817/2/w/400/4548287817_... | \n","/2023/I/0/1/p/4548/287/817/17/w/400/4548287817... | \n","-Pack of metal ear cuff earrings.<br/>-Wide ho... | \n","PACK OF EAR CUFFS ... | \n","
2 | \n","45 | \n","08338909401-I2023 | \n","401 | \n","Trf | \n","['skorts'] | \n","['blue'] | \n","['#141827'] | \n","/2023/I/0/1/p/8338/909/401/2/w/400/8338909401_... | \n","/2023/I/0/1/p/8338/909/401/2/w/400/8338909401_... | \n","High-waist skort. False welt pockets on the fr... | \n","TEXTURED ASYMMETRIC SKORT ... | \n","
3 | \n","46 | \n","02142043712-V2023 | \n","712 | \n","Woman | \n","['skirts', 'knit', 'knitwear'] | \n","['neutral'] | \n","['#e9e5d8'] | \n","/2023/V/0/1/p/2142/043/712/2/w/400/2142043712_... | \n","/2023/V/0/1/p/2142/043/712/2/w/400/2142043712_... | \n","High-waist skirt with an elastic waistband. A-... | \n","KNIT SKATER MINI SKIRT ... | \n","
4 | \n","60 | \n","02216483631-I2023 | \n","631 | \n","Woman | \n","['trousers'] | \n","['pink'] | \n","['#ce0066'] | \n","/2023/V/0/1/p/2216/483/631/2/w/400/2216483631_... | \n","/2023/V/0/1/p/2216/483/631/2/w/400/2216483631_... | \n","Loose-fitting high-waist trousers with a strai... | \n","LONG FLOWING TROUSERS ... | \n","
... | \n","... | \n","... | \n","... | \n","... | \n","... | \n","... | \n","... | \n","... | \n","... | \n","... | \n","... | \n","
23243 | \n","158916 | \n","03641844641-I2023 | \n","641 | \n","Trf | \n","['t-shirts'] | \n","['red'] | \n","['#902121'] | \n","/2023/I/0/1/p/3641/844/641/2/w/400/3641844641_... | \n","/2023/I/0/1/p/3641/844/641/2/w/400/3641844641_... | \n","SLIM FIT - ROUND NECK - SHORT - LONG SLEEVES<b... | \n","CROPPED POLYAMIDE T-SHIRT ... | \n","
23244 | \n","158921 | \n","03641844737-I2023 | \n","737 | \n","Trf | \n","['t-shirts'] | \n","['grey'] | \n","['#6f534a'] | \n","/2023/I/0/1/p/3641/844/737/2/w/400/3641844737_... | \n","/2023/I/0/1/p/3641/844/737/17/w/400/3641844737... | \n","SLIM FIT - ROUND NECK - SHORT - LONG SLEEVES<b... | \n","CROPPED POLYAMIDE T-SHIRT ... | \n","
23245 | \n","158926 | \n","03641844805-I2023 | \n","805 | \n","Trf | \n","['t-shirts'] | \n","['grey'] | \n","['#e6e5e3'] | \n","/2023/I/0/1/p/3641/844/805/2/w/400/3641844805_... | \n","/2023/I/0/1/p/3641/844/805/17/w/400/3641844805... | \n","SLIM FIT - ROUND NECK - SHORT - LONG SLEEVES<b... | \n","CROPPED POLYAMIDE T-SHIRT ... | \n","
23246 | \n","158928 | \n","03641844800-I2023 | \n","800 | \n","Trf | \n","['t-shirts'] | \n","['black'] | \n","['#0a0a0a'] | \n","/2023/I/0/1/p/3641/844/800/2/w/400/3641844800_... | \n","/2023/I/0/1/p/7901/983/030/2/w/400/7901983030_... | \n","SLIM FIT - ROUND NECK - SHORT - LONG SLEEVES<b... | \n","CROPPED POLYAMIDE T-SHIRT ... | \n","
23247 | \n","158943 | \n","01758945922-I2023 | \n","922 | \n","Lencería | \n","['cardigans', 'lingerie'] | \n","['grey'] | \n","['#64686b'] | \n","/2023/I/0/1/p/1758/945/922/2/w/400/1758945922_... | \n","/2023/I/0/1/p/1758/945/922/17/w/400/1758945922... | \n","Cardigan made of 100% spun wool. Round neck an... | \n","100% WOOL CARDIGAN ... | \n","
23248 rows × 11 columns
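For training, the catalogue is reduced to the three columns previewed next: a 100×100 RGB product photo, the lower-cased DESCRIPTION used as the target text, and a prompt built from the product NAME. A minimal sketch of that transformation, assuming `df` is the DataFrame from the loading sketch above and that the `PHOTO_PLAIN` paths have already been resolved to PIL images; the train/eval split parameters are hypothetical:

```python
from datasets import Dataset

def to_training_view(df):
    # Build the (image, target text, prompt) view used for fine-tuning.
    # The 100x100 thumbnails and the "<Image> This image contains ..." prompt
    # mirror the processed preview shown below.
    return Dataset.from_dict({
        "PHOTO_PLAIN": [im.convert("RGB").resize((100, 100)) for im in df["PHOTO_PLAIN"]],
        "DESCRIPTION": [d.lower() for d in df["DESCRIPTION"]],
        "NAME": ["<Image> This image contains " + n.strip() for n in df["NAME"]],
    })

# Hypothetical split into the train and eval datasets used by the Trainer and
# the evaluation loop further down.
split = to_training_view(df).train_test_split(test_size=0.1, seed=42)
train_dataset, eval_dataset = split["train"], split["test"]
```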
\n","\n"," | PHOTO_PLAIN | \n","DESCRIPTION | \n","NAME | \n","
---|---|---|---|
0 | \n","<PIL.Image.Image image mode=RGB size=100x100 a... | \n","fabric high-heel sandals. embellished floral f... | \n","<Image> This image contains HIGH-HEEL SANDALS ... | \n","
1 | \n","<PIL.Image.Image image mode=RGB size=100x100 a... | \n","pack of metal ear cuff earrings. wide hoop ear... | \n","<Image> This image contains PACK OF EAR CUFFS ... | \n","
2 | \n","<PIL.Image.Image image mode=RGB size=100x100 a... | \n","high-waist skort. false welt pockets on the fr... | \n","<Image> This image contains TEXTURED ASYMMETRI... | \n","
3 | \n","<PIL.Image.Image image mode=RGB size=100x100 a... | \n","high-waist skirt with an elastic waistband. a-... | \n","<Image> This image contains KNIT SKATER MINI S... | \n","
4 | \n","<PIL.Image.Image image mode=RGB size=100x100 a... | \n","loose-fitting high-waist trousers with a strai... | \n","<Image> This image contains LONG FLOWING TROUS... | \n","
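Between the data preparation and the training log that follows, the notebook loads Idefics2 and configures parameter-efficient fine-tuning; the `peft` warnings and the `TrainOutput` further down show that LoRA adapters and the Hugging Face `Trainer` were used. The sketch below is a minimal QLoRA-style setup in that spirit: the model ID comes from the model card linked above, while the data collator, the LoRA targets and the hyper-parameters (apart from the two training epochs) are illustrative assumptions rather than the exact values of this run.

```python
import torch
from peft import LoraConfig
from transformers import (AutoProcessor, BitsAndBytesConfig,
                          Idefics2ForConditionalGeneration, Trainer, TrainingArguments)

# 4-bit quantised base model with LoRA adapters (QLoRA-style fine-tuning).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)
processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics2-8b", do_image_splitting=False)
model = Idefics2ForConditionalGeneration.from_pretrained(
    "HuggingFaceM4/idefics2-8b",
    torch_dtype=torch.float16,
    quantization_config=bnb_config,
)
model.add_adapter(LoraConfig(
    r=8,
    lora_alpha=8,
    lora_dropout=0.1,
    init_lora_weights="gaussian",
    # Illustrative: adapt the attention and MLP projections of the text tower,
    # the modality projection and the perceiver resampler.
    target_modules=".*(text_model|modality_projection|perceiver_resampler).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*",
))
model.enable_adapters()

def data_collator(examples):
    # Turn (photo, NAME prompt, DESCRIPTION target) rows into padded model inputs.
    texts, images = [], []
    for ex in examples:
        messages = [
            {"role": "user", "content": [
                {"type": "text", "text": "Answer briefly."},
                {"type": "image"},
                {"type": "text", "text": ex["NAME"]},
            ]},
            {"role": "assistant", "content": [{"type": "text", "text": ex["DESCRIPTION"]}]},
        ]
        texts.append(processor.apply_chat_template(messages, add_generation_prompt=False).strip())
        images.append([ex["PHOTO_PLAIN"]])
    batch = processor(text=texts, images=images, return_tensors="pt", padding=True)
    labels = batch["input_ids"].clone()
    image_token_id = processor.tokenizer.convert_tokens_to_ids("<image>")
    labels[labels == processor.tokenizer.pad_token_id] = -100  # ignore padding in the loss
    labels[labels == image_token_id] = -100                    # ignore image placeholder tokens
    batch["labels"] = labels
    return batch

training_args = TrainingArguments(
    output_dir="idefics2-fashion-descriptions",  # hypothetical repo/output name
    num_train_epochs=2,                          # matches the two epochs logged below
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    learning_rate=1e-4,
    fp16=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    remove_unused_columns=False,
    push_to_hub=True,
)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,  # from the preprocessing sketch above
    eval_dataset=eval_dataset,
)
trainer.train()
```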
| Epoch | Training Loss | Validation Loss |
|---|---|---|
| 0 | 0.285100 | 0.339750 |
| 1 | 0.214200 | 0.293700 |
"]},"metadata":{}},{"output_type":"stream","name":"stderr","text":["/usr/local/lib/python3.10/dist-packages/transformers/integrations/peft.py:399: FutureWarning: The `active_adapter` method is deprecated and will be removed in a future version.\n"," warnings.warn(\n","/usr/local/lib/python3.10/dist-packages/transformers/integrations/peft.py:399: FutureWarning: The `active_adapter` method is deprecated and will be removed in a future version.\n"," warnings.warn(\n","/usr/local/lib/python3.10/dist-packages/transformers/integrations/peft.py:399: FutureWarning: The `active_adapter` method is deprecated and will be removed in a future version.\n"," warnings.warn(\n","/usr/local/lib/python3.10/dist-packages/transformers/integrations/peft.py:399: FutureWarning: The `active_adapter` method is deprecated and will be removed in a future version.\n"," warnings.warn(\n","/usr/local/lib/python3.10/dist-packages/transformers/integrations/peft.py:399: FutureWarning: The `active_adapter` method is deprecated and will be removed in a future version.\n"," warnings.warn(\n","/usr/local/lib/python3.10/dist-packages/transformers/integrations/peft.py:399: FutureWarning: The `active_adapter` method is deprecated and will be removed in a future version.\n"," warnings.warn(\n","/usr/local/lib/python3.10/dist-packages/transformers/integrations/peft.py:399: FutureWarning: The `active_adapter` method is deprecated and will be removed in a future version.\n"," warnings.warn(\n","/usr/local/lib/python3.10/dist-packages/transformers/integrations/peft.py:399: FutureWarning: The `active_adapter` method is deprecated and will be removed in a future version.\n"," warnings.warn(\n","/usr/local/lib/python3.10/dist-packages/transformers/integrations/peft.py:399: FutureWarning: The `active_adapter` method is deprecated and will be removed in a future version.\n"," warnings.warn(\n"]},{"output_type":"execute_result","data":{"text/plain":["TrainOutput(global_step=2324, training_loss=0.2725233615162861, metrics={'train_runtime': 9705.9903, 'train_samples_per_second': 3.832, 'train_steps_per_second': 0.239, 'total_flos': 2.584157381900498e+17, 'train_loss': 0.2725233615162861, 'epoch': 1.9993547693300355})"]},"metadata":{},"execution_count":19}]},{"cell_type":"markdown","source":["We push to the fine-tuned checkpoint to the hub!"],"metadata":{"id":"y1FwigHsKru6"}},{"cell_type":"code","source":["trainer.push_to_hub()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"RgSYgAHCKt5o","outputId":"da1f3ed4-6401-4184-f7bc-c6e7619d28be"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stderr","text":["/usr/local/lib/python3.10/dist-packages/transformers/integrations/peft.py:399: FutureWarning: The `active_adapter` method is deprecated and will be removed in a future version.\n"," warnings.warn(\n"]}]},{"cell_type":"code","source":["\n"],"metadata":{"id":"uNS6u9zhyHxu","executionInfo":{"status":"aborted","timestamp":1715538702865,"user_tz":-120,"elapsed":7,"user":{"displayName":"Qilu Diana Wu","userId":"15924277351051054428"}}},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["# Evaluation\n","\n","Let's evaluate the model. 
First, we can have a look at a qualitative generation from the model."],"metadata":{"id":"VJ30xncwKvNA"}},{"cell_type":"code","source":["example = eval_dataset\n","example\n"],"metadata":{"id":"qrC2OT9nKwep"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["example[5]['PHOTO_PLAIN']"],"metadata":{"id":"EaeZdYXcKxgM"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["\n","for i in range(100):\n"," model.eval()\n"," image = example[i][\"PHOTO_PLAIN\"]\n"," query = example[i][\"NAME\"]\n","\n"," messages = [\n"," {\n"," \"role\": \"user\",\n"," \"content\": [\n"," {\"type\": \"text\", \"text\": \"Answer briefly.\"},\n"," {\"type\": \"image\"},\n"," {\"type\": \"text\", \"text\": query}\n"," ]\n"," }\n"," ]\n"," text = processor.apply_chat_template(messages, add_generation_prompt=True)\n"," inputs = processor(text=[text.strip()], images=[image], return_tensors=\"pt\", padding=True)\n"," generated_ids = model.generate(**inputs, max_new_tokens=64)\n"," generated_texts = processor.batch_decode(generated_ids[:, inputs[\"input_ids\"].size(1):], skip_special_tokens=True)\n"," print(generated_texts)\n"," print(example[i][\"DESCRIPTION\"])"],"metadata":{"id":"LA2wmpbqKyiw"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["During training, we tracked the loss on the evaluation split. It is also interesting to measure performance with the \"true metric\" used for DocVQA, the *Average Normalized Levenshtein Similarity* (ANLS).\n","\n","ANLS, proposed by [Biten+ ICCV'19](https://arxiv.org/abs/1905.13648), smoothly accounts for recognition mistakes: an answer whose intent is correct but whose text is slightly mis-recognised receives a mild penalty rather than a score of zero. It also uses a threshold of 0.5: the per-answer score is 1 minus the normalized Levenshtein distance when that distance is below 0.5, and 0 otherwise. 
The purpose of this threshold is to separate answers that were essentially correct but imperfectly transcribed, which still earn partial credit, from outputs that are so far from the reference that they should count as simply wrong and score zero.\n","\n","We first define a few utilities to compute the ANLS."],"metadata":{"id":"9dx6jP6GK1Qh"}},{"cell_type":"code","source":["!pip install Levenshtein"],"metadata":{"id":"M51V73iTK3lS"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["import Levenshtein\n","\n","def normalized_levenshtein(s1, s2):\n"," len_s1, len_s2 = len(s1), len(s2)\n"," distance = Levenshtein.distance(s1, s2)\n"," return distance / max(len_s1, len_s2)\n","\n","def similarity_score(a_ij, o_q_i, tau=0.5):\n"," nl = normalized_levenshtein(a_ij, o_q_i)\n"," return 1 - nl if nl < tau else 0\n","\n","def average_normalized_levenshtein_similarity(ground_truth, predicted_answers):\n"," assert len(ground_truth) == len(predicted_answers), \"Length of ground_truth and predicted_answers must match.\"\n","\n"," N = len(ground_truth)\n"," total_score = 0\n","\n"," for i in range(N):\n"," a_i = ground_truth[i]\n"," o_q_i = predicted_answers[i]\n"," if o_q_i == \"\":\n"," print(\"Warning: Skipped an empty prediction.\")\n"," max_score = 0\n"," else:\n"," max_score = max(similarity_score(a_ij, o_q_i) for a_ij in a_i)\n","\n"," total_score += max_score\n","\n"," return total_score / N\n"],"metadata":{"id":"aHhkjnZGK5H-"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# Free some GPU memory before running evaluation, since we are in a memory-constrained environment.\n","torch.cuda.empty_cache()\n"],"metadata":{"id":"W7yGE9lmK6h4"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["from tqdm import tqdm\n","\n","EVAL_BATCH_SIZE = 1\n","\n","answers_unique = []\n","generated_texts_unique = []\n","\n","for i in tqdm(range(0, len(eval_dataset), EVAL_BATCH_SIZE)):\n"," examples = eval_dataset[i: i + EVAL_BATCH_SIZE]\n"," answers_unique.extend([[d] for d in examples[\"DESCRIPTION\"]])\n"," images = [[im] for im in examples[\"PHOTO_PLAIN\"]]\n"," texts = []\n"," for q in examples[\"NAME\"]:\n"," messages = [\n"," {\n"," \"role\": \"user\",\n"," \"content\": [\n"," {\"type\": \"text\", \"text\": \"Answer briefly.\"},\n"," {\"type\": \"image\"},\n"," {\"type\": \"text\", \"text\": q}\n"," ]\n"," }\n"," ]\n"," text = processor.apply_chat_template(messages, add_generation_prompt=True)\n"," texts.append(text.strip())\n"," inputs = processor(text=texts, images=images, return_tensors=\"pt\", padding=True)\n"," generated_ids = model.generate(**inputs, max_new_tokens=64)\n"," generated_texts = processor.batch_decode(generated_ids[:, inputs[\"input_ids\"].size(1):], skip_special_tokens=True)\n"," generated_texts_unique.extend(generated_texts)\n"],"metadata":{"id":"8Cem8ywHK7nf"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["generated_texts_unique = [g.strip().strip(\".\") for g in generated_texts_unique]\n","anls = average_normalized_levenshtein_similarity(\n"," ground_truth=answers_unique, predicted_answers=generated_texts_unique,\n",")\n","print(anls)\n"],"metadata":{"id":"5zHANoWQK84e","execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["We obtain an ANLS score of ~65. This is relatively low compared to well-trained models on DocVQA, although keep in mind that we are training and evaluating on a relatively small subset of the data as an exercise, and we have not tuned the hyper-parameters.\n","\n","You should now have all the tools you need to fine-tune Idefics2 on your own dataset!"],"metadata":{"id":"a0Mn33y1K-H1"}}]} \ No newline at end of file