diff --git "a/peft_lora_embedding_semantic_similarity_inference.ipynb" "b/peft_lora_embedding_semantic_similarity_inference.ipynb" new file mode 100644--- /dev/null +++ "b/peft_lora_embedding_semantic_similarity_inference.ipynb" @@ -0,0 +1,1808 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "3e7b6247", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2023-06-29 09:08:24,868] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", + "\n", + "===================================BUG REPORT===================================\n", + "Welcome to bitsandbytes. For bug reports, please run\n", + "\n", + "python -m bitsandbytes\n", + "\n", + " and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues\n", + "================================================================================\n", + "bin /home/sourab/miniconda3/envs/ml/lib/python3.11/site-packages/bitsandbytes/libbitsandbytes_cuda118.so\n", + "CUDA SETUP: CUDA runtime path found: /home/sourab/miniconda3/envs/ml/lib/libcudart.so\n", + "CUDA SETUP: Highest compute capability among GPUs detected: 7.5\n", + "CUDA SETUP: Detected CUDA version 118\n", + "CUDA SETUP: Loading binary /home/sourab/miniconda3/envs/ml/lib/python3.11/site-packages/bitsandbytes/libbitsandbytes_cuda118.so...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/sourab/miniconda3/envs/ml/lib/python3.11/site-packages/bitsandbytes/cuda_setup/main.py:149: UserWarning: Found duplicate ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] files: {PosixPath('/home/sourab/miniconda3/envs/ml/lib/libcudart.so'), PosixPath('/home/sourab/miniconda3/envs/ml/lib/libcudart.so.11.0')}.. 
class AutoModelForSentenceEmbedding(nn.Module):
    """Sentence-embedding wrapper around a Hugging Face ``AutoModel`` backbone.

    ``forward`` runs the backbone, mean-pools its token embeddings over the
    positions selected by the attention mask and, optionally, L2-normalises
    the pooled vectors.

    Args:
        model_name: Identifier or path handed to ``AutoModel.from_pretrained``.
        tokenizer: Stored on the instance as ``self.tokenizer``; ``forward``
            itself never uses it.
        normalize: When true, ``forward`` L2-normalises the pooled embeddings
            along dim 1.
        **model_kwargs: Extra keyword arguments forwarded unchanged to
            ``AutoModel.from_pretrained`` (e.g. ``load_in_8bit=True`` or
            ``device_map={"": 0}``). Omitting them reproduces the previous
            behaviour exactly.
    """

    def __init__(self, model_name, tokenizer, normalize=True, **model_kwargs):
        super().__init__()

        self.model = AutoModel.from_pretrained(model_name, **model_kwargs)
        self.normalize = normalize
        self.tokenizer = tokenizer

    def forward(self, **kwargs):
        """Return one pooled embedding per input row.

        Args:
            **kwargs: Passed straight through to the wrapped model. Must
                contain ``"attention_mask"``, which is reused for pooling.

        Returns:
            The mean-pooled embeddings, L2-normalised along dim 1 when
            ``self.normalize`` is true.

        Raises:
            KeyError: If ``attention_mask`` was not supplied.
        """
        model_output = self.model(**kwargs)
        embeddings = self.mean_pooling(model_output, kwargs["attention_mask"])
        if self.normalize:
            embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)

        return embeddings

    def mean_pooling(self, model_output, attention_mask):
        """Average token embeddings over the positions where the mask is non-zero.

        Args:
            model_output: Indexable model output whose element 0 holds the
                token embeddings (presumably ``(batch, seq_len, hidden)`` --
                confirm against the backbone in use).
            attention_mask: Mask with one fewer dimension than the token
                embeddings; it is broadcast over the last dimension.

        Returns:
            The mask-weighted sum over dim 1 divided by the mask count.
        """
        token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        # The clamp keeps the division finite for rows whose mask is all zeros.
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    def __getattr__(self, name: str):
        """Forward missing attributes to the wrapped module."""
        try:
            return super().__getattr__(name)  # defer to nn.Module's logic
        except AttributeError:
            if name == "model":
                # `model` itself is not registered (instance not initialised
                # yet, or mid copy/unpickle). Falling through would evaluate
                # `self.model`, re-enter this method and recurse until
                # RecursionError, so surface the AttributeError instead.
                raise
            return getattr(self.model, name)


def get_cosing_embeddings(query_embs, product_embs):
    """Return the row-wise dot product of two embedding batches.

    The element-wise product is summed over axis 1. The result equals the
    cosine similarity only when both inputs are already L2-normalised.
    """
    return torch.sum(query_embs * product_embs, axis=1)
" \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indexproduct_title
00RamPro 10\" All Purpose Utility Air Tires/Wheel...
11MaxAuto 2-Pack 13x5.00-6 2PLY Turf Mower Tract...
22NEIKO 20601A 14.5 inch Steel Tire Spoon Lever ...
332PK 13x5.00-6 13x5.00x6 13x5x6 13x5-6 2PLY Tur...
44(Set of 2) 15x6.00-6 Husqvarna/Poulan Tire Whe...
.........
476273476273Chanel No.5 Eau Premiere Spray 50ml/1.7oz
476274476274Steve Madden Designer 15 Inch Carry on Suitcas...
476275476275CHANEL Le Lift Creme Yeux, Black, 0.5 Ounce
476276476276Coco Mademoiselle by Chanel for Women - 3.4 oz...
476277476277Chânél No. 5 by Chânél Eau De Parfum Premiere ...
\n", + "

476278 rows × 2 columns

\n", + "" + ], + "text/plain": [ + " index product_title\n", + "0 0 RamPro 10\" All Purpose Utility Air Tires/Wheel...\n", + "1 1 MaxAuto 2-Pack 13x5.00-6 2PLY Turf Mower Tract...\n", + "2 2 NEIKO 20601A 14.5 inch Steel Tire Spoon Lever ...\n", + "3 3 2PK 13x5.00-6 13x5.00x6 13x5x6 13x5-6 2PLY Tur...\n", + "4 4 (Set of 2) 15x6.00-6 Husqvarna/Poulan Tire Whe...\n", + "... ... ...\n", + "476273 476273 Chanel No.5 Eau Premiere Spray 50ml/1.7oz\n", + "476274 476274 Steve Madden Designer 15 Inch Carry on Suitcas...\n", + "476275 476275 CHANEL Le Lift Creme Yeux, Black, 0.5 Ounce\n", + "476276 476276 Coco Mademoiselle by Chanel for Women - 3.4 oz...\n", + "476277 476277 Chânél No. 5 by Chânél Eau De Parfum Premiere ...\n", + "\n", + "[476278 rows x 2 columns]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "product_dataset_for_indexing" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "85840ec6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indexproduct_title
3471034710ROK 4-1/2 inch Diamond Saw Blade Set, Pack of 3
277590277590WSGG Medical Goggles, FDA registered, Safety Goggles, Fit Over Glasses, Anti-Fog, Anti-Splash (1 pack)
474000474000iJDMTOY 15W CREE High Power LED Angel Eye Bulbs Compatible With BMW 5 6 7 Series X3 X5 (E39 E60 E63 E65 E53), 7000K Xenon White Headlight Ring Marker Lights
1899718997USB Charger, Anker Elite Dual Port 24W Wall Charger, PowerPort 2 with PowerIQ and Foldable Plug, for iPhone 11/Xs/XS Max/XR/X/8/7/6/Plus, iPad Pro/Air 2/Mini 3/Mini 4, Samsung S4/S5, and More
208666208666AOGGY Compatible with MacBook Air 13 inch Case A1466/A1369 (2010-2017 Release) Glitter Fluorescent Color Plastic Hard Case, with Older Version MacBook Air 13 inch Keyboard Cover - Gold
326614326614CUTE STONE Little Kitchen Playset, Kitchen Toy Set with Realistic Sound &Light, Play Sink, Cooking Stove with Steam, Play Food and Kitchen Accessories, Great Kitchen Toys for Toddlers Kids
105637105637Milwaukee Electric Tool 2470-21 M12 Cordless Shear Kit, 12 V, Li-Ion
342392342392chouyatou Women's Short Sleeve/Strap Open Bust Bodysuit Shapewear Firm Control Body Shaper (X-Small, Nude Sleeve)
319970319970AMT 256 Hz Medical-Grade Tuning Fork Instrument with Fixed Weights, Non-Magnetic Aluminum Alloy (C 256)
416956416956Timberland HIKER-ROUND 54 BROWN
\n", + "
" + ], + "text/plain": [ + " index \\\n", + "34710 34710 \n", + "277590 277590 \n", + "474000 474000 \n", + "18997 18997 \n", + "208666 208666 \n", + "326614 326614 \n", + "105637 105637 \n", + "342392 342392 \n", + "319970 319970 \n", + "416956 416956 \n", + "\n", + " product_title \n", + "34710 ROK 4-1/2 inch Diamond Saw Blade Set, Pack of 3 \n", + "277590 WSGG Medical Goggles, FDA registered, Safety Goggles, Fit Over Glasses, Anti-Fog, Anti-Splash (1 pack) \n", + "474000 iJDMTOY 15W CREE High Power LED Angel Eye Bulbs Compatible With BMW 5 6 7 Series X3 X5 (E39 E60 E63 E65 E53), 7000K Xenon White Headlight Ring Marker Lights \n", + "18997 USB Charger, Anker Elite Dual Port 24W Wall Charger, PowerPort 2 with PowerIQ and Foldable Plug, for iPhone 11/Xs/XS Max/XR/X/8/7/6/Plus, iPad Pro/Air 2/Mini 3/Mini 4, Samsung S4/S5, and More \n", + "208666 AOGGY Compatible with MacBook Air 13 inch Case A1466/A1369 (2010-2017 Release) Glitter Fluorescent Color Plastic Hard Case, with Older Version MacBook Air 13 inch Keyboard Cover - Gold \n", + "326614 CUTE STONE Little Kitchen Playset, Kitchen Toy Set with Realistic Sound &Light, Play Sink, Cooking Stove with Steam, Play Food and Kitchen Accessories, Great Kitchen Toys for Toddlers Kids \n", + "105637 Milwaukee Electric Tool 2470-21 M12 Cordless Shear Kit, 12 V, Li-Ion \n", + "342392 chouyatou Women's Short Sleeve/Strap Open Bust Bodysuit Shapewear Firm Control Body Shaper (X-Small, Nude Sleeve) \n", + "319970 AMT 256 Hz Medical-Grade Tuning Fork Instrument with Fixed Weights, Non-Magnetic Aluminum Alloy (C 256) \n", + "416956 Timberland HIKER-ROUND 54 BROWN " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.set_option(\"max_colwidth\", 300)\n", + "product_dataset_for_indexing.sample(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "408b6e00", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { 
+ "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Running tokenizer on dataset: 0%| | 0/476278 [00:00 k\n", + "\n", + " # Query dataset, k - number of the closest elements (returns 2 numpy arrays)\n", + " labels, distances = search_index.knn_query(query_embeddings, k=k)\n", + "\n", + " return [\n", + " (ids_to_products_dict[label], (1 - distance))\n", + " for label, distance in zip(labels[0], distances[0])\n", + " if (1 - distance) >= threshold\n", + " ]" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "id": "1c47f12d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query='NLP and ML books'\n", + "cosine_sim_score=0.92 product='Machine Learning: A Journey from Beginner to Advanced Including Deep Learning, Scikit-learn and Tensorflow'\n", + "cosine_sim_score=0.91 product='Mastering Machine Learning with scikit-learn'\n", + "cosine_sim_score=0.91 product='Hands-On Machine Learning with Scikit-Learn and TensorFlow: Concepts, Tools, and Techniques to Build Intelligent Systems'\n", + "cosine_sim_score=0.91 product='Hands-On Machine Learning with Scikit-Learn, Keras, and TensorFlow: Concepts, Tools, and Techniques to Build Intelligent Systems'\n", + "cosine_sim_score=0.91 product='Practical Deep Learning: A Python-Based Introduction'\n", + "cosine_sim_score=0.9 product='Machine Learning: A Hands-On, Project-Based Introduction to Machine Learning for Absolute Beginners: Mastering Engineering ML Systems using Scikit-Learn and TensorFlow'\n", + "cosine_sim_score=0.9 product='Mastering Machine Learning with scikit-learn - Second Edition: Apply effective learning algorithms to real-world problems using scikit-learn'\n", + "cosine_sim_score=0.9 product='Mastering Machine Learning on AWS: Advanced machine learning in Python using SageMaker, Apache Spark, and TensorFlow'\n", + "cosine_sim_score=0.9 product='Machine Learning Algorithms: Naive Bayes'\n", + "cosine_sim_score=0.9 
product='Fundamentals of Machine Learning for Predictive Data Anayltics: Algorithms, Worked Examples, and Case Studies'\n" + ] + } + ], + "source": [ + "query = \"NLP and ML books\"\n", + "k = 10\n", + "query_embeddings = get_query_embeddings(query, model, tokenizer, device)\n", + "search_results = get_nearest_neighbours(k, product_search_index, query_embeddings, ids_to_products_dict, threshold=0.7)\n", + "\n", + "print(f\"{query=}\")\n", + "for product, cosine_sim_score in search_results:\n", + " print(f\"cosine_sim_score={round(cosine_sim_score,2)} {product=}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e9e2dd2c", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}