Fine Tuned v0.0.1

Browse files

Files changed (8) hide show

.gitignore +5 -3
Gemma2_2B/finetune.ipynb +514 -0
Gemma2_2B/finetune.py +0 -0
Gemma2_2B/hyperparams.yaml +34 -0
Gemma2_2B/inference.ipynb +303 -0
Gemma2_2B/inference.py +0 -0
pyproject.toml +1 -0
uv.lock +2 -0

.gitignore CHANGED Viewed

@@ -3,6 +3,8 @@
 FER/Images/
 TADBot.code-workspace
 FER/models/checkpoints
-**\*/__pycache__/\*
-**\*/.ipynb_checkpoints/\*
-**\*/.cache/\*

 FER/Images/
 TADBot.code-workspace
 FER/models/checkpoints
+FER/__pycache__
+FER/models/__pycache__
+Gemma2_2B/.cache
+Gemma2_2B/__pycache__
+Gemma2_2B/results

Gemma2_2B/finetune.ipynb ADDED Viewed

	@@ -0,0 +1,514 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from huggingface_hub import login\n",
+    "from dotenv import load_dotenv\n",
+    "import os\n",
+    "load_dotenv()\n",
+    "\n",
+    "# Login to Hugging Face Hub\n",
+    "login(token=os.getenv(\"HUGGINGFACE_TOKEN\"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "a39e6120cbea4462999cfa5f887a8015",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "README.md:   0%|          | 0.00/288 [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "f:\\TADBot\\.venv\\Lib\\site-packages\\huggingface_hub\\file_download.py:139: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\\Users\\Nitin Kausik Remella\\.cache\\huggingface\\hub\\datasets--ai-bites--databricks-mini. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.\n",
+      "To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n",
+      "  warnings.warn(message)\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "de15e48751c34c36b5d02c2449380d06",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "dolly-mini-train.jsonl:   0%|          | 0.00/5.24M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "d4094fd4af084a77a5bc3904b5db4197",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split:   0%|          | 0/10544 [00:00<?, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": [
+       "Dataset({\n",
+       "    features: ['text'],\n",
+       "    num_rows: 1000\n",
+       "})"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from datasets import load_dataset\n",
+    "# Hub dataset used for fine-tuning. Only rows 0-999 of the train split are loaded, and\n",
+    "# downloaded files are cached under .cache/ (relative to the working directory).\n",
+    "dataset_name = \"ai-bites/databricks-mini\"\n",
+    "dataset = load_dataset(dataset_name, split=\"train[0:1000]\", cache_dir=\".cache/\")\n",
+    "\n",
+    "# Show the Dataset summary (features / num_rows) as the cell output.\n",
+    "dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "from transformers import (\n",
+    "    AutoModelForCausalLM,\n",
+    "    AutoTokenizer,\n",
+    "    BitsAndBytesConfig,\n",
+    "    HfArgumentParser,\n",
+    "    TrainingArguments,\n",
+    "    logging,\n",
+    ")\n",
+    "from peft import LoraConfig, PeftModel\n",
+    "from trl import SFTTrainer"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import yaml\n",
+    "\n",
+    "# safe_load builds only plain scalars/lists/dicts, which is all hyperparams.yaml contains,\n",
+    "# and cannot instantiate arbitrary Python objects from the YAML stream.\n",
+    "with open(\"hyperparams.yaml\", \"r\", encoding=\"utf-8\") as file:\n",
+    "    hyperparams = yaml.safe_load(file)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Resolve the dtype name stored in the config (a string such as \"float16\") to the torch dtype object.\n",
+    "compute_dtype = getattr(torch, hyperparams['bnb_4bit_compute_dtype'])\n",
+    "\n",
+    "# Quantization settings; passed as quantization_config when the base model is loaded.\n",
+    "bnb_config = BitsAndBytesConfig(\n",
+    "    load_in_4bit=hyperparams['use_4bit'], # Activates 4-bit precision loading\n",
+    "    bnb_4bit_quant_type=hyperparams['bnb_4bit_quant_type'], # quantization type name from the config\n",
+    "    bnb_4bit_compute_dtype=compute_dtype, # torch dtype resolved above\n",
+    "    bnb_4bit_use_double_quant=hyperparams['use_nested_quant'], # nested (double) quantization toggle\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Setting BF16 to True\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Enable bf16 training only when the GPU reports CUDA compute capability >= 8.\n",
+    "# Querying the capability raises when no CUDA device is present, so check availability\n",
+    "# first and fall back to bf16=False in that case.\n",
+    "if compute_dtype == torch.float16 and hyperparams['use_4bit']:\n",
+    "    bf16_capable = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8\n",
+    "    if bf16_capable:\n",
+    "        print(\"Setting BF16 to True\")\n",
+    "    # Overrides the bf16 value loaded from hyperparams.yaml for the rest of the notebook.\n",
+    "    hyperparams['bf16'] = bf16_capable"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "9ab84ef6c43249de9726940a78f2717f",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "# Read the Hub token once and reuse it for both downloads.\n",
+    "hf_token = os.getenv(\"HUGGINGFACE_TOKEN\")\n",
+    "\n",
+    "# Load the base model with the bitsandbytes quantization config built earlier.\n",
+    "model = AutoModelForCausalLM.from_pretrained(\n",
+    "    hyperparams['model_name'],\n",
+    "    token=hf_token,\n",
+    "    quantization_config=bnb_config,\n",
+    "    device_map=hyperparams['device_map'],\n",
+    "    cache_dir=\".cache/\",\n",
+    ")\n",
+    "model.config.use_cache = False  # the generation cache is not needed while training\n",
+    "# NOTE(review): pretraining_tp looks like a carry-over from a Llama recipe - confirm Gemma 2 reads it.\n",
+    "model.config.pretraining_tp = 1\n",
+    "\n",
+    "# trust_remote_code is deliberately not passed: the model load above does not pass it either,\n",
+    "# so enabling it for the tokenizer alone would only permit repo-supplied code to execute.\n",
+    "tokenizer = AutoTokenizer.from_pretrained(\n",
+    "    hyperparams['model_name'],\n",
+    "    token=hf_token,\n",
+    "    cache_dir=\".cache/\",\n",
+    ")\n",
+    "# NOTE(review): the model printout in inference.ipynb shows padding_idx=0, so the checkpoint\n",
+    "# appears to have its own pad token - confirm that overriding it with EOS is intended.\n",
+    "tokenizer.pad_token = tokenizer.eos_token\n",
+    "tokenizer.padding_side = \"right\" # Fix weird overflow issue with fp16 training"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load LoRA configuration\n",
+    "# Rank, alpha and dropout come from hyperparams.yaml; the adapted modules are listed explicitly.\n",
+    "# NOTE(review): the model printout in inference.ipynb also lists an MLP down_proj layer that is\n",
+    "# not targeted here - confirm the omission is intentional.\n",
+    "peft_config = LoraConfig(\n",
+    "    lora_alpha=hyperparams['lora_alpha'],\n",
+    "    lora_dropout=hyperparams['lora_dropout'],\n",
+    "    r=hyperparams['lora_r'],\n",
+    "    bias=\"none\",\n",
+    "    task_type=\"CAUSAL_LM\",\n",
+    "    target_modules=[\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\",\"gate_proj\", \"up_proj\"]\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "TrainingArguments(\n",
+       "_n_gpu=1,\n",
+       "accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},\n",
+       "adafactor=False,\n",
+       "adam_beta1=0.9,\n",
+       "adam_beta2=0.999,\n",
+       "adam_epsilon=1e-08,\n",
+       "auto_find_batch_size=False,\n",
+       "average_tokens_across_devices=False,\n",
+       "batch_eval_metrics=False,\n",
+       "bf16=True,\n",
+       "bf16_full_eval=False,\n",
+       "data_seed=None,\n",
+       "dataloader_drop_last=False,\n",
+       "dataloader_num_workers=0,\n",
+       "dataloader_persistent_workers=False,\n",
+       "dataloader_pin_memory=True,\n",
+       "dataloader_prefetch_factor=None,\n",
+       "ddp_backend=None,\n",
+       "ddp_broadcast_buffers=None,\n",
+       "ddp_bucket_cap_mb=None,\n",
+       "ddp_find_unused_parameters=None,\n",
+       "ddp_timeout=1800,\n",
+       "debug=[],\n",
+       "deepspeed=None,\n",
+       "disable_tqdm=False,\n",
+       "dispatch_batches=None,\n",
+       "do_eval=False,\n",
+       "do_predict=False,\n",
+       "do_train=False,\n",
+       "eval_accumulation_steps=None,\n",
+       "eval_delay=0,\n",
+       "eval_do_concat_batches=True,\n",
+       "eval_on_start=False,\n",
+       "eval_steps=None,\n",
+       "eval_strategy=IntervalStrategy.NO,\n",
+       "eval_use_gather_object=False,\n",
+       "evaluation_strategy=None,\n",
+       "fp16=False,\n",
+       "fp16_backend=auto,\n",
+       "fp16_full_eval=False,\n",
+       "fp16_opt_level=O1,\n",
+       "fsdp=[],\n",
+       "fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False},\n",
+       "fsdp_min_num_params=0,\n",
+       "fsdp_transformer_layer_cls_to_wrap=None,\n",
+       "full_determinism=False,\n",
+       "gradient_accumulation_steps=1,\n",
+       "gradient_checkpointing=False,\n",
+       "gradient_checkpointing_kwargs=None,\n",
+       "greater_is_better=None,\n",
+       "group_by_length=True,\n",
+       "half_precision_backend=auto,\n",
+       "hub_always_push=False,\n",
+       "hub_model_id=None,\n",
+       "hub_private_repo=False,\n",
+       "hub_strategy=HubStrategy.EVERY_SAVE,\n",
+       "hub_token=<HUB_TOKEN>,\n",
+       "ignore_data_skip=False,\n",
+       "include_for_metrics=[],\n",
+       "include_inputs_for_metrics=False,\n",
+       "include_num_input_tokens_seen=False,\n",
+       "include_tokens_per_second=False,\n",
+       "jit_mode_eval=False,\n",
+       "label_names=None,\n",
+       "label_smoothing_factor=0.0,\n",
+       "learning_rate=0.0002,\n",
+       "length_column_name=length,\n",
+       "load_best_model_at_end=False,\n",
+       "local_rank=0,\n",
+       "log_level=passive,\n",
+       "log_level_replica=warning,\n",
+       "log_on_each_node=True,\n",
+       "logging_dir=./results\\runs\\Nov15_13-14-10_FutureGadgetLab,\n",
+       "logging_first_step=False,\n",
+       "logging_nan_inf_filter=True,\n",
+       "logging_steps=25,\n",
+       "logging_strategy=IntervalStrategy.STEPS,\n",
+       "lr_scheduler_kwargs={},\n",
+       "lr_scheduler_type=SchedulerType.CONSTANT,\n",
+       "max_grad_norm=0.3,\n",
+       "max_steps=-1,\n",
+       "metric_for_best_model=None,\n",
+       "mp_parameters=,\n",
+       "neftune_noise_alpha=None,\n",
+       "no_cuda=False,\n",
+       "num_train_epochs=1,\n",
+       "optim=OptimizerNames.PAGED_ADAMW,\n",
+       "optim_args=None,\n",
+       "optim_target_modules=None,\n",
+       "output_dir=./results,\n",
+       "overwrite_output_dir=False,\n",
+       "past_index=-1,\n",
+       "per_device_eval_batch_size=8,\n",
+       "per_device_train_batch_size=2,\n",
+       "prediction_loss_only=False,\n",
+       "push_to_hub=False,\n",
+       "push_to_hub_model_id=None,\n",
+       "push_to_hub_organization=None,\n",
+       "push_to_hub_token=<PUSH_TO_HUB_TOKEN>,\n",
+       "ray_scope=last,\n",
+       "remove_unused_columns=True,\n",
+       "report_to=['tensorboard'],\n",
+       "restore_callback_states_from_checkpoint=False,\n",
+       "resume_from_checkpoint=None,\n",
+       "run_name=./results,\n",
+       "save_on_each_node=False,\n",
+       "save_only_model=False,\n",
+       "save_safetensors=True,\n",
+       "save_steps=25,\n",
+       "save_strategy=IntervalStrategy.STEPS,\n",
+       "save_total_limit=None,\n",
+       "seed=42,\n",
+       "skip_memory_metrics=True,\n",
+       "split_batches=None,\n",
+       "tf32=None,\n",
+       "torch_compile=False,\n",
+       "torch_compile_backend=None,\n",
+       "torch_compile_mode=None,\n",
+       "torch_empty_cache_steps=None,\n",
+       "torchdynamo=None,\n",
+       "tpu_metrics_debug=False,\n",
+       "tpu_num_cores=None,\n",
+       "use_cpu=False,\n",
+       "use_ipex=False,\n",
+       "use_legacy_prediction_loop=False,\n",
+       "use_liger_kernel=False,\n",
+       "use_mps_device=False,\n",
+       "warmup_ratio=0.03,\n",
+       "warmup_steps=0,\n",
+       "weight_decay=0.001,\n",
+       ")"
+      ]
+     },
+     "execution_count": 39,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Set training parameters\n",
+    "# Forward the TrainingArguments-related settings from hyperparams.yaml. The two .get() lookups\n",
+    "# cover keys that were previously declared in the YAML but never passed on; their fallbacks are\n",
+    "# the library defaults, so older config files without these keys behave as before.\n",
+    "training_arguments = TrainingArguments(\n",
+    "    output_dir=hyperparams['output_dir'],\n",
+    "    num_train_epochs=hyperparams['num_train_epochs'],\n",
+    "    per_device_train_batch_size=hyperparams['per_device_train_batch_size'],\n",
+    "    per_device_eval_batch_size=hyperparams.get('per_device_eval_batch_size', 8),\n",
+    "    gradient_accumulation_steps=hyperparams['gradient_accumulation_steps'],\n",
+    "    gradient_checkpointing=hyperparams.get('gradient_checkpointing', False),\n",
+    "    optim=hyperparams['optimizer'],\n",
+    "    save_steps=hyperparams['save_steps'],\n",
+    "    logging_steps=hyperparams['logging_steps'],\n",
+    "    # PyYAML reads `2e-4` (no decimal point) as a string, hence the explicit float().\n",
+    "    learning_rate=float(hyperparams['learning_rate']),\n",
+    "    weight_decay=hyperparams['weight_decay'],\n",
+    "    fp16=hyperparams['fp16'],\n",
+    "    bf16=hyperparams['bf16'],\n",
+    "    max_grad_norm=hyperparams['max_grad_norm'],\n",
+    "    max_steps=hyperparams['max_steps'],\n",
+    "    warmup_ratio=hyperparams['warmup_ratio'],\n",
+    "    group_by_length=hyperparams['group_by_length'],\n",
+    "    lr_scheduler_type=hyperparams['lr_scheduler_type'],\n",
+    "    report_to=\"tensorboard\",\n",
+    ")\n",
+    "# Display the resolved arguments as the cell output.\n",
+    "training_arguments"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 40,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "f:\\TADBot\\.venv\\Lib\\site-packages\\huggingface_hub\\utils\\_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': dataset_text_field, max_seq_length, packing. Will not be supported from version '0.13.0'.\n",
+      "\n",
+      "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n",
+      "  warnings.warn(message, FutureWarning)\n",
+      "f:\\TADBot\\.venv\\Lib\\site-packages\\trl\\trainer\\sft_trainer.py:212: UserWarning: You passed a `packing` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+      "  warnings.warn(\n",
+      "f:\\TADBot\\.venv\\Lib\\site-packages\\trl\\trainer\\sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+      "  warnings.warn(\n",
+      "f:\\TADBot\\.venv\\Lib\\site-packages\\trl\\trainer\\sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n",
+      "  warnings.warn(\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Build the supervised fine-tuning trainer from the model, dataset, LoRA config and\n",
+    "# training arguments created in the earlier cells; samples are read from the \"text\" column.\n",
+    "# NOTE(review): the recorded output warns that dataset_text_field, max_seq_length and packing\n",
+    "# are deprecated on SFTTrainer and should be set through SFTConfig - migrate before upgrading TRL.\n",
+    "trainer = SFTTrainer(\n",
+    "    model=model,\n",
+    "    train_dataset=dataset,\n",
+    "    peft_config=peft_config,\n",
+    "    dataset_text_field=\"text\",\n",
+    "    # formatting_func=format_prompts_fn,\n",
+    "    max_seq_length=hyperparams['max_seq_length'],\n",
+    "    tokenizer=tokenizer,\n",
+    "    args=training_arguments,\n",
+    "    packing=hyperparams['packing'],\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "0033f5bb31a7416facfd8a3fd3bd5ad1",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/1340 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'loss': 3.8879, 'grad_norm': 18.030195236206055, 'learning_rate': 0.0002, 'epoch': 0.02}\n",
+      "{'loss': 2.9569, 'grad_norm': 9.667036056518555, 'learning_rate': 0.0002, 'epoch': 0.04}\n",
+      "{'loss': 2.6361, 'grad_norm': 9.089476585388184, 'learning_rate': 0.0002, 'epoch': 0.06}\n",
+      "{'loss': 2.9523, 'grad_norm': 6.053662300109863, 'learning_rate': 0.0002, 'epoch': 0.07}\n",
+      "{'loss': 2.8543, 'grad_norm': 7.764152526855469, 'learning_rate': 0.0002, 'epoch': 0.09}\n",
+      "{'loss': 2.8802, 'grad_norm': 6.539248466491699, 'learning_rate': 0.0002, 'epoch': 0.11}\n",
+      "{'loss': 2.7047, 'grad_norm': 5.485109329223633, 'learning_rate': 0.0002, 'epoch': 0.13}\n",
+      "{'loss': 2.6576, 'grad_norm': 9.22624397277832, 'learning_rate': 0.0002, 'epoch': 0.15}\n",
+      "{'loss': 2.7756, 'grad_norm': 6.477100372314453, 'learning_rate': 0.0002, 'epoch': 0.17}\n",
+      "{'loss': 2.7012, 'grad_norm': 5.891603946685791, 'learning_rate': 0.0002, 'epoch': 0.19}\n",
+      "{'loss': 2.5026, 'grad_norm': 5.75968599319458, 'learning_rate': 0.0002, 'epoch': 0.21}\n",
+      "{'loss': 2.8085, 'grad_norm': 7.938610076904297, 'learning_rate': 0.0002, 'epoch': 0.22}\n",
+      "{'loss': 2.5286, 'grad_norm': 5.600504398345947, 'learning_rate': 0.0002, 'epoch': 0.24}\n",
+      "{'loss': 2.5495, 'grad_norm': 6.746212005615234, 'learning_rate': 0.0002, 'epoch': 0.26}\n",
+      "{'loss': 2.7405, 'grad_norm': 3.8923749923706055, 'learning_rate': 0.0002, 'epoch': 0.28}\n",
+      "{'loss': 2.5657, 'grad_norm': 5.949460506439209, 'learning_rate': 0.0002, 'epoch': 0.3}\n",
+      "{'loss': 2.6052, 'grad_norm': 5.733223915100098, 'learning_rate': 0.0002, 'epoch': 0.32}\n",
+      "{'loss': 2.673, 'grad_norm': 6.0587310791015625, 'learning_rate': 0.0002, 'epoch': 0.34}\n",
+      "{'loss': 2.4631, 'grad_norm': 4.734077453613281, 'learning_rate': 0.0002, 'epoch': 0.35}\n",
+      "{'loss': 2.7288, 'grad_norm': 6.7847700119018555, 'learning_rate': 0.0002, 'epoch': 0.37}\n",
+      "{'loss': 2.7797, 'grad_norm': 5.118943214416504, 'learning_rate': 0.0002, 'epoch': 0.39}\n",
+      "{'loss': 2.8644, 'grad_norm': 5.4167304039001465, 'learning_rate': 0.0002, 'epoch': 0.41}\n",
+      "{'loss': 2.5779, 'grad_norm': 6.73247766494751, 'learning_rate': 0.0002, 'epoch': 0.43}\n",
+      "{'loss': 2.6459, 'grad_norm': 4.644010066986084, 'learning_rate': 0.0002, 'epoch': 0.45}\n",
+      "{'loss': 2.5321, 'grad_norm': 6.347738265991211, 'learning_rate': 0.0002, 'epoch': 0.47}\n",
+      "{'loss': 2.6865, 'grad_norm': 5.185911655426025, 'learning_rate': 0.0002, 'epoch': 0.49}\n",
+      "{'loss': 2.4668, 'grad_norm': 5.355742454528809, 'learning_rate': 0.0002, 'epoch': 0.5}\n",
+      "{'loss': 2.8465, 'grad_norm': 5.4434380531311035, 'learning_rate': 0.0002, 'epoch': 0.52}\n",
+      "{'loss': 2.7376, 'grad_norm': 4.8459882736206055, 'learning_rate': 0.0002, 'epoch': 0.54}\n",
+      "{'loss': 2.5205, 'grad_norm': 5.886116981506348, 'learning_rate': 0.0002, 'epoch': 0.56}\n",
+      "{'loss': 2.7473, 'grad_norm': 4.946981906890869, 'learning_rate': 0.0002, 'epoch': 0.58}\n",
+      "{'loss': 2.6824, 'grad_norm': 6.349016189575195, 'learning_rate': 0.0002, 'epoch': 0.6}\n",
+      "{'loss': 2.6485, 'grad_norm': 5.024953365325928, 'learning_rate': 0.0002, 'epoch': 0.62}\n",
+      "{'loss': 2.7172, 'grad_norm': 5.583380222320557, 'learning_rate': 0.0002, 'epoch': 0.63}\n",
+      "{'loss': 2.5879, 'grad_norm': 6.582890033721924, 'learning_rate': 0.0002, 'epoch': 0.65}\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Run fine-tuning, then save the trained model under the name given by new_model_name.\n",
+    "# NOTE(review): because a peft_config was supplied, this presumably writes only the LoRA adapter\n",
+    "# weights, and the tokenizer is not saved here - confirm that matches what inference expects.\n",
+    "trainer.train()\n",
+    "trainer.model.save_pretrained(hyperparams['new_model_name'])"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

Gemma2_2B/finetune.py DELETED Viewed

File without changes

Gemma2_2B/hyperparams.yaml ADDED Viewed

	@@ -0,0 +1,34 @@

+# Hyperparameters for finetune.ipynb, which loads this file into the `hyperparams` dict.
+# --- Model: checkpoint to load and the name the fine-tuned model is saved under ---
+model_name: "google/gemma-2-2b-it"
+new_model_name: "gemma-2-2b-ft"
+# --- LoRA (passed to LoraConfig) ---
+lora_r: 4
+lora_alpha: 16
+lora_dropout: 0.1
+# --- Quantization (passed to BitsAndBytesConfig) ---
+use_4bit: True
+# Name of a torch dtype attribute; the notebook resolves it with getattr(torch, ...).
+bnb_4bit_compute_dtype: "float16"
+bnb_4bit_quant_type: "nf4"
+use_nested_quant: False
+# --- Training (TrainingArguments) ---
+output_dir: "./results"
+num_train_epochs: 1
+fp16: False
+# The notebook may overwrite bf16 at runtime after checking the GPU's compute capability.
+bf16: False
+per_device_train_batch_size: 2
+per_device_eval_batch_size: 2
+gradient_accumulation_steps: 1
+gradient_checkpointing: True
+max_grad_norm: 0.3
+# The notebook applies float(...) to this value before passing it on.
+learning_rate: 2e-4
+weight_decay: 0.001
+optimizer: "paged_adamw_32bit"
+lr_scheduler_type: "constant"
+max_steps: -1
+warmup_ratio: 0.03
+group_by_length: True
+save_steps: 25
+logging_steps: 25
+# --- SFTTrainer ---
+max_seq_length: 40
+packing: True
+# --- Device placement for from_pretrained ---
+device_map: "auto"

Gemma2_2B/inference.ipynb ADDED Viewed

	@@ -0,0 +1,303 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from huggingface_hub import login\n",
+    "from dotenv import load_dotenv\n",
+    "import os\n",
+    "load_dotenv()\n",
+    "\n",
+    "# Login to Hugging Face Hub\n",
+    "login(token=os.getenv(\"HUGGINGFACE_TOKEN\"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "d00ec085003e409d906784abc1f89dc1",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "config.json:   0%|          | 0.00/838 [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "f:\\TADBot\\.venv\\Lib\\site-packages\\huggingface_hub\\file_download.py:139: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in F:\\TADBot\\Gemma2_2B\\.cache\\models--google--gemma-2-2b-it. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.\n",
+      "To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n",
+      "  warnings.warn(message)\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "bdee67c51d7547a48e45f17db7fb3734",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "ad86eff32cc1447486e69c5f5f90e4a4",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "78cab016a2d54731a94ef45e85d65ddd",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "52b50ff81d0d481ab475878606935162",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00002-of-00002.safetensors:   0%|          | 0.00/241M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "7dfed61b7e0a4338aee7ad14df4d85ca",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "9ac1e6a0b72a44d3a8a648bce2138c3d",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "f0129c204a454f22968aebe59b75ea1a",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "ca55b303b11347cbbf5970327d2d8a82",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "33601521ca8544e7a98c88506257dd20",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "f353232bbf6b4da3ac62e02fa7f58990",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "from transformers import AutoTokenizer, AutoModelForCausalLM\n",
+    "# NOTE(review): this loads the base Hub checkpoint named below, not the fine-tuned model that\n",
+    "# finetune.ipynb saves - confirm whether the adapter should be loaded here as well.\n",
+    "model_name = \"google/gemma-2-2b-it\"\n",
+    "# Weights and tokenizer files are cached under .cache/ relative to the working directory.\n",
+    "model = AutoModelForCausalLM.from_pretrained(model_name, device_map=\"auto\", cache_dir=\".cache/\")\n",
+    "tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=\".cache/\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Gemma2ForCausalLM(\n",
+      "  (model): Gemma2Model(\n",
+      "    (embed_tokens): Embedding(256000, 2304, padding_idx=0)\n",
+      "    (layers): ModuleList(\n",
+      "      (0-25): 26 x Gemma2DecoderLayer(\n",
+      "        (self_attn): Gemma2Attention(\n",
+      "          (q_proj): Linear(in_features=2304, out_features=2048, bias=False)\n",
+      "          (k_proj): Linear(in_features=2304, out_features=1024, bias=False)\n",
+      "          (v_proj): Linear(in_features=2304, out_features=1024, bias=False)\n",
+      "          (o_proj): Linear(in_features=2048, out_features=2304, bias=False)\n",
+      "          (rotary_emb): Gemma2RotaryEmbedding()\n",
+      "        )\n",
+      "        (mlp): Gemma2MLP(\n",
+      "          (gate_proj): Linear(in_features=2304, out_features=9216, bias=False)\n",
+      "          (up_proj): Linear(in_features=2304, out_features=9216, bias=False)\n",
+      "          (down_proj): Linear(in_features=9216, out_features=2304, bias=False)\n",
+      "          (act_fn): PytorchGELUTanh()\n",
+      "        )\n",
+      "        (input_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)\n",
+      "        (pre_feedforward_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)\n",
+      "        (post_feedforward_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)\n",
+      "        (post_attention_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)\n",
+      "      )\n",
+      "    )\n",
+      "    (norm): Gemma2RMSNorm((2304,), eps=1e-06)\n",
+      "  )\n",
+      "  (lm_head): Linear(in_features=2304, out_features=256000, bias=False)\n",
+      ")\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(model)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<bos>What should I do on a trip to Europe?\n",
+      "\n",
+      "That's a great question!  To give you the best advice, I need a little more information. Tell me about:\n",
+      "\n",
+      "**1. Your Interests:** \n",
+      "   * What kind of things do you enjoy doing? (History, art, food, nightlife, nature, adventure, relaxation, etc.)\n",
+      "   * Are there any specific places or activities you've always wanted to experience?\n",
+      "\n",
+      "**2. Your Travel Style:**\n",
+      "   * Do you prefer to travel on your own, with a partner, or with a group?\n",
+      "   * Do you like to plan everything in advance or be more spontaneous?\n",
+      "   * What's your budget like?\n",
+      "\n",
+      "**3. Your Trip Details:**\n",
+      "   * How long will you be traveling for?\n",
+      "   * What time of year are you planning to go?\n",
+      "   * Do you have any specific destinations in mind?\n",
+      "\n",
+      "Once I have this information, I can give you personalized recommendations for your European adventure! \n",
+      "<end_of_turn>\n",
+      "CPU times: total: 7.23 s\n",
+      "Wall time: 7.56 s\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "input_text = \"What should I do on a trip to Europe?\"\n",
+    "\n",
+    "# Move the encoded prompt to the device the model was placed on (device_map=\"auto\") instead\n",
+    "# of assuming CUDA, so the cell also runs when the model ends up on another device.\n",
+    "input_ids = tokenizer(input_text, return_tensors=\"pt\").to(model.device)\n",
+    "# max_length bounds the total sequence: prompt tokens plus generated tokens.\n",
+    "outputs = model.generate(**input_ids, max_length=2048)\n",
+    "print(tokenizer.decode(outputs[0]))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

Gemma2_2B/inference.py DELETED Viewed

File without changes

pyproject.toml CHANGED Viewed

@@ -25,6 +25,7 @@ dependencies = [
     "python-dotenv>=1.0.1",
     "ipykernel>=6.29.5",
     "ipywidgets>=8.1.5",
 ]
 [tool.uv.sources]

     "python-dotenv>=1.0.1",
     "ipykernel>=6.29.5",
     "ipywidgets>=8.1.5",
+    "pyyaml>=6.0.2",
 ]
 [tool.uv.sources]

uv.lock CHANGED Viewed

@@ -800,6 +800,7 @@ dependencies = [
     { name = "numpy", marker = "(platform_machine != 'aarch64' and python_full_version >= '3.12') or (platform_system != 'Linux' and python_full_version >= '3.12') or platform_system == 'Darwin' or (platform_machine == 'aarch64' and platform_system == 'Linux')" },
     { name = "peft", marker = "(platform_machine != 'aarch64' and python_full_version >= '3.12') or (platform_system != 'Linux' and python_full_version >= '3.12') or platform_system == 'Darwin' or (platform_machine == 'aarch64' and platform_system == 'Linux')" },
     { name = "python-dotenv", marker = "(platform_machine != 'aarch64' and python_full_version >= '3.12') or (platform_system != 'Linux' and python_full_version >= '3.12') or platform_system == 'Darwin' or (platform_machine == 'aarch64' and platform_system == 'Linux')" },
     { name = "ruff", marker = "(platform_machine != 'aarch64' and python_full_version >= '3.12') or (platform_system != 'Linux' and python_full_version >= '3.12') or platform_system == 'Darwin' or (platform_machine == 'aarch64' and platform_system == 'Linux')" },
     { name = "tensorboard", marker = "(platform_machine != 'aarch64' and python_full_version >= '3.12') or (platform_system != 'Linux' and python_full_version >= '3.12') or platform_system == 'Darwin' or (platform_machine == 'aarch64' and platform_system == 'Linux')" },
     { name = "thop", marker = "(platform_machine != 'aarch64' and python_full_version >= '3.12') or (platform_system != 'Linux' and python_full_version >= '3.12') or platform_system == 'Darwin' or (platform_machine == 'aarch64' and platform_system == 'Linux')" },
@@ -836,6 +837,7 @@ requires-dist = [
     { name = "numpy", specifier = ">=1.26.4" },
     { name = "peft", specifier = ">=0.13.2" },
     { name = "python-dotenv", specifier = ">=1.0.1" },
     { name = "ruff", specifier = ">=0.7.3" },
     { name = "tensorboard", specifier = ">=2.18.0" },
     { name = "thop", specifier = ">=0.1.1.post2209072238" },

     { name = "numpy", marker = "(platform_machine != 'aarch64' and python_full_version >= '3.12') or (platform_system != 'Linux' and python_full_version >= '3.12') or platform_system == 'Darwin' or (platform_machine == 'aarch64' and platform_system == 'Linux')" },
     { name = "peft", marker = "(platform_machine != 'aarch64' and python_full_version >= '3.12') or (platform_system != 'Linux' and python_full_version >= '3.12') or platform_system == 'Darwin' or (platform_machine == 'aarch64' and platform_system == 'Linux')" },
     { name = "python-dotenv", marker = "(platform_machine != 'aarch64' and python_full_version >= '3.12') or (platform_system != 'Linux' and python_full_version >= '3.12') or platform_system == 'Darwin' or (platform_machine == 'aarch64' and platform_system == 'Linux')" },
+    { name = "pyyaml", marker = "(platform_machine != 'aarch64' and python_full_version >= '3.12') or (platform_system != 'Linux' and python_full_version >= '3.12') or platform_system == 'Darwin' or (platform_machine == 'aarch64' and platform_system == 'Linux')" },
     { name = "ruff", marker = "(platform_machine != 'aarch64' and python_full_version >= '3.12') or (platform_system != 'Linux' and python_full_version >= '3.12') or platform_system == 'Darwin' or (platform_machine == 'aarch64' and platform_system == 'Linux')" },
     { name = "tensorboard", marker = "(platform_machine != 'aarch64' and python_full_version >= '3.12') or (platform_system != 'Linux' and python_full_version >= '3.12') or platform_system == 'Darwin' or (platform_machine == 'aarch64' and platform_system == 'Linux')" },
     { name = "thop", marker = "(platform_machine != 'aarch64' and python_full_version >= '3.12') or (platform_system != 'Linux' and python_full_version >= '3.12') or platform_system == 'Darwin' or (platform_machine == 'aarch64' and platform_system == 'Linux')" },
     { name = "numpy", specifier = ">=1.26.4" },
     { name = "peft", specifier = ">=0.13.2" },
     { name = "python-dotenv", specifier = ">=1.0.1" },
+    { name = "pyyaml", specifier = ">=6.0.2" },
     { name = "ruff", specifier = ">=0.7.3" },
     { name = "tensorboard", specifier = ">=2.18.0" },
     { name = "thop", specifier = ">=0.1.1.post2209072238" },