{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# AudioBart captioning: fine-tuning setup on Clotho EnCodec features\n",
    "\n",
    "Loads pre-computed EnCodec token arrays, tokenizes captions with the BART\n",
    "tokenizer, and configures a `Seq2SeqTrainer` for conditional generation.\n",
    "The notebook is written to survive **Restart Kernel → Run All**."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Restrict visible GPUs BEFORE any CUDA-aware library is imported;\n",
    "# setting CUDA_VISIBLE_DEVICES after CUDA initializes has no effect.\n",
    "import os\n",
    "\n",
    "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"4,5,6,7\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# All imports in one cell so a fresh-kernel re-run cannot miss one.\n",
    "import numpy as np\n",
    "from datasets import load_dataset\n",
    "from transformers import (\n",
    "    AutoTokenizer,\n",
    "    Seq2SeqTrainer,\n",
    "    Seq2SeqTrainingArguments,\n",
    ")\n",
    "\n",
    "from data.collator import EncodecCollator\n",
    "from modeling.audiobart import AudioBartForConditionalGeneration\n",
    "from utils import count_parameters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Configuration.\n",
    "BASEPATH = \"/data/jyk/aac_dataset/clotho/encodec/\"  # root of pre-computed EnCodec .npy files\n",
    "# Truncation length for EnCodec frames; presumably 1024 positions minus\n",
    "# 2 special-token slots -- TODO confirm against AudioBart's max positions.\n",
    "MAX_ENCODEC_LEN = 1022\n",
    "\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"facebook/bart-large\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_files = {\"train\": \"csv/train_short.csv\", \"validation\": \"csv/valid_short.csv\"}\n",
    "raw_dataset = load_dataset(\"csv\", data_files=data_files)\n",
    "raw_dataset[\"train\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def preprocessing(example):\n",
    "    \"\"\"Map one CSV row to model inputs.\n",
    "\n",
    "    Loads the EnCodec token array referenced by ``example['file_path']``,\n",
    "    truncates it to ``MAX_ENCODEC_LEN`` frames, and tokenizes the caption\n",
    "    as the decoder target. The encoder attention mask is two entries longer\n",
    "    than the (truncated) array -- presumably covering BOS/EOS positions the\n",
    "    model prepends/appends; verify against AudioBart's embedding code.\n",
    "    \"\"\"\n",
    "    encodec = np.load(os.path.join(BASEPATH, example[\"file_path\"]))\n",
    "    if encodec.shape[0] > MAX_ENCODEC_LEN:\n",
    "        encodec = encodec[:MAX_ENCODEC_LEN, :]\n",
    "    attention_mask = np.ones(encodec.shape[0] + 2)\n",
    "    target_text = tokenizer(text_target=example[\"caption\"])\n",
    "    return {\n",
    "        \"input_ids\": encodec,\n",
    "        \"attention_mask\": attention_mask,\n",
    "        \"labels\": target_text[\"input_ids\"],\n",
    "        \"decoder_attention_mask\": target_text[\"attention_mask\"],\n",
    "    }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_dataset = raw_dataset[\"train\"].map(preprocessing, num_proc=16)\n",
    "train_dataset.set_format(\n",
    "    \"np\", columns=[\"input_ids\", \"attention_mask\", \"labels\", \"decoder_attention_mask\"]\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "valid_dataset = raw_dataset[\"validation\"].map(preprocessing, num_proc=16)\n",
    "valid_dataset.set_format(\n",
    "    \"np\", columns=[\"input_ids\", \"attention_mask\", \"labels\", \"decoder_attention_mask\"]\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The encodec_embeddings weights are newly initialized (not in the BART\n",
    "# checkpoint), so the model must be fine-tuned before inference.\n",
    "model = AudioBartForConditionalGeneration.from_pretrained(\"bart/model\")\n",
    "print(count_parameters(model))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_collator = EncodecCollator(tokenizer=tokenizer, model=model, return_tensors=\"pt\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# `per_gpu_train_batch_size` is deprecated in transformers;\n",
    "# use the per-device argument instead.\n",
    "training_args = Seq2SeqTrainingArguments(\"summary_test\", per_device_train_batch_size=16)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): the original passed `valid_dataset` as `train_dataset`,\n",
    "# leaving the mapped training split unused -- fixed here. If training on\n",
    "# the validation split was a deliberate smoke test, revert this line.\n",
    "trainer = Seq2SeqTrainer(\n",
    "    model,\n",
    "    training_args,\n",
    "    train_dataset=train_dataset,\n",
    "    eval_dataset=valid_dataset,\n",
    "    data_collator=data_collator,\n",
    "    tokenizer=tokenizer,\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}