{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Uncomment on a fresh environment; the `OpenAI` client class imported below needs openai>=1.0.\n",
    "# %pip install -q \"openai>=1.0\" python-dotenv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "from dotenv import load_dotenv\n",
    "from openai import OpenAI"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Preparación para Fine-Tuning"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load variables from a local .env file into the process environment.\n",
    "load_dotenv()\n",
    "API_KEY = os.getenv('OPENAI_KEY')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pass the key explicitly: OpenAI() on its own only reads OPENAI_API_KEY, so the value\n",
    "# loaded from OPENAI_KEY was never used. When API_KEY is None the client still falls\n",
    "# back to the OPENAI_API_KEY environment variable.\n",
    "client = OpenAI(api_key=API_KEY)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Separamos en Training y Validation cada file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "def dividir_training_validation(ruta_archivo, proporcion_training=0.8):\n",
    "    \"\"\"Split a JSONL file into sequential training and validation files.\n",
    "\n",
    "    The first ``proporcion_training`` fraction of the non-blank lines is written to\n",
    "    ``<base>_train.jsonl`` and the remainder to ``<base>_val.jsonl``, where ``<base>``\n",
    "    is ``ruta_archivo`` without a trailing ``.jsonl`` extension. Lines keep their\n",
    "    original order (no shuffling is performed).\n",
    "\n",
    "    Args:\n",
    "        ruta_archivo: Path (str) of the source JSONL file, read as UTF-8.\n",
    "        proporcion_training: Fraction of lines, between 0 and 1, assigned to training.\n",
    "\n",
    "    Returns:\n",
    "        Tuple ``(archivo_training, archivo_validation)`` with the two paths written.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If ``proporcion_training`` is outside ``[0, 1]``.\n",
    "    \"\"\"\n",
    "    if not 0 <= proporcion_training <= 1:\n",
    "        raise ValueError('proporcion_training must be between 0 and 1')\n",
    "\n",
    "    # Skip blank lines (they are not JSONL records) and make sure every kept line ends\n",
    "    # with a newline so two records can never be glued together when written out.\n",
    "    with open(ruta_archivo, 'r', encoding='utf-8') as file:\n",
    "        lineas = [linea.rstrip('\\n') + '\\n' for linea in file if linea.strip()]\n",
    "\n",
    "    # Index of the first validation line.\n",
    "    corte = int(len(lineas) * proporcion_training)\n",
    "    lineas_training = lineas[:corte]\n",
    "    lineas_validation = lineas[corte:]\n",
    "\n",
    "    # Strip only a trailing '.jsonl'; str.replace would also rewrite directory names\n",
    "    # that happen to contain '.jsonl' and send the output to the wrong place.\n",
    "    extension = '.jsonl'\n",
    "    if ruta_archivo.endswith(extension):\n",
    "        ruta_archivo_base = ruta_archivo[:-len(extension)]\n",
    "    else:\n",
    "        ruta_archivo_base = ruta_archivo\n",
    "    archivo_training = f'{ruta_archivo_base}_train.jsonl'\n",
    "    archivo_validation = f'{ruta_archivo_base}_val.jsonl'\n",
    "\n",
    "    # Write the training set.\n",
    "    with open(archivo_training, 'w', encoding='utf-8') as file:\n",
    "        file.writelines(lineas_training)\n",
    "\n",
    "    # Write the validation set.\n",
    "    with open(archivo_validation, 'w', encoding='utf-8') as file:\n",
    "        file.writelines(lineas_validation)\n",
    "\n",
    "    return archivo_training, archivo_validation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
"metadata": {},
   "outputs": [],
   "source": [
    "# Call the function to create the split (kept disabled; writes *_train.jsonl and\n",
    "# *_val.jsonl next to the input file when run).\n",
    "#dividir_training_validation('Training_Data/Training_Prompts.jsonl')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Subimos files de entrenamiento y validación"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Training file: the context manager closes the handle once the upload call returns.\n",
    "with open(\"Training_Data/Training_Prompts.jsonl\", \"rb\") as training_file:\n",
    "    upload_train_response = client.files.create(\n",
    "        file=training_file,\n",
    "        purpose=\"fine-tune\"\n",
    "    )\n",
    "\n",
    "# Validation file (disabled: the job below is created without a validation set).\n",
    "# with open(\"Training_Data/Training_Prompts_val.jsonl\", \"rb\") as validation_file:\n",
    "#     upload_val_response = client.files.create(\n",
    "#         file=validation_file,\n",
    "#         purpose=\"fine-tune\"\n",
    "#     )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training file id:\tfile-exTRUxeCWwPXQRRThzhgWIM3\n"
     ]
    }
   ],
   "source": [
    "# Keep the uploaded file id; it is the handle the fine-tuning job refers to.\n",
    "train_file_id = upload_train_response.id\n",
    "#val_file_id = upload_val_response.id\n",
    "\n",
    "print(f'Training file id:\\t{train_file_id}')\n",
    "#print(f'Validation file id:\\t{val_file_id}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Trabajo de fine-tuning"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create the fine-tuning job on the uploaded training file.\n",
    "# NOTE(review): `model` is an \"ft:...\" id, which appears to be an already fine-tuned\n",
    "# model rather than a base model - confirm that continuing from it is intended.\n",
    "fine_tune_response = client.fine_tuning.jobs.create(\n",
    "    training_file=train_file_id,\n",
    "    #validation_file=val_file_id,\n",
    "    model=\"ft:gpt-3.5-turbo-1106:personal:carse:8U71tg31\",\n",
    "    #suffix=\"CARSE\",\n",
    "    hyperparameters={\n",
    "        \"n_epochs\": 5\n",
    "    }\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
"Fine-tune id:\tftjob-Q2icwONwvwSm87GrJwKWz7iC\n" ] } ], "source": [ "fine_tune_id = fine_tune_response.id\n", "\n", "print(f'Fine-tune id:\\t{fine_tune_id}')" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "FineTuningJob(id='ftjob-Q2icwONwvwSm87GrJwKWz7iC', created_at=1702193022, error=None, fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs=5, batch_size=2, learning_rate_multiplier=2), model='ft:gpt-3.5-turbo-1106:personal:carse:8U71tg31', object='fine_tuning.job', organization_id='org-IXFDgE8ZZcQzb9yKJmEuFxvC', result_files=[], status='running', trained_tokens=None, training_file='file-exTRUxeCWwPXQRRThzhgWIM3', validation_file=None)" ] }, "execution_count": 60, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Retrieve the state of a fine-tune\n", "client.fine_tuning.jobs.retrieve(fine_tune_id)" ] }, { "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "SyncCursorPage[FineTuningJobEvent](data=[FineTuningJobEvent(id='ftevent-croFh7WoWockQEWeBDvXwGiS', created_at=1702194442, level='info', message='Step 701/1875: training loss=1.77', object='fine_tuning.job.event', data={'step': 701, 'train_loss': 1.7663604021072388, 'train_mean_token_accuracy': 0.4166666567325592}, type='metrics'), FineTuningJobEvent(id='ftevent-NP6nA8Xl5wRSbywnScVrjPbo', created_at=1702194263, level='info', message='Step 601/1875: training loss=1.06', object='fine_tuning.job.event', data={'step': 601, 'train_loss': 1.0635706186294556, 'train_mean_token_accuracy': 0.7547169923782349}, type='metrics'), FineTuningJobEvent(id='ftevent-GStfnspE3KFJZcgVJJfwQUym', created_at=1702194085, level='info', message='Step 501/1875: training loss=1.98', object='fine_tuning.job.event', data={'step': 501, 'train_loss': 1.9760814905166626, 'train_mean_token_accuracy': 0.6052631735801697}, type='metrics'), 
FineTuningJobEvent(id='ftevent-t8KEMvBhN4GpbjolgnF4FhxY', created_at=1702193910, level='info', message='Step 401/1875: training loss=1.30', object='fine_tuning.job.event', data={'step': 401, 'train_loss': 1.2954835891723633, 'train_mean_token_accuracy': 0.6326530575752258}, type='metrics'), FineTuningJobEvent(id='ftevent-DEfvjFzsJGHPY4nec8drnQfw', created_at=1702193733, level='info', message='Step 301/1875: training loss=1.69', object='fine_tuning.job.event', data={'step': 301, 'train_loss': 1.6948037147521973, 'train_mean_token_accuracy': 0.4871794879436493}, type='metrics'), FineTuningJobEvent(id='ftevent-MnFJc3qTZ2EvaBBwzLrDJs9v', created_at=1702193558, level='info', message='Step 201/1875: training loss=2.27', object='fine_tuning.job.event', data={'step': 201, 'train_loss': 2.2718074321746826, 'train_mean_token_accuracy': 0.5102040767669678}, type='metrics'), FineTuningJobEvent(id='ftevent-V3xt82MrAL6jryV0UDcDzW1k', created_at=1702193385, level='info', message='Step 101/1875: training loss=1.79', object='fine_tuning.job.event', data={'step': 101, 'train_loss': 1.7939976453781128, 'train_mean_token_accuracy': 0.5}, type='metrics'), FineTuningJobEvent(id='ftevent-v3izLGiuwF9VcFITMFhqOgWz', created_at=1702193215, level='info', message='Step 1/1875: training loss=1.27', object='fine_tuning.job.event', data={'step': 1, 'train_loss': 1.274328351020813, 'train_mean_token_accuracy': 0.75}, type='metrics'), FineTuningJobEvent(id='ftevent-gjF6biaEA6HFpX4zAnreVR14', created_at=1702193115, level='info', message='Fine-tuning job started', object='fine_tuning.job.event', data=None, type='message'), FineTuningJobEvent(id='ftevent-0nQ0YxnmfiSDDboQZPm9h2nG', created_at=1702193114, level='info', message='Files validated, moving job to queued state', object='fine_tuning.job.event', data={}, type='message')], object='list', has_more=True)" ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# List up to 10 events from a fine-tuning job\n", 
"client.fine_tuning.jobs.list_events(fine_tuning_job_id=fine_tune_id, limit=10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Probamos modelo aquí"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "metadata": {},
   "outputs": [],
   "source": [
    "# System prompt sent with the request (runtime text, kept verbatim).\n",
    "system_content = \"CARSE es un chatbot que imita el estilo en que Sebas conversa. Fue creado con amor solo para Carmen. Su objetivo es ofrecer una experiencia de chat divertida y familiar que exprese la personalidad, el humor y el amor de Sebas hacia Carmen.\"\n",
    "\n",
    "# Single-turn smoke test; `temperature` is not sent, so the API default applies.\n",
    "response = client.chat.completions.create(\n",
    "    messages=[\n",
    "        {\"role\": \"system\", \"content\": system_content},\n",
    "        {\"role\": \"user\", \"content\": \"Tengo mucho sueño :(\"},\n",
    "    ],\n",
    "    model=\"ft:gpt-3.5-turbo-1106:personal::8U9RIZTw\",\n",
    "    #temperature=1,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Ay amorcito :(((((\\nIntenta acabar pronto tus cosas para que duermas mucho en casita\n"
     ]
    }
   ],
   "source": [
    "# Show the text of the first (only) returned choice.\n",
    "print(response.choices[0].message.content)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}