S-MurilloG committed on
Commit
af6a4a2
1 Parent(s): a25f6f4

Creating Training File

Browse files
Files changed (1) hide show
  1. CARSE_00_Cleaning.ipynb +23 -66
CARSE_00_Cleaning.ipynb CHANGED
@@ -21,7 +21,7 @@
21
  },
22
  {
23
  "cell_type": "code",
24
- "execution_count": 3,
25
  "metadata": {},
26
  "outputs": [],
27
  "source": [
@@ -64,7 +64,7 @@
64
  },
65
  {
66
  "cell_type": "code",
67
- "execution_count": 4,
68
  "metadata": {},
69
  "outputs": [
70
  {
@@ -115,7 +115,7 @@
115
  },
116
  {
117
  "cell_type": "code",
118
- "execution_count": 5,
119
  "metadata": {},
120
  "outputs": [],
121
  "source": [
@@ -152,7 +152,7 @@
152
  },
153
  {
154
  "cell_type": "code",
155
- "execution_count": 6,
156
  "metadata": {},
157
  "outputs": [
158
  {
@@ -209,7 +209,7 @@
209
  },
210
  {
211
  "cell_type": "code",
212
- "execution_count": 7,
213
  "metadata": {},
214
  "outputs": [],
215
  "source": [
@@ -230,7 +230,7 @@
230
  },
231
  {
232
  "cell_type": "code",
233
- "execution_count": 8,
234
  "metadata": {},
235
  "outputs": [
236
  {
@@ -288,7 +288,7 @@
288
  },
289
  {
290
  "cell_type": "code",
291
- "execution_count": 9,
292
  "metadata": {},
293
  "outputs": [],
294
  "source": [
@@ -345,7 +345,7 @@
345
  },
346
  {
347
  "cell_type": "code",
348
- "execution_count": 10,
349
  "metadata": {},
350
  "outputs": [
351
  {
@@ -461,7 +461,7 @@
461
  "10 Vale mi amor, disfruta tu baño\\nSabes que me e... "
462
  ]
463
  },
464
- "execution_count": 10,
465
  "metadata": {},
466
  "output_type": "execute_result"
467
  }
@@ -505,7 +505,7 @@
505
  },
506
  {
507
  "cell_type": "code",
508
- "execution_count": 12,
509
  "metadata": {},
510
  "outputs": [],
511
  "source": [
@@ -529,7 +529,7 @@
529
  },
530
  {
531
  "cell_type": "code",
532
- "execution_count": 13,
533
  "metadata": {},
534
  "outputs": [
535
  {
@@ -648,7 +648,7 @@
648
  "[670 rows x 2 columns]"
649
  ]
650
  },
651
- "execution_count": 13,
652
  "metadata": {},
653
  "output_type": "execute_result"
654
  }
@@ -667,7 +667,7 @@
667
  },
668
  {
669
  "cell_type": "code",
670
- "execution_count": 14,
671
  "metadata": {},
672
  "outputs": [],
673
  "source": [
@@ -683,7 +683,7 @@
683
  },
684
  {
685
  "cell_type": "code",
686
- "execution_count": 15,
687
  "metadata": {},
688
  "outputs": [
689
  {
@@ -827,7 +827,7 @@
827
  "[670 rows x 3 columns]"
828
  ]
829
  },
830
- "execution_count": 15,
831
  "metadata": {},
832
  "output_type": "execute_result"
833
  }
@@ -840,7 +840,7 @@
840
  },
841
  {
842
  "cell_type": "code",
843
- "execution_count": 16,
844
  "metadata": {},
845
  "outputs": [],
846
  "source": [
@@ -853,7 +853,7 @@
853
  },
854
  {
855
  "cell_type": "code",
856
- "execution_count": 17,
857
  "metadata": {},
858
  "outputs": [
859
  {
@@ -997,7 +997,7 @@
997
  "[670 rows x 3 columns]"
998
  ]
999
  },
1000
- "execution_count": 17,
1001
  "metadata": {},
1002
  "output_type": "execute_result"
1003
  }
@@ -1010,7 +1010,7 @@
1010
  },
1011
  {
1012
  "cell_type": "code",
1013
- "execution_count": 18,
1014
  "metadata": {},
1015
  "outputs": [
1016
  {
@@ -1154,7 +1154,7 @@
1154
  "[670 rows x 3 columns]"
1155
  ]
1156
  },
1157
- "execution_count": 18,
1158
  "metadata": {},
1159
  "output_type": "execute_result"
1160
  }
@@ -1174,7 +1174,7 @@
1174
  },
1175
  {
1176
  "cell_type": "code",
1177
- "execution_count": 19,
1178
  "metadata": {},
1179
  "outputs": [],
1180
  "source": [
@@ -1198,7 +1198,7 @@
1198
  },
1199
  {
1200
  "cell_type": "code",
1201
- "execution_count": 20,
1202
  "metadata": {},
1203
  "outputs": [],
1204
  "source": [
@@ -1207,16 +1207,9 @@
1207
  "crear_json(chat_df,nombre_json)\n"
1208
  ]
1209
  },
1210
- {
1211
- "cell_type": "markdown",
1212
- "metadata": {},
1213
- "source": [
1214
- "### Separación del File de entrenamiento en partes iguales"
1215
- ]
1216
- },
1217
  {
1218
  "cell_type": "code",
1219
- "execution_count": 22,
1220
  "metadata": {},
1221
  "outputs": [
1222
  {
@@ -1240,42 +1233,6 @@
1240
  " print(\"Archivo no encontrado. Por favor verifica la ruta.\")"
1241
  ]
1242
  },
1243
- {
1244
- "cell_type": "code",
1245
- "execution_count": 2,
1246
- "metadata": {},
1247
- "outputs": [],
1248
- "source": [
1249
- "def dividir_jsonl(ruta_archivo, num_partes):\n",
1250
- " # Leer todas las líneas del archivo\n",
1251
- " with open(ruta_archivo, 'r', encoding='utf-8') as file:\n",
1252
- " lineas = file.readlines()\n",
1253
- "\n",
1254
- " # Calcular el tamaño de cada parte\n",
1255
- " tamano_parte = len(lineas) // num_partes\n",
1256
- "\n",
1257
- " for i in range(num_partes):\n",
1258
- " # Calcular el inicio y el fin de cada parte\n",
1259
- " inicio = i * tamano_parte\n",
1260
- " fin = (i + 1) * tamano_parte if i != num_partes - 1 else len(lineas)\n",
1261
- "\n",
1262
- " # Nombre del nuevo archivo\n",
1263
- " nombre_nuevo_archivo = ruta_archivo.replace('.jsonl', f'_{i + 1}.jsonl')\n",
1264
- "\n",
1265
- " # Escribir las líneas en el nuevo archivo\n",
1266
- " with open(nombre_nuevo_archivo, 'w', encoding='utf-8') as nuevo_archivo:\n",
1267
- " nuevo_archivo.writelines(lineas[inicio:fin])"
1268
- ]
1269
- },
1270
- {
1271
- "cell_type": "code",
1272
- "execution_count": 3,
1273
- "metadata": {},
1274
- "outputs": [],
1275
- "source": [
1276
- "dividir_jsonl('Training_Data/Training_Prompts.jsonl', 3)"
1277
- ]
1278
- },
1279
  {
1280
  "cell_type": "code",
1281
  "execution_count": null,
 
21
  },
22
  {
23
  "cell_type": "code",
24
+ "execution_count": 2,
25
  "metadata": {},
26
  "outputs": [],
27
  "source": [
 
64
  },
65
  {
66
  "cell_type": "code",
67
+ "execution_count": 3,
68
  "metadata": {},
69
  "outputs": [
70
  {
 
115
  },
116
  {
117
  "cell_type": "code",
118
+ "execution_count": 4,
119
  "metadata": {},
120
  "outputs": [],
121
  "source": [
 
152
  },
153
  {
154
  "cell_type": "code",
155
+ "execution_count": 5,
156
  "metadata": {},
157
  "outputs": [
158
  {
 
209
  },
210
  {
211
  "cell_type": "code",
212
+ "execution_count": 6,
213
  "metadata": {},
214
  "outputs": [],
215
  "source": [
 
230
  },
231
  {
232
  "cell_type": "code",
233
+ "execution_count": 7,
234
  "metadata": {},
235
  "outputs": [
236
  {
 
288
  },
289
  {
290
  "cell_type": "code",
291
+ "execution_count": 8,
292
  "metadata": {},
293
  "outputs": [],
294
  "source": [
 
345
  },
346
  {
347
  "cell_type": "code",
348
+ "execution_count": 9,
349
  "metadata": {},
350
  "outputs": [
351
  {
 
461
  "10 Vale mi amor, disfruta tu baño\\nSabes que me e... "
462
  ]
463
  },
464
+ "execution_count": 9,
465
  "metadata": {},
466
  "output_type": "execute_result"
467
  }
 
505
  },
506
  {
507
  "cell_type": "code",
508
+ "execution_count": 10,
509
  "metadata": {},
510
  "outputs": [],
511
  "source": [
 
529
  },
530
  {
531
  "cell_type": "code",
532
+ "execution_count": 11,
533
  "metadata": {},
534
  "outputs": [
535
  {
 
648
  "[670 rows x 2 columns]"
649
  ]
650
  },
651
+ "execution_count": 11,
652
  "metadata": {},
653
  "output_type": "execute_result"
654
  }
 
667
  },
668
  {
669
  "cell_type": "code",
670
+ "execution_count": 12,
671
  "metadata": {},
672
  "outputs": [],
673
  "source": [
 
683
  },
684
  {
685
  "cell_type": "code",
686
+ "execution_count": 13,
687
  "metadata": {},
688
  "outputs": [
689
  {
 
827
  "[670 rows x 3 columns]"
828
  ]
829
  },
830
+ "execution_count": 13,
831
  "metadata": {},
832
  "output_type": "execute_result"
833
  }
 
840
  },
841
  {
842
  "cell_type": "code",
843
+ "execution_count": 14,
844
  "metadata": {},
845
  "outputs": [],
846
  "source": [
 
853
  },
854
  {
855
  "cell_type": "code",
856
+ "execution_count": 15,
857
  "metadata": {},
858
  "outputs": [
859
  {
 
997
  "[670 rows x 3 columns]"
998
  ]
999
  },
1000
+ "execution_count": 15,
1001
  "metadata": {},
1002
  "output_type": "execute_result"
1003
  }
 
1010
  },
1011
  {
1012
  "cell_type": "code",
1013
+ "execution_count": 16,
1014
  "metadata": {},
1015
  "outputs": [
1016
  {
 
1154
  "[670 rows x 3 columns]"
1155
  ]
1156
  },
1157
+ "execution_count": 16,
1158
  "metadata": {},
1159
  "output_type": "execute_result"
1160
  }
 
1174
  },
1175
  {
1176
  "cell_type": "code",
1177
+ "execution_count": 17,
1178
  "metadata": {},
1179
  "outputs": [],
1180
  "source": [
 
1198
  },
1199
  {
1200
  "cell_type": "code",
1201
+ "execution_count": 18,
1202
  "metadata": {},
1203
  "outputs": [],
1204
  "source": [
 
1207
  "crear_json(chat_df,nombre_json)\n"
1208
  ]
1209
  },
 
 
 
 
 
 
 
1210
  {
1211
  "cell_type": "code",
1212
+ "execution_count": 19,
1213
  "metadata": {},
1214
  "outputs": [
1215
  {
 
1233
  " print(\"Archivo no encontrado. Por favor verifica la ruta.\")"
1234
  ]
1235
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1236
  {
1237
  "cell_type": "code",
1238
  "execution_count": null,