Spaces:
Sleeping
Sleeping
S-MurilloG
commited on
Commit
•
af6a4a2
1
Parent(s):
a25f6f4
Creating Training File
Browse files- CARSE_00_Cleaning.ipynb +23 -66
CARSE_00_Cleaning.ipynb
CHANGED
@@ -21,7 +21,7 @@
|
|
21 |
},
|
22 |
{
|
23 |
"cell_type": "code",
|
24 |
-
"execution_count":
|
25 |
"metadata": {},
|
26 |
"outputs": [],
|
27 |
"source": [
|
@@ -64,7 +64,7 @@
|
|
64 |
},
|
65 |
{
|
66 |
"cell_type": "code",
|
67 |
-
"execution_count":
|
68 |
"metadata": {},
|
69 |
"outputs": [
|
70 |
{
|
@@ -115,7 +115,7 @@
|
|
115 |
},
|
116 |
{
|
117 |
"cell_type": "code",
|
118 |
-
"execution_count":
|
119 |
"metadata": {},
|
120 |
"outputs": [],
|
121 |
"source": [
|
@@ -152,7 +152,7 @@
|
|
152 |
},
|
153 |
{
|
154 |
"cell_type": "code",
|
155 |
-
"execution_count":
|
156 |
"metadata": {},
|
157 |
"outputs": [
|
158 |
{
|
@@ -209,7 +209,7 @@
|
|
209 |
},
|
210 |
{
|
211 |
"cell_type": "code",
|
212 |
-
"execution_count":
|
213 |
"metadata": {},
|
214 |
"outputs": [],
|
215 |
"source": [
|
@@ -230,7 +230,7 @@
|
|
230 |
},
|
231 |
{
|
232 |
"cell_type": "code",
|
233 |
-
"execution_count":
|
234 |
"metadata": {},
|
235 |
"outputs": [
|
236 |
{
|
@@ -288,7 +288,7 @@
|
|
288 |
},
|
289 |
{
|
290 |
"cell_type": "code",
|
291 |
-
"execution_count":
|
292 |
"metadata": {},
|
293 |
"outputs": [],
|
294 |
"source": [
|
@@ -345,7 +345,7 @@
|
|
345 |
},
|
346 |
{
|
347 |
"cell_type": "code",
|
348 |
-
"execution_count":
|
349 |
"metadata": {},
|
350 |
"outputs": [
|
351 |
{
|
@@ -461,7 +461,7 @@
|
|
461 |
"10 Vale mi amor, disfruta tu baño\\nSabes que me e... "
|
462 |
]
|
463 |
},
|
464 |
-
"execution_count":
|
465 |
"metadata": {},
|
466 |
"output_type": "execute_result"
|
467 |
}
|
@@ -505,7 +505,7 @@
|
|
505 |
},
|
506 |
{
|
507 |
"cell_type": "code",
|
508 |
-
"execution_count":
|
509 |
"metadata": {},
|
510 |
"outputs": [],
|
511 |
"source": [
|
@@ -529,7 +529,7 @@
|
|
529 |
},
|
530 |
{
|
531 |
"cell_type": "code",
|
532 |
-
"execution_count":
|
533 |
"metadata": {},
|
534 |
"outputs": [
|
535 |
{
|
@@ -648,7 +648,7 @@
|
|
648 |
"[670 rows x 2 columns]"
|
649 |
]
|
650 |
},
|
651 |
-
"execution_count":
|
652 |
"metadata": {},
|
653 |
"output_type": "execute_result"
|
654 |
}
|
@@ -667,7 +667,7 @@
|
|
667 |
},
|
668 |
{
|
669 |
"cell_type": "code",
|
670 |
-
"execution_count":
|
671 |
"metadata": {},
|
672 |
"outputs": [],
|
673 |
"source": [
|
@@ -683,7 +683,7 @@
|
|
683 |
},
|
684 |
{
|
685 |
"cell_type": "code",
|
686 |
-
"execution_count":
|
687 |
"metadata": {},
|
688 |
"outputs": [
|
689 |
{
|
@@ -827,7 +827,7 @@
|
|
827 |
"[670 rows x 3 columns]"
|
828 |
]
|
829 |
},
|
830 |
-
"execution_count":
|
831 |
"metadata": {},
|
832 |
"output_type": "execute_result"
|
833 |
}
|
@@ -840,7 +840,7 @@
|
|
840 |
},
|
841 |
{
|
842 |
"cell_type": "code",
|
843 |
-
"execution_count":
|
844 |
"metadata": {},
|
845 |
"outputs": [],
|
846 |
"source": [
|
@@ -853,7 +853,7 @@
|
|
853 |
},
|
854 |
{
|
855 |
"cell_type": "code",
|
856 |
-
"execution_count":
|
857 |
"metadata": {},
|
858 |
"outputs": [
|
859 |
{
|
@@ -997,7 +997,7 @@
|
|
997 |
"[670 rows x 3 columns]"
|
998 |
]
|
999 |
},
|
1000 |
-
"execution_count":
|
1001 |
"metadata": {},
|
1002 |
"output_type": "execute_result"
|
1003 |
}
|
@@ -1010,7 +1010,7 @@
|
|
1010 |
},
|
1011 |
{
|
1012 |
"cell_type": "code",
|
1013 |
-
"execution_count":
|
1014 |
"metadata": {},
|
1015 |
"outputs": [
|
1016 |
{
|
@@ -1154,7 +1154,7 @@
|
|
1154 |
"[670 rows x 3 columns]"
|
1155 |
]
|
1156 |
},
|
1157 |
-
"execution_count":
|
1158 |
"metadata": {},
|
1159 |
"output_type": "execute_result"
|
1160 |
}
|
@@ -1174,7 +1174,7 @@
|
|
1174 |
},
|
1175 |
{
|
1176 |
"cell_type": "code",
|
1177 |
-
"execution_count":
|
1178 |
"metadata": {},
|
1179 |
"outputs": [],
|
1180 |
"source": [
|
@@ -1198,7 +1198,7 @@
|
|
1198 |
},
|
1199 |
{
|
1200 |
"cell_type": "code",
|
1201 |
-
"execution_count":
|
1202 |
"metadata": {},
|
1203 |
"outputs": [],
|
1204 |
"source": [
|
@@ -1207,16 +1207,9 @@
|
|
1207 |
"crear_json(chat_df,nombre_json)\n"
|
1208 |
]
|
1209 |
},
|
1210 |
-
{
|
1211 |
-
"cell_type": "markdown",
|
1212 |
-
"metadata": {},
|
1213 |
-
"source": [
|
1214 |
-
"### Separación del File de entrenamiento en partes iguales"
|
1215 |
-
]
|
1216 |
-
},
|
1217 |
{
|
1218 |
"cell_type": "code",
|
1219 |
-
"execution_count":
|
1220 |
"metadata": {},
|
1221 |
"outputs": [
|
1222 |
{
|
@@ -1240,42 +1233,6 @@
|
|
1240 |
" print(\"Archivo no encontrado. Por favor verifica la ruta.\")"
|
1241 |
]
|
1242 |
},
|
1243 |
-
{
|
1244 |
-
"cell_type": "code",
|
1245 |
-
"execution_count": 2,
|
1246 |
-
"metadata": {},
|
1247 |
-
"outputs": [],
|
1248 |
-
"source": [
|
1249 |
-
"def dividir_jsonl(ruta_archivo, num_partes):\n",
|
1250 |
-
" # Leer todas las líneas del archivo\n",
|
1251 |
-
" with open(ruta_archivo, 'r', encoding='utf-8') as file:\n",
|
1252 |
-
" lineas = file.readlines()\n",
|
1253 |
-
"\n",
|
1254 |
-
" # Calcular el tamaño de cada parte\n",
|
1255 |
-
" tamano_parte = len(lineas) // num_partes\n",
|
1256 |
-
"\n",
|
1257 |
-
" for i in range(num_partes):\n",
|
1258 |
-
" # Calcular el inicio y el fin de cada parte\n",
|
1259 |
-
" inicio = i * tamano_parte\n",
|
1260 |
-
" fin = (i + 1) * tamano_parte if i != num_partes - 1 else len(lineas)\n",
|
1261 |
-
"\n",
|
1262 |
-
" # Nombre del nuevo archivo\n",
|
1263 |
-
" nombre_nuevo_archivo = ruta_archivo.replace('.jsonl', f'_{i + 1}.jsonl')\n",
|
1264 |
-
"\n",
|
1265 |
-
" # Escribir las líneas en el nuevo archivo\n",
|
1266 |
-
" with open(nombre_nuevo_archivo, 'w', encoding='utf-8') as nuevo_archivo:\n",
|
1267 |
-
" nuevo_archivo.writelines(lineas[inicio:fin])"
|
1268 |
-
]
|
1269 |
-
},
|
1270 |
-
{
|
1271 |
-
"cell_type": "code",
|
1272 |
-
"execution_count": 3,
|
1273 |
-
"metadata": {},
|
1274 |
-
"outputs": [],
|
1275 |
-
"source": [
|
1276 |
-
"dividir_jsonl('Training_Data/Training_Prompts.jsonl', 3)"
|
1277 |
-
]
|
1278 |
-
},
|
1279 |
{
|
1280 |
"cell_type": "code",
|
1281 |
"execution_count": null,
|
|
|
21 |
},
|
22 |
{
|
23 |
"cell_type": "code",
|
24 |
+
"execution_count": 2,
|
25 |
"metadata": {},
|
26 |
"outputs": [],
|
27 |
"source": [
|
|
|
64 |
},
|
65 |
{
|
66 |
"cell_type": "code",
|
67 |
+
"execution_count": 3,
|
68 |
"metadata": {},
|
69 |
"outputs": [
|
70 |
{
|
|
|
115 |
},
|
116 |
{
|
117 |
"cell_type": "code",
|
118 |
+
"execution_count": 4,
|
119 |
"metadata": {},
|
120 |
"outputs": [],
|
121 |
"source": [
|
|
|
152 |
},
|
153 |
{
|
154 |
"cell_type": "code",
|
155 |
+
"execution_count": 5,
|
156 |
"metadata": {},
|
157 |
"outputs": [
|
158 |
{
|
|
|
209 |
},
|
210 |
{
|
211 |
"cell_type": "code",
|
212 |
+
"execution_count": 6,
|
213 |
"metadata": {},
|
214 |
"outputs": [],
|
215 |
"source": [
|
|
|
230 |
},
|
231 |
{
|
232 |
"cell_type": "code",
|
233 |
+
"execution_count": 7,
|
234 |
"metadata": {},
|
235 |
"outputs": [
|
236 |
{
|
|
|
288 |
},
|
289 |
{
|
290 |
"cell_type": "code",
|
291 |
+
"execution_count": 8,
|
292 |
"metadata": {},
|
293 |
"outputs": [],
|
294 |
"source": [
|
|
|
345 |
},
|
346 |
{
|
347 |
"cell_type": "code",
|
348 |
+
"execution_count": 9,
|
349 |
"metadata": {},
|
350 |
"outputs": [
|
351 |
{
|
|
|
461 |
"10 Vale mi amor, disfruta tu baño\\nSabes que me e... "
|
462 |
]
|
463 |
},
|
464 |
+
"execution_count": 9,
|
465 |
"metadata": {},
|
466 |
"output_type": "execute_result"
|
467 |
}
|
|
|
505 |
},
|
506 |
{
|
507 |
"cell_type": "code",
|
508 |
+
"execution_count": 10,
|
509 |
"metadata": {},
|
510 |
"outputs": [],
|
511 |
"source": [
|
|
|
529 |
},
|
530 |
{
|
531 |
"cell_type": "code",
|
532 |
+
"execution_count": 11,
|
533 |
"metadata": {},
|
534 |
"outputs": [
|
535 |
{
|
|
|
648 |
"[670 rows x 2 columns]"
|
649 |
]
|
650 |
},
|
651 |
+
"execution_count": 11,
|
652 |
"metadata": {},
|
653 |
"output_type": "execute_result"
|
654 |
}
|
|
|
667 |
},
|
668 |
{
|
669 |
"cell_type": "code",
|
670 |
+
"execution_count": 12,
|
671 |
"metadata": {},
|
672 |
"outputs": [],
|
673 |
"source": [
|
|
|
683 |
},
|
684 |
{
|
685 |
"cell_type": "code",
|
686 |
+
"execution_count": 13,
|
687 |
"metadata": {},
|
688 |
"outputs": [
|
689 |
{
|
|
|
827 |
"[670 rows x 3 columns]"
|
828 |
]
|
829 |
},
|
830 |
+
"execution_count": 13,
|
831 |
"metadata": {},
|
832 |
"output_type": "execute_result"
|
833 |
}
|
|
|
840 |
},
|
841 |
{
|
842 |
"cell_type": "code",
|
843 |
+
"execution_count": 14,
|
844 |
"metadata": {},
|
845 |
"outputs": [],
|
846 |
"source": [
|
|
|
853 |
},
|
854 |
{
|
855 |
"cell_type": "code",
|
856 |
+
"execution_count": 15,
|
857 |
"metadata": {},
|
858 |
"outputs": [
|
859 |
{
|
|
|
997 |
"[670 rows x 3 columns]"
|
998 |
]
|
999 |
},
|
1000 |
+
"execution_count": 15,
|
1001 |
"metadata": {},
|
1002 |
"output_type": "execute_result"
|
1003 |
}
|
|
|
1010 |
},
|
1011 |
{
|
1012 |
"cell_type": "code",
|
1013 |
+
"execution_count": 16,
|
1014 |
"metadata": {},
|
1015 |
"outputs": [
|
1016 |
{
|
|
|
1154 |
"[670 rows x 3 columns]"
|
1155 |
]
|
1156 |
},
|
1157 |
+
"execution_count": 16,
|
1158 |
"metadata": {},
|
1159 |
"output_type": "execute_result"
|
1160 |
}
|
|
|
1174 |
},
|
1175 |
{
|
1176 |
"cell_type": "code",
|
1177 |
+
"execution_count": 17,
|
1178 |
"metadata": {},
|
1179 |
"outputs": [],
|
1180 |
"source": [
|
|
|
1198 |
},
|
1199 |
{
|
1200 |
"cell_type": "code",
|
1201 |
+
"execution_count": 18,
|
1202 |
"metadata": {},
|
1203 |
"outputs": [],
|
1204 |
"source": [
|
|
|
1207 |
"crear_json(chat_df,nombre_json)\n"
|
1208 |
]
|
1209 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1210 |
{
|
1211 |
"cell_type": "code",
|
1212 |
+
"execution_count": 19,
|
1213 |
"metadata": {},
|
1214 |
"outputs": [
|
1215 |
{
|
|
|
1233 |
" print(\"Archivo no encontrado. Por favor verifica la ruta.\")"
|
1234 |
]
|
1235 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1236 |
{
|
1237 |
"cell_type": "code",
|
1238 |
"execution_count": null,
|