File size: 1,744 Bytes
77ba698 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 |
{
"cells": [
{
"cell_type": "markdown",
"id": "overview",
"metadata": {},
"source": [
"# Global n-gram counts\n",
"\n",
"Sums the per-dump `count` of every (`ngram`, `task`) pair in `decont_ngrams-per_dump.csv` and writes the totals, largest first, to `decont_ngrams-global.csv`."
]
},
{
"cell_type": "code",
"id": "initial_id",
"metadata": {
"collapsed": true
},
"source": [
"from pathlib import Path\n",
"\n",
"import pandas as pd\n",
"\n",
"# Every path used below derives from DATA_DIR; edit this one line to run the\n",
"# notebook against another checkout or machine.\n",
"DATA_DIR = Path('/home/gui/hf_dev/datatrove/blogpost/data')\n",
"PER_DUMP_CSV = DATA_DIR / 'decont_ngrams-per_dump.csv'\n",
"GLOBAL_CSV = DATA_DIR / 'decont_ngrams-global.csv'"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"id": "load_per_dump",
"metadata": {},
"source": [
"# Raw input; the aggregation below reads the `ngram`, `task` and `count` columns.\n",
"ngrams_per_dump = pd.read_csv(PER_DUMP_CSV)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"id": "c691b2709c417bf4",
"metadata": {},
"source": [
"# One row per (ngram, task) with `count` summed over all input rows, largest\n",
"# totals first. The result gets its own name instead of rebinding the input,\n",
"# so this cell can be re-run without reloading the CSV.\n",
"# NOTE: groupby's default dropna=True silently drops rows whose ngram or task is NaN.\n",
"ngrams_global = (\n",
"    ngrams_per_dump\n",
"    .groupby(['ngram', 'task'], as_index=False)['count']\n",
"    .sum()\n",
"    .sort_values('count', ascending=False)\n",
")"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"id": "9c0dfcd486f8e260",
"metadata": {},
"source": [
"# index=False: the row labels left over from sorting are not written to the file.\n",
"ngrams_global.to_csv(GLOBAL_CSV, index=False)"
],
"execution_count": null,
"outputs": []
},
{
"metadata": {},
"cell_type": "code",
"execution_count": null,
"source": "",
"id": "d5fef0e4bc91a43e",
"outputs": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
|