{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "decont-ngrams-intro",
   "metadata": {},
   "source": [
    "# Global decontamination n-gram counts\n",
    "\n",
    "Reads the per-dump n-gram counts CSV, sums `count` over every `(ngram, task)` pair, sorts the result by `count` in descending order, and writes it to a second CSV in the same directory."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "initial_id",
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "from pathlib import Path\n",
    "\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "decont-ngrams-config",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Directory holding the input and output CSVs. The default is the original\n",
    "# hard-coded location; set DECONT_DATA_DIR to run the notebook elsewhere.\n",
    "DATA_DIR = Path(os.environ.get(\"DECONT_DATA_DIR\", \"/home/gui/hf_dev/datatrove/blogpost/data\"))\n",
    "PER_DUMP_CSV = DATA_DIR / \"decont_ngrams-per_dump.csv\"\n",
    "GLOBAL_CSV = DATA_DIR / \"decont_ngrams-global.csv\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "decont-ngrams-load",
   "metadata": {},
   "outputs": [],
   "source": [
    "ngrams_per_dump = pd.read_csv(PER_DUMP_CSV)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c691b2709c417bf4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Bind the result to a new name instead of overwriting the loaded frame,\n",
    "# so this cell can be re-run and the per-dump data stays available.\n",
    "ngrams_global = (\n",
    "    ngrams_per_dump\n",
    "    .groupby([\"ngram\", \"task\"], as_index=False)[\"count\"]\n",
    "    .sum()\n",
    "    .sort_values(\"count\", ascending=False)\n",
    ")\n",
    "ngrams_global.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9c0dfcd486f8e260",
   "metadata": {},
   "outputs": [],
   "source": [
    "ngrams_global.to_csv(GLOBAL_CSV, index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}