diff --git "a/decision-trees-tutorial.ipynb" "b/decision-trees-tutorial.ipynb"
new file mode 100644--- /dev/null
+++ "b/decision-trees-tutorial.ipynb"
@@ -0,0 +1 @@
+{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.7.9","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"markdown","source":"## Decision Trees tutorial & improving hosting with skops 🌲\n\nIn this notebook I will walk you through decision trees and how to inspect them, and we will later improve model hosting using [skops](https://skops.readthedocs.io/en/stable/). ","metadata":{}},{"cell_type":"code","source":"# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n# For example, here's several helpful packages to load\n\nimport numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n\n# Input data files are available in the read-only \"../input/\" directory\n# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n\nimport os\nfor dirname, _, filenames in os.walk('/kaggle/input'):\n for filename in filenames:\n print(os.path.join(dirname, filename))\n\n# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using \"Save & Run All\" \n# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current 
session","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","_kg_hide-input":true,"_kg_hide-output":true,"execution":{"iopub.status.busy":"2022-12-01T13:21:07.411748Z","iopub.execute_input":"2022-12-01T13:21:07.412350Z","iopub.status.idle":"2022-12-01T13:21:07.419860Z","shell.execute_reply.started":"2022-12-01T13:21:07.412261Z","shell.execute_reply":"2022-12-01T13:21:07.418325Z"},"trusted":true},"execution_count":1,"outputs":[]},{"cell_type":"code","source":"!pip install skops","metadata":{"_kg_hide-output":true,"execution":{"iopub.status.busy":"2022-12-01T13:21:09.851317Z","iopub.execute_input":"2022-12-01T13:21:09.851890Z","iopub.status.idle":"2022-12-01T13:21:15.803438Z","shell.execute_reply.started":"2022-12-01T13:21:09.851859Z","shell.execute_reply":"2022-12-01T13:21:15.802081Z"},"trusted":true},"execution_count":2,"outputs":[{"name":"stdout","text":"Requirement already satisfied: skops in /opt/conda/lib/python3.7/site-packages (0.3.0)\nRequirement already satisfied: tabulate>=0.8.8 in /opt/conda/lib/python3.7/site-packages (from skops) (0.8.8)\nRequirement already satisfied: typing-extensions>=3.7 in /opt/conda/lib/python3.7/site-packages (from skops) (3.7.4.3)\nRequirement already satisfied: huggingface-hub>=0.10.1 in /opt/conda/lib/python3.7/site-packages (from skops) (0.11.1)\nRequirement already satisfied: scikit-learn>=0.24 in /opt/conda/lib/python3.7/site-packages (from skops) (0.24.1)\nRequirement already satisfied: packaging>=20.9 in /opt/conda/lib/python3.7/site-packages (from huggingface-hub>=0.10.1->skops) (21.3)\nRequirement already satisfied: tqdm in /opt/conda/lib/python3.7/site-packages (from huggingface-hub>=0.10.1->skops) (4.55.1)\nRequirement already satisfied: pyyaml>=5.1 in /opt/conda/lib/python3.7/site-packages (from huggingface-hub>=0.10.1->skops) (5.3.1)\nRequirement already satisfied: requests in /opt/conda/lib/python3.7/site-packages (from huggingface-hub>=0.10.1->skops) 
(2.25.1)\nRequirement already satisfied: importlib-metadata in /opt/conda/lib/python3.7/site-packages (from huggingface-hub>=0.10.1->skops) (3.3.0)\nRequirement already satisfied: filelock in /opt/conda/lib/python3.7/site-packages (from huggingface-hub>=0.10.1->skops) (3.0.12)\nRequirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /opt/conda/lib/python3.7/site-packages (from packaging>=20.9->huggingface-hub>=0.10.1->skops) (2.4.7)\nRequirement already satisfied: scipy>=0.19.1 in /opt/conda/lib/python3.7/site-packages (from scikit-learn>=0.24->skops) (1.5.4)\nRequirement already satisfied: threadpoolctl>=2.0.0 in /opt/conda/lib/python3.7/site-packages (from scikit-learn>=0.24->skops) (2.1.0)\nRequirement already satisfied: joblib>=0.11 in /opt/conda/lib/python3.7/site-packages (from scikit-learn>=0.24->skops) (1.0.0)\nRequirement already satisfied: numpy>=1.13.3 in /opt/conda/lib/python3.7/site-packages (from scikit-learn>=0.24->skops) (1.19.5)\nRequirement already satisfied: zipp>=0.5 in /opt/conda/lib/python3.7/site-packages (from importlib-metadata->huggingface-hub>=0.10.1->skops) (3.4.0)\nRequirement already satisfied: urllib3<1.27,>=1.21.1 in /opt/conda/lib/python3.7/site-packages (from requests->huggingface-hub>=0.10.1->skops) (1.26.2)\nRequirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.7/site-packages (from requests->huggingface-hub>=0.10.1->skops) (2020.12.5)\nRequirement already satisfied: idna<3,>=2.5 in /opt/conda/lib/python3.7/site-packages (from requests->huggingface-hub>=0.10.1->skops) (2.10)\nRequirement already satisfied: chardet<5,>=3.0.2 in /opt/conda/lib/python3.7/site-packages (from requests->huggingface-hub>=0.10.1->skops) (3.0.4)\n\u001b[33mWARNING: You are using pip version 21.0.1; however, version 22.3.1 is available.\nYou should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.\u001b[0m\n","output_type":"stream"}]},{"cell_type":"markdown","source":"We will use breast 
cancer dataset from sklearn datasets. We will load the dataset and split. ","metadata":{}},{"cell_type":"code","source":"from sklearn.datasets import load_breast_cancer\nfrom sklearn.model_selection import train_test_split","metadata":{"execution":{"iopub.status.busy":"2022-12-01T13:21:16.196274Z","iopub.execute_input":"2022-12-01T13:21:16.196592Z","iopub.status.idle":"2022-12-01T13:21:16.523656Z","shell.execute_reply.started":"2022-12-01T13:21:16.196567Z","shell.execute_reply":"2022-12-01T13:21:16.522085Z"},"trusted":true},"execution_count":3,"outputs":[]},{"cell_type":"code","source":"cancer = load_breast_cancer()\ndata = pd.DataFrame(cancer.data, columns=[cancer.feature_names])\ndata.head()","metadata":{"execution":{"iopub.status.busy":"2022-12-01T13:21:16.668054Z","iopub.execute_input":"2022-12-01T13:21:16.668383Z","iopub.status.idle":"2022-12-01T13:21:16.719596Z","shell.execute_reply.started":"2022-12-01T13:21:16.668356Z","shell.execute_reply":"2022-12-01T13:21:16.717624Z"},"trusted":true},"execution_count":4,"outputs":[{"execution_count":4,"output_type":"execute_result","data":{"text/plain":" mean radius mean texture mean perimeter mean area mean smoothness \\\n0 17.99 10.38 122.80 1001.0 0.11840 \n1 20.57 17.77 132.90 1326.0 0.08474 \n2 19.69 21.25 130.00 1203.0 0.10960 \n3 11.42 20.38 77.58 386.1 0.14250 \n4 20.29 14.34 135.10 1297.0 0.10030 \n\n mean compactness mean concavity mean concave points mean symmetry \\\n0 0.27760 0.3001 0.14710 0.2419 \n1 0.07864 0.0869 0.07017 0.1812 \n2 0.15990 0.1974 0.12790 0.2069 \n3 0.28390 0.2414 0.10520 0.2597 \n4 0.13280 0.1980 0.10430 0.1809 \n\n mean fractal dimension ... worst radius worst texture worst perimeter \\\n0 0.07871 ... 25.38 17.33 184.60 \n1 0.05667 ... 24.99 23.41 158.80 \n2 0.05999 ... 23.57 25.53 152.50 \n3 0.09744 ... 14.91 26.50 98.87 \n4 0.05883 ... 
22.54 16.67 152.20 \n\n worst area worst smoothness worst compactness worst concavity \\\n0 2019.0 0.1622 0.6656 0.7119 \n1 1956.0 0.1238 0.1866 0.2416 \n2 1709.0 0.1444 0.4245 0.4504 \n3 567.7 0.2098 0.8663 0.6869 \n4 1575.0 0.1374 0.2050 0.4000 \n\n worst concave points worst symmetry worst fractal dimension \n0 0.2654 0.4601 0.11890 \n1 0.1860 0.2750 0.08902 \n2 0.2430 0.3613 0.08758 \n3 0.2575 0.6638 0.17300 \n4 0.1625 0.2364 0.07678 \n\n[5 rows x 30 columns]","text/html":"
\n\n
\n \n \n \n mean radius \n mean texture \n mean perimeter \n mean area \n mean smoothness \n mean compactness \n mean concavity \n mean concave points \n mean symmetry \n mean fractal dimension \n ... \n worst radius \n worst texture \n worst perimeter \n worst area \n worst smoothness \n worst compactness \n worst concavity \n worst concave points \n worst symmetry \n worst fractal dimension \n \n \n \n \n 0 \n 17.99 \n 10.38 \n 122.80 \n 1001.0 \n 0.11840 \n 0.27760 \n 0.3001 \n 0.14710 \n 0.2419 \n 0.07871 \n ... \n 25.38 \n 17.33 \n 184.60 \n 2019.0 \n 0.1622 \n 0.6656 \n 0.7119 \n 0.2654 \n 0.4601 \n 0.11890 \n \n \n 1 \n 20.57 \n 17.77 \n 132.90 \n 1326.0 \n 0.08474 \n 0.07864 \n 0.0869 \n 0.07017 \n 0.1812 \n 0.05667 \n ... \n 24.99 \n 23.41 \n 158.80 \n 1956.0 \n 0.1238 \n 0.1866 \n 0.2416 \n 0.1860 \n 0.2750 \n 0.08902 \n \n \n 2 \n 19.69 \n 21.25 \n 130.00 \n 1203.0 \n 0.10960 \n 0.15990 \n 0.1974 \n 0.12790 \n 0.2069 \n 0.05999 \n ... \n 23.57 \n 25.53 \n 152.50 \n 1709.0 \n 0.1444 \n 0.4245 \n 0.4504 \n 0.2430 \n 0.3613 \n 0.08758 \n \n \n 3 \n 11.42 \n 20.38 \n 77.58 \n 386.1 \n 0.14250 \n 0.28390 \n 0.2414 \n 0.10520 \n 0.2597 \n 0.09744 \n ... \n 14.91 \n 26.50 \n 98.87 \n 567.7 \n 0.2098 \n 0.8663 \n 0.6869 \n 0.2575 \n 0.6638 \n 0.17300 \n \n \n 4 \n 20.29 \n 14.34 \n 135.10 \n 1297.0 \n 0.10030 \n 0.13280 \n 0.1980 \n 0.10430 \n 0.1809 \n 0.05883 \n ... \n 22.54 \n 16.67 \n 152.20 \n 1575.0 \n 0.1374 \n 0.2050 \n 0.4000 \n 0.1625 \n 0.2364 \n 0.07678 \n \n \n
\n
5 rows × 30 columns
\n
"},"metadata":{}}]},{"cell_type":"code","source":"X = cancer.data\ny = cancer.target\nX_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, \n random_state=42)","metadata":{"execution":{"iopub.status.busy":"2022-12-01T13:21:17.243233Z","iopub.execute_input":"2022-12-01T13:21:17.243595Z","iopub.status.idle":"2022-12-01T13:21:17.251729Z","shell.execute_reply.started":"2022-12-01T13:21:17.243563Z","shell.execute_reply":"2022-12-01T13:21:17.250403Z"},"trusted":true},"execution_count":5,"outputs":[]},{"cell_type":"code","source":"from sklearn.tree import DecisionTreeClassifier\ntree = DecisionTreeClassifier(random_state=0)\ntree.fit(X_train, y_train)","metadata":{"execution":{"iopub.status.busy":"2022-12-01T13:21:18.976344Z","iopub.execute_input":"2022-12-01T13:21:18.976921Z","iopub.status.idle":"2022-12-01T13:21:19.137843Z","shell.execute_reply.started":"2022-12-01T13:21:18.976882Z","shell.execute_reply":"2022-12-01T13:21:19.135814Z"},"trusted":true},"execution_count":6,"outputs":[{"execution_count":6,"output_type":"execute_result","data":{"text/plain":"DecisionTreeClassifier(random_state=0)"},"metadata":{}}]},{"cell_type":"markdown","source":"## Evaluate and Inspect the Model","metadata":{}},{"cell_type":"code","source":"from sklearn.metrics import classification_report\ny_pred = tree.predict(X_test)\nprint(classification_report(y_test, y_pred))","metadata":{"execution":{"iopub.status.busy":"2022-12-01T13:21:19.705292Z","iopub.execute_input":"2022-12-01T13:21:19.705688Z","iopub.status.idle":"2022-12-01T13:21:19.718621Z","shell.execute_reply.started":"2022-12-01T13:21:19.705652Z","shell.execute_reply":"2022-12-01T13:21:19.717510Z"},"trusted":true},"execution_count":7,"outputs":[{"name":"stdout","text":" precision recall f1-score support\n\n 0 0.91 0.92 0.92 53\n 1 0.96 0.94 0.95 90\n\n accuracy 0.94 143\n macro avg 0.93 0.93 0.93 143\nweighted avg 0.94 0.94 0.94 143\n\n","output_type":"stream"}]},{"cell_type":"code","source":"report = 
pd.DataFrame.from_dict(classification_report(y_test, y_pred, output_dict = True))","metadata":{"execution":{"iopub.status.busy":"2022-12-01T13:21:35.456594Z","iopub.execute_input":"2022-12-01T13:21:35.457151Z","iopub.status.idle":"2022-12-01T13:21:35.469930Z","shell.execute_reply.started":"2022-12-01T13:21:35.457116Z","shell.execute_reply":"2022-12-01T13:21:35.468844Z"},"trusted":true},"execution_count":9,"outputs":[]},{"cell_type":"code","source":"print(report)","metadata":{"execution":{"iopub.status.busy":"2022-12-01T13:21:36.065318Z","iopub.execute_input":"2022-12-01T13:21:36.065648Z","iopub.status.idle":"2022-12-01T13:21:36.073465Z","shell.execute_reply.started":"2022-12-01T13:21:36.065622Z","shell.execute_reply":"2022-12-01T13:21:36.072161Z"},"trusted":true},"execution_count":10,"outputs":[{"name":"stdout","text":" 0 1 accuracy macro avg weighted avg\nprecision 0.907407 0.955056 0.937063 0.931232 0.937396\nrecall 0.924528 0.944444 0.937063 0.934486 0.937063\nf1-score 0.915888 0.949721 0.937063 0.932804 0.937181\nsupport 53.000000 90.000000 0.937063 143.000000 143.000000\n","output_type":"stream"}]},{"cell_type":"code","source":"from sklearn.tree import export_graphviz\nexport_graphviz(tree, out_file=\"tree.dot\", class_names=[\"malignant\", \"benign\"],\n feature_names=cancer.feature_names, impurity=False, filled=True)","metadata":{"execution":{"iopub.status.busy":"2022-12-01T13:21:59.435038Z","iopub.execute_input":"2022-12-01T13:21:59.435400Z","iopub.status.idle":"2022-12-01T13:21:59.447564Z","shell.execute_reply.started":"2022-12-01T13:21:59.435368Z","shell.execute_reply":"2022-12-01T13:21:59.446135Z"},"trusted":true},"execution_count":11,"outputs":[]},{"cell_type":"code","source":"import pydot\nimport graphviz\n\n(graph,) = pydot.graph_from_dot_file('tree.dot')\n\nwith open(\"tree.dot\") as f:\n dot_graph = 
f.read()\ndisplay(graphviz.Source(dot_graph))","metadata":{"execution":{"iopub.status.busy":"2022-12-01T13:22:16.395803Z","iopub.execute_input":"2022-12-01T13:22:16.396179Z","iopub.status.idle":"2022-12-01T13:22:16.630158Z","shell.execute_reply.started":"2022-12-01T13:22:16.396144Z","shell.execute_reply":"2022-12-01T13:22:16.628958Z"},"_kg_hide-output":true,"trusted":true},"execution_count":12,"outputs":[{"output_type":"display_data","data":{"text/plain":"","image/svg+xml":"\n\n\n\n\n\nTree \n \n\n\n0 \n \nworst radius <= 16.795 \nsamples = 426 \nvalue = [159, 267] \nclass = benign \n \n\n\n1 \n \nworst concave points <= 0.136 \nsamples = 284 \nvalue = [25, 259] \nclass = benign \n \n\n\n0->1 \n \n \nTrue \n \n\n\n28 \n \ntexture error <= 0.473 \nsamples = 142 \nvalue = [134, 8] \nclass = malignant \n \n\n\n0->28 \n \n \nFalse \n \n\n\n2 \n \nradius error <= 1.048 \nsamples = 252 \nvalue = [4, 248] \nclass = benign \n \n\n\n1->2 \n \n \n \n\n\n17 \n \nworst texture <= 25.62 \nsamples = 32 \nvalue = [21, 11] \nclass = malignant \n \n\n\n1->17 \n \n \n \n\n\n3 \n \nsmoothness error <= 0.003 \nsamples = 251 \nvalue = [3, 248] \nclass = benign \n \n\n\n2->3 \n \n \n \n\n\n16 \n \nsamples = 1 \nvalue = [1, 0] \nclass = malignant \n \n\n\n2->16 \n \n \n \n\n\n4 \n \nmean texture <= 19.9 \nsamples = 4 \nvalue = [1, 3] \nclass = benign \n \n\n\n3->4 \n \n \n \n\n\n7 \n \narea error <= 48.7 \nsamples = 247 \nvalue = [2, 245] \nclass = benign \n \n\n\n3->7 \n \n \n \n\n\n5 \n \nsamples = 3 \nvalue = [0, 3] \nclass = benign \n \n\n\n4->5 \n \n \n \n\n\n6 \n \nsamples = 1 \nvalue = [1, 0] \nclass = malignant \n \n\n\n4->6 \n \n \n \n\n\n8 \n \nworst texture <= 33.35 \nsamples = 243 \nvalue = [1, 242] \nclass = benign \n \n\n\n7->8 \n \n \n \n\n\n13 \n \nmean concavity <= 0.029 \nsamples = 4 \nvalue = [1, 3] \nclass = benign \n \n\n\n7->13 \n \n \n \n\n\n9 \n \nsamples = 225 \nvalue = [0, 225] \nclass = benign \n \n\n\n8->9 \n \n \n \n\n\n10 \n \nworst texture <= 33.8 \nsamples 
= 18 \nvalue = [1, 17] \nclass = benign \n \n\n\n8->10 \n \n \n \n\n\n11 \n \nsamples = 1 \nvalue = [1, 0] \nclass = malignant \n \n\n\n10->11 \n \n \n \n\n\n12 \n \nsamples = 17 \nvalue = [0, 17] \nclass = benign \n \n\n\n10->12 \n \n \n \n\n\n14 \n \nsamples = 1 \nvalue = [1, 0] \nclass = malignant \n \n\n\n13->14 \n \n \n \n\n\n15 \n \nsamples = 3 \nvalue = [0, 3] \nclass = benign \n \n\n\n13->15 \n \n \n \n\n\n18 \n \nworst area <= 817.1 \nsamples = 12 \nvalue = [3, 9] \nclass = benign \n \n\n\n17->18 \n \n \n \n\n\n23 \n \nworst symmetry <= 0.268 \nsamples = 20 \nvalue = [18, 2] \nclass = malignant \n \n\n\n17->23 \n \n \n \n\n\n19 \n \nmean smoothness <= 0.123 \nsamples = 10 \nvalue = [1, 9] \nclass = benign \n \n\n\n18->19 \n \n \n \n\n\n22 \n \nsamples = 2 \nvalue = [2, 0] \nclass = malignant \n \n\n\n18->22 \n \n \n \n\n\n20 \n \nsamples = 9 \nvalue = [0, 9] \nclass = benign \n \n\n\n19->20 \n \n \n \n\n\n21 \n \nsamples = 1 \nvalue = [1, 0] \nclass = malignant \n \n\n\n19->21 \n \n \n \n\n\n24 \n \nfractal dimension error <= 0.002 \nsamples = 3 \nvalue = [1, 2] \nclass = benign \n \n\n\n23->24 \n \n \n \n\n\n27 \n \nsamples = 17 \nvalue = [17, 0] \nclass = malignant \n \n\n\n23->27 \n \n \n \n\n\n25 \n \nsamples = 1 \nvalue = [1, 0] \nclass = malignant \n \n\n\n24->25 \n \n \n \n\n\n26 \n \nsamples = 2 \nvalue = [0, 2] \nclass = benign \n \n\n\n24->26 \n \n \n \n\n\n29 \n \nsamples = 5 \nvalue = [0, 5] \nclass = benign \n \n\n\n28->29 \n \n \n \n\n\n30 \n \nworst concavity <= 0.191 \nsamples = 137 \nvalue = [134, 3] \nclass = malignant \n \n\n\n28->30 \n \n \n \n\n\n31 \n \nworst texture <= 30.975 \nsamples = 5 \nvalue = [2, 3] \nclass = benign \n \n\n\n30->31 \n \n \n \n\n\n34 \n \nsamples = 132 \nvalue = [132, 0] \nclass = malignant \n \n\n\n30->34 \n \n \n \n\n\n32 \n \nsamples = 3 \nvalue = [0, 3] \nclass = benign \n \n\n\n31->32 \n \n \n \n\n\n33 \n \nsamples = 2 \nvalue = [2, 0] \nclass = malignant \n \n\n\n31->33 \n \n \n \n \n 
\n"},"metadata":{}}]},{"cell_type":"code","source":"print(\"Feature importances:\")\nprint(tree.feature_importances_)","metadata":{"execution":{"iopub.status.busy":"2022-12-01T13:22:27.114215Z","iopub.execute_input":"2022-12-01T13:22:27.114789Z","iopub.status.idle":"2022-12-01T13:22:27.123148Z","shell.execute_reply.started":"2022-12-01T13:22:27.114749Z","shell.execute_reply":"2022-12-01T13:22:27.121501Z"},"trusted":true},"execution_count":13,"outputs":[{"name":"stdout","text":"Feature importances:\n[0. 0.00752597 0. 0. 0.00903116 0.\n 0.00752597 0. 0. 0. 0.00975731 0.04630969\n 0. 0.00238745 0.00231135 0. 0. 0.\n 0. 0.00668975 0.69546322 0.05383211 0. 0.01354675\n 0. 0. 0.01740312 0.11684357 0.01137258 0. ]\n","output_type":"stream"}]},{"cell_type":"code","source":"import matplotlib.pyplot as plt\n#bar chart of feature importances\ndef plot_feature_importances_cancer(model):\n n_features = cancer.data.shape[1]\n plt.figure(figsize=(8,20))\n plt.barh(np.arange(n_features), model.feature_importances_, align='center')\n plt.yticks(np.arange(n_features), cancer.feature_names)\n plt.xlabel(\"Feature importance\")\n plt.ylabel(\"Feature\")\n plt.ylim(-1, n_features)\n plt.savefig('testfig.png',dpi=300, bbox_inches = \"tight\")\n\nplot_feature_importances_cancer(tree)","metadata":{"execution":{"iopub.status.busy":"2022-12-01T13:22:29.532843Z","iopub.execute_input":"2022-12-01T13:22:29.533165Z","iopub.status.idle":"2022-12-01T13:22:31.008940Z","shell.execute_reply.started":"2022-12-01T13:22:29.533139Z","shell.execute_reply":"2022-12-01T13:22:31.008108Z"},"trusted":true},"execution_count":14,"outputs":[{"output_type":"display_data","data":{"text/plain":"","image/png":"\n"},"metadata":{"needs_background":"light"}}]},{"cell_type":"markdown","source":"**We will now look at cost complexity pruning (post-pruning), which can reduce the size of the tree and overfitting.** We start by computing the pruning path of our tree: the effective alphas at which subtrees get pruned, together with the corresponding total leaf impurities. Refitting the tree with one of these values as `ccp_alpha` gives the pruned tree.","metadata":{}},{"cell_type":"code","source":"path = tree.cost_complexity_pruning_path(X_train, y_train)\nccp_alphas, 
impurities = path.ccp_alphas, path.impurities","metadata":{"execution":{"iopub.status.busy":"2022-12-01T13:22:36.829875Z","iopub.execute_input":"2022-12-01T13:22:36.830481Z","iopub.status.idle":"2022-12-01T13:22:36.846093Z","shell.execute_reply.started":"2022-12-01T13:22:36.830450Z","shell.execute_reply":"2022-12-01T13:22:36.843969Z"},"trusted":true},"execution_count":15,"outputs":[]},{"cell_type":"code","source":"print(ccp_alphas)","metadata":{"execution":{"iopub.status.busy":"2022-12-01T13:22:39.720460Z","iopub.execute_input":"2022-12-01T13:22:39.720843Z","iopub.status.idle":"2022-12-01T13:22:39.733881Z","shell.execute_reply.started":"2022-12-01T13:22:39.720804Z","shell.execute_reply":"2022-12-01T13:22:39.730426Z"},"trusted":true},"execution_count":16,"outputs":[{"name":"stdout","text":"[0. 0.00231936 0.00312989 0.00422535 0.00456509 0.00532081\n 0.0056338 0.00633803 0.00814228 0.01487676 0.02166662 0.05466684\n 0.32538187]\n","output_type":"stream"}]},{"cell_type":"markdown","source":"## Model hosting using skops 🤗\nWe will now initialize a repository and save a model and a model card in it. 
","metadata":{}},{"cell_type":"code","source":"from skops import hub_utils, card\nimport os\nimport joblib\n\n# create a directory to initialize our repo\nlocal_repo = \"./model_dir\"\n# save the model\npkl_path = \"./model.pkl\"\njoblib.dump(tree, pkl_path)\n\n# initialize the repository \nhub_utils.init(model=pkl_path, \n task=\"tabular-classification\",\n requirements=[\"scikit-learn\"], \n dst=local_repo,\n data=X_train)\n\n# see what's inside the repository\nprint(os.listdir(local_repo))","metadata":{"execution":{"iopub.status.busy":"2022-12-01T13:23:19.683597Z","iopub.execute_input":"2022-12-01T13:23:19.684289Z","iopub.status.idle":"2022-12-01T13:23:19.694628Z","shell.execute_reply.started":"2022-12-01T13:23:19.684241Z","shell.execute_reply":"2022-12-01T13:23:19.693726Z"},"trusted":true},"execution_count":19,"outputs":[{"name":"stdout","text":"['config.json', 'model.pkl']\n","output_type":"stream"}]},{"cell_type":"markdown","source":"We will now initialize a model card and add information.","metadata":{}},{"cell_type":"code","source":"from pathlib import Path\nmodel_card = card.Card(tree, metadata=card.metadata_from_config(Path(local_repo)))","metadata":{"execution":{"iopub.status.busy":"2022-12-01T13:23:23.949248Z","iopub.execute_input":"2022-12-01T13:23:23.949851Z","iopub.status.idle":"2022-12-01T13:23:23.956738Z","shell.execute_reply.started":"2022-12-01T13:23:23.949805Z","shell.execute_reply":"2022-12-01T13:23:23.955196Z"},"trusted":true},"execution_count":20,"outputs":[]},{"cell_type":"code","source":"description = \"This is a Decision Tree Classifier trained on breast cancer dataset and pruned with CCP.\"\nlimitations = \"This model is trained for educational purposes.\"\nmodel_card.add(model_description = description,\n limitations = 
limitations)\n","metadata":{"execution":{"iopub.status.busy":"2022-12-01T13:23:25.733610Z","iopub.execute_input":"2022-12-01T13:23:25.733946Z","iopub.status.idle":"2022-12-01T13:23:25.742664Z","shell.execute_reply.started":"2022-12-01T13:23:25.733916Z","shell.execute_reply":"2022-12-01T13:23:25.741153Z"},"trusted":true},"execution_count":21,"outputs":[{"execution_count":21,"output_type":"execute_result","data":{"text/plain":"Card(\n model=DecisionTreeClassifier(random_state=0),\n metadata.library_name=sklearn,\n metadata.tags=['sklearn', 'skops', 'tabular-classification'],\n metadata.model_file=model.pkl,\n metadata.widget={...},\n model_description='This is a Decisi...cancer dataset and pruned with CCP.',\n limitations='This model is trained for educational purposes.',\n)"},"metadata":{}}]},{"cell_type":"markdown","source":"We will add the plots we've visualized above.","metadata":{}},{"cell_type":"code","source":"# save feature importance bar chart\nplot_feature_importances_cancer(tree)\nplt.savefig(Path(local_repo) / 'feature_importances.png')\n# save graph\ngraph.write_png(Path(local_repo) / 'tree.png')\n\n# write the plots to model card\nmodel_card.add_plot(**{\"Feature Importances\": 'feature_importances.png',\n \"Tree Splits\": 'tree.png'})","metadata":{"_kg_hide-output":true,"execution":{"iopub.status.busy":"2022-12-01T13:23:30.323398Z","iopub.execute_input":"2022-12-01T13:23:30.323754Z","iopub.status.idle":"2022-12-01T13:23:32.254691Z","shell.execute_reply.started":"2022-12-01T13:23:30.323721Z","shell.execute_reply":"2022-12-01T13:23:32.253135Z"},"trusted":true},"execution_count":22,"outputs":[{"execution_count":22,"output_type":"execute_result","data":{"text/plain":"Card(\n model=DecisionTreeClassifier(random_state=0),\n metadata.library_name=sklearn,\n metadata.tags=['sklearn', 'skops', 'tabular-classification'],\n metadata.model_file=model.pkl,\n metadata.widget={...},\n model_description='This is a Decisi...cancer dataset and pruned with CCP.',\n 
limitations='This model is trained for educational purposes.',\n Feature Importances='feature_importances.png',\n Tree Splits='tree.png',\n)"},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"","image/png":"\n"},"metadata":{"needs_background":"light"}}]},{"cell_type":"markdown","source":"We can also add evaluation metrics and a confusion matrix plot to the model card.","metadata":{}},{"cell_type":"code","source":"from sklearn.metrics import (\n ConfusionMatrixDisplay,\n accuracy_score,\n classification_report,\n confusion_matrix,\n f1_score,\n)\n# add metrics to our model card\naccuracy = accuracy_score(y_test, y_pred)\nf1 = f1_score(y_test, y_pred, average=\"micro\")\nmodel_card.add_metrics(**{\"accuracy\": accuracy, \"f1 score\": f1})","metadata":{"execution":{"iopub.status.busy":"2022-12-01T13:23:41.422503Z","iopub.execute_input":"2022-12-01T13:23:41.422904Z","iopub.status.idle":"2022-12-01T13:23:41.436927Z","shell.execute_reply.started":"2022-12-01T13:23:41.422867Z","shell.execute_reply":"2022-12-01T13:23:41.435224Z"},"trusted":true},"execution_count":23,"outputs":[{"execution_count":23,"output_type":"execute_result","data":{"text/plain":"Card(\n model=DecisionTreeClassifier(random_state=0),\n metadata.library_name=sklearn,\n metadata.tags=['sklearn', 'skops', 'tabular-classification'],\n metadata.model_file=model.pkl,\n metadata.widget={...},\n model_description='This is a Decisi...cancer dataset and pruned with CCP.',\n limitations='This model is trained for educational purposes.',\n Feature Importances='feature_importances.png',\n Tree Splits='tree.png',\n)"},"metadata":{}}]},{"cell_type":"code","source":"cm = confusion_matrix(y_test, y_pred, labels=tree.classes_)\ndisp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=tree.classes_)\ndisp.plot()\n# save the figure to repo\ndisp.figure_.savefig(Path(local_repo) / \"confusion_matrix.png\")\n# write the figure to model card\nmodel_card.add_plot(**{\"Confusion Matrix\": 
\"confusion_matrix.png\"})","metadata":{"_kg_hide-output":true,"execution":{"iopub.status.busy":"2022-12-01T13:23:44.711392Z","iopub.execute_input":"2022-12-01T13:23:44.711727Z","iopub.status.idle":"2022-12-01T13:23:44.947598Z","shell.execute_reply.started":"2022-12-01T13:23:44.711697Z","shell.execute_reply":"2022-12-01T13:23:44.945571Z"},"trusted":true},"execution_count":24,"outputs":[{"execution_count":24,"output_type":"execute_result","data":{"text/plain":"Card(\n model=DecisionTreeClassifier(random_state=0),\n metadata.library_name=sklearn,\n metadata.tags=['sklearn', 'skops', 'tabular-classification'],\n metadata.model_file=model.pkl,\n metadata.widget={...},\n model_description='This is a Decisi...cancer dataset and pruned with CCP.',\n limitations='This model is trained for educational purposes.',\n Feature Importances='feature_importances.png',\n Tree Splits='tree.png',\n Confusion Matrix='confusion_matrix.png',\n)"},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"","image/png":"\n"},"metadata":{"needs_background":"light"}}]},{"cell_type":"markdown","source":"We can now save the model card and push our repository to 🤗Hub!","metadata":{}},{"cell_type":"code","source":"model_card.save(Path(local_repo) / \"README.md\")","metadata":{"execution":{"iopub.status.busy":"2022-12-01T13:23:53.269801Z","iopub.execute_input":"2022-12-01T13:23:53.270182Z","iopub.status.idle":"2022-12-01T13:23:53.327379Z","shell.execute_reply.started":"2022-12-01T13:23:53.270150Z","shell.execute_reply":"2022-12-01T13:23:53.326180Z"},"trusted":true},"execution_count":25,"outputs":[]},{"cell_type":"markdown","source":"We will now push the model to 🤗Hub. For this, we firstly need to authenticate ourselves. 
Then, we can push our model!","metadata":{}},{"cell_type":"code","source":"from huggingface_hub import notebook_login\nnotebook_login()","metadata":{"execution":{"iopub.status.busy":"2022-12-01T13:23:54.521519Z","iopub.execute_input":"2022-12-01T13:23:54.521921Z","iopub.status.idle":"2022-12-01T13:23:54.578232Z","shell.execute_reply.started":"2022-12-01T13:23:54.521883Z","shell.execute_reply":"2022-12-01T13:23:54.576764Z"},"trusted":true},"execution_count":26,"outputs":[{"output_type":"display_data","data":{"text/plain":"VBox(children=(HTML(value='