ultima versao

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

create_alloy_model.ipynb +256 -0
database/tawos/README.md +48 -0
database/tawos/deep/ALOY_deep-se.csv +0 -0
database/tawos/deep/APSTUD_deep-se.csv +0 -0
database/tawos/deep/CLI_deep-se.csv +0 -0
database/tawos/deep/CLOV_deep-se.csv +0 -0
database/tawos/deep/COMPASS_deep-se.csv +0 -0
database/tawos/deep/CONFCLOUD_deep-se.csv +0 -0
database/tawos/deep/CONFSERVER_deep-se.csv +0 -0
database/tawos/deep/DAEMON_deep-se.csv +0 -0
database/tawos/deep/DM_deep-se.csv +0 -0
database/tawos/deep/DNN_deep-se.csv +0 -0
database/tawos/deep/DURACLOUD_deep-se.csv +0 -0
database/tawos/deep/EVG_deep-se.csv +0 -0
database/tawos/deep/FAB_deep-se.csv +0 -0
database/tawos/deep/MDL_deep-se.csv +0 -0
database/tawos/deep/MESOS_deep-se.csv +0 -0
database/tawos/deep/MULE_deep-se.csv +0 -0
database/tawos/deep/NEXUS_deep-se.csv +0 -0
database/tawos/deep/SERVER_deep-se.csv +0 -0
database/tawos/deep/STL_deep-se.csv +0 -0
database/tawos/deep/TIDOC_deep-se.csv +0 -0
database/tawos/deep/TIMOB_deep-se.csv +0 -0
database/tawos/deep/TISTUD_deep-se.csv +0 -0
database/tawos/deep/XD_deep-se.csv +0 -0
database/tawos/tfidf/ALOY_tfidf-se.csv +0 -0
database/tawos/tfidf/APSTUD_tfidf-se.csv +0 -0
database/tawos/tfidf/CLI_tfidf-se.csv +0 -0
database/tawos/tfidf/CLOV_tfidf-se.csv +0 -0
database/tawos/tfidf/COMPASS_tfidf-se.csv +0 -0
database/tawos/tfidf/CONFCLOUD_tfidf-se.csv +0 -0
database/tawos/tfidf/CONFSERVER_tfidf-se.csv +0 -0
database/tawos/tfidf/DAEMON_tfidf-se.csv +0 -0
database/tawos/tfidf/DM_tfidf-se.csv +0 -0
database/tawos/tfidf/DNN_tfidf-se.csv +0 -0
database/tawos/tfidf/DURACLOUD_tfidf-se.csv +0 -0
database/tawos/tfidf/EVG_tfidf-se.csv +0 -0
database/tawos/tfidf/FAB_tfidf-se.csv +0 -0
database/tawos/tfidf/MDL_tfidf-se.csv +0 -0
database/tawos/tfidf/MESOS_tfidf-se.csv +0 -0
database/tawos/tfidf/MULE_tfidf-se.csv +0 -0
database/tawos/tfidf/NEXUS_tfidf-se.csv +0 -0
database/tawos/tfidf/SERVER_tfidf-se.csv +0 -0
database/tawos/tfidf/STL_tfidf-se.csv +0 -0
database/tawos/tfidf/TIDOC_tfidf-se.csv +0 -0
database/tawos/tfidf/TIMOB_tfidf-se.csv +0 -0
database/tawos/tfidf/TISTUD_tfidf-se.csv +0 -0
database/tawos/tfidf/XD_tfidf-se.csv +0 -0
model_tawos_aloy_mbr.joblib → models/tawos/aloy/model_tawos_aloy_mbr.joblib +1 -1
model_tawos_aloy_neosp.joblib → models/tawos/aloy/model_tawos_aloy_median.joblib +2 -2

create_alloy_model.ipynb ADDED Viewed

	@@ -0,0 +1,256 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "FileNotFoundError",
+     "evalue": "[Errno 2] No such file or directory: 'database\\\\tawos\\\\deep\\\\ALOY_deep-se.csv'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[1;31mFileNotFoundError\u001b[0m                         Traceback (most recent call last)",
+      "\u001b[1;32mc:\\Projetos\\model_effort_tawos\\model_creators\\create_alloy_model.ipynb Cell 1\u001b[0m in \u001b[0;36m2\n\u001b[0;32m      <a href='vscode-notebook-cell:/c%3A/Projetos/model_effort_tawos/model_creators/create_alloy_model.ipynb#W0sZmlsZQ%3D%3D?line=0'>1</a>\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mpandas\u001b[39;00m \u001b[39mas\u001b[39;00m \u001b[39mpd\u001b[39;00m\n\u001b[1;32m----> <a href='vscode-notebook-cell:/c%3A/Projetos/model_effort_tawos/model_creators/create_alloy_model.ipynb#W0sZmlsZQ%3D%3D?line=1'>2</a>\u001b[0m df \u001b[39m=\u001b[39m pd\u001b[39m.\u001b[39;49mread_csv(\u001b[39m\"\u001b[39;49m\u001b[39mdatabase\u001b[39;49m\u001b[39m\\\\\u001b[39;49;00m\u001b[39mtawos\u001b[39;49m\u001b[39m\\\\\u001b[39;49;00m\u001b[39mdeep\u001b[39;49m\u001b[39m\\\\\u001b[39;49;00m\u001b[39mALOY_deep-se.csv\u001b[39;49m\u001b[39m\"\u001b[39;49m)\n\u001b[0;32m      <a href='vscode-notebook-cell:/c%3A/Projetos/model_effort_tawos/model_creators/create_alloy_model.ipynb#W0sZmlsZQ%3D%3D?line=2'>3</a>\u001b[0m df\u001b[39m.\u001b[39minfo()\n",
+      "File \u001b[1;32mc:\\Python310\\lib\\site-packages\\pandas\\io\\parsers\\readers.py:912\u001b[0m, in \u001b[0;36mread_csv\u001b[1;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)\u001b[0m\n\u001b[0;32m    899\u001b[0m kwds_defaults \u001b[39m=\u001b[39m _refine_defaults_read(\n\u001b[0;32m    900\u001b[0m     dialect,\n\u001b[0;32m    901\u001b[0m     delimiter,\n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m    908\u001b[0m     dtype_backend\u001b[39m=\u001b[39mdtype_backend,\n\u001b[0;32m    909\u001b[0m )\n\u001b[0;32m    910\u001b[0m kwds\u001b[39m.\u001b[39mupdate(kwds_defaults)\n\u001b[1;32m--> 912\u001b[0m \u001b[39mreturn\u001b[39;00m _read(filepath_or_buffer, kwds)\n",
+      "File \u001b[1;32mc:\\Python310\\lib\\site-packages\\pandas\\io\\parsers\\readers.py:577\u001b[0m, in \u001b[0;36m_read\u001b[1;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[0;32m    574\u001b[0m _validate_names(kwds\u001b[39m.\u001b[39mget(\u001b[39m\"\u001b[39m\u001b[39mnames\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39mNone\u001b[39;00m))\n\u001b[0;32m    576\u001b[0m \u001b[39m# Create the parser.\u001b[39;00m\n\u001b[1;32m--> 577\u001b[0m parser \u001b[39m=\u001b[39m TextFileReader(filepath_or_buffer, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwds)\n\u001b[0;32m    579\u001b[0m \u001b[39mif\u001b[39;00m chunksize \u001b[39mor\u001b[39;00m iterator:\n\u001b[0;32m    580\u001b[0m     \u001b[39mreturn\u001b[39;00m parser\n",
+      "File \u001b[1;32mc:\\Python310\\lib\\site-packages\\pandas\\io\\parsers\\readers.py:1407\u001b[0m, in \u001b[0;36mTextFileReader.__init__\u001b[1;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[0;32m   1404\u001b[0m     \u001b[39mself\u001b[39m\u001b[39m.\u001b[39moptions[\u001b[39m\"\u001b[39m\u001b[39mhas_index_names\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m kwds[\u001b[39m\"\u001b[39m\u001b[39mhas_index_names\u001b[39m\u001b[39m\"\u001b[39m]\n\u001b[0;32m   1406\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mhandles: IOHandles \u001b[39m|\u001b[39m \u001b[39mNone\u001b[39;00m \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m\n\u001b[1;32m-> 1407\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_engine \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_make_engine(f, \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mengine)\n",
+      "File \u001b[1;32mc:\\Python310\\lib\\site-packages\\pandas\\io\\parsers\\readers.py:1661\u001b[0m, in \u001b[0;36mTextFileReader._make_engine\u001b[1;34m(self, f, engine)\u001b[0m\n\u001b[0;32m   1659\u001b[0m     \u001b[39mif\u001b[39;00m \u001b[39m\"\u001b[39m\u001b[39mb\u001b[39m\u001b[39m\"\u001b[39m \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m mode:\n\u001b[0;32m   1660\u001b[0m         mode \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mb\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m-> 1661\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mhandles \u001b[39m=\u001b[39m get_handle(\n\u001b[0;32m   1662\u001b[0m     f,\n\u001b[0;32m   1663\u001b[0m     mode,\n\u001b[0;32m   1664\u001b[0m     encoding\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49moptions\u001b[39m.\u001b[39;49mget(\u001b[39m\"\u001b[39;49m\u001b[39mencoding\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39mNone\u001b[39;49;00m),\n\u001b[0;32m   1665\u001b[0m     compression\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49moptions\u001b[39m.\u001b[39;49mget(\u001b[39m\"\u001b[39;49m\u001b[39mcompression\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39mNone\u001b[39;49;00m),\n\u001b[0;32m   1666\u001b[0m     memory_map\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49moptions\u001b[39m.\u001b[39;49mget(\u001b[39m\"\u001b[39;49m\u001b[39mmemory_map\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39mFalse\u001b[39;49;00m),\n\u001b[0;32m   1667\u001b[0m     is_text\u001b[39m=\u001b[39;49mis_text,\n\u001b[0;32m   1668\u001b[0m     errors\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49moptions\u001b[39m.\u001b[39;49mget(\u001b[39m\"\u001b[39;49m\u001b[39mencoding_errors\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39m\"\u001b[39;49m\u001b[39mstrict\u001b[39;49m\u001b[39m\"\u001b[39;49m),\n\u001b[0;32m   1669\u001b[0m     
storage_options\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49moptions\u001b[39m.\u001b[39;49mget(\u001b[39m\"\u001b[39;49m\u001b[39mstorage_options\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39mNone\u001b[39;49;00m),\n\u001b[0;32m   1670\u001b[0m )\n\u001b[0;32m   1671\u001b[0m \u001b[39massert\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mhandles \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m\n\u001b[0;32m   1672\u001b[0m f \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mhandles\u001b[39m.\u001b[39mhandle\n",
+      "File \u001b[1;32mc:\\Python310\\lib\\site-packages\\pandas\\io\\common.py:859\u001b[0m, in \u001b[0;36mget_handle\u001b[1;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[0;32m    854\u001b[0m \u001b[39melif\u001b[39;00m \u001b[39misinstance\u001b[39m(handle, \u001b[39mstr\u001b[39m):\n\u001b[0;32m    855\u001b[0m     \u001b[39m# Check whether the filename is to be opened in binary mode.\u001b[39;00m\n\u001b[0;32m    856\u001b[0m     \u001b[39m# Binary mode does not support 'encoding' and 'newline'.\u001b[39;00m\n\u001b[0;32m    857\u001b[0m     \u001b[39mif\u001b[39;00m ioargs\u001b[39m.\u001b[39mencoding \u001b[39mand\u001b[39;00m \u001b[39m\"\u001b[39m\u001b[39mb\u001b[39m\u001b[39m\"\u001b[39m \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m ioargs\u001b[39m.\u001b[39mmode:\n\u001b[0;32m    858\u001b[0m         \u001b[39m# Encoding\u001b[39;00m\n\u001b[1;32m--> 859\u001b[0m         handle \u001b[39m=\u001b[39m \u001b[39mopen\u001b[39;49m(\n\u001b[0;32m    860\u001b[0m             handle,\n\u001b[0;32m    861\u001b[0m             ioargs\u001b[39m.\u001b[39;49mmode,\n\u001b[0;32m    862\u001b[0m             encoding\u001b[39m=\u001b[39;49mioargs\u001b[39m.\u001b[39;49mencoding,\n\u001b[0;32m    863\u001b[0m             errors\u001b[39m=\u001b[39;49merrors,\n\u001b[0;32m    864\u001b[0m             newline\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39m\"\u001b[39;49m,\n\u001b[0;32m    865\u001b[0m         )\n\u001b[0;32m    866\u001b[0m     \u001b[39melse\u001b[39;00m:\n\u001b[0;32m    867\u001b[0m         \u001b[39m# Binary mode\u001b[39;00m\n\u001b[0;32m    868\u001b[0m         handle \u001b[39m=\u001b[39m \u001b[39mopen\u001b[39m(handle, ioargs\u001b[39m.\u001b[39mmode)\n",
+      "\u001b[1;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'database\\\\tawos\\\\deep\\\\ALOY_deep-se.csv'"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "df = pd.read_csv(\"database\\\\tawos\\\\deep\\\\ALOY_deep-se.csv\")\n",
+    "df.info()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Pré-Processamento"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import re\n",
+    "from string import punctuation\n",
+    "\n",
+    "def escape_tags_and_content(text):\n",
+    "    \"\"\"Escape tags and their content containing text, which is not written in natural language, such as code snippets\"\"\"\n",
+    "\n",
+    "    NO_TEXT_TAGS = \"code\", \"noformat\"\n",
+    "    for tag in NO_TEXT_TAGS:\n",
+    "        regex_matching_tag = re.compile(\"\\{%s(.*?)\\}(.*?)\\{%s\\}\" % (tag, tag), re.DOTALL)\n",
+    "        text = re.sub(regex_matching_tag, \"\", text)\n",
+    "\n",
+    "    return text\n",
+    "\n",
+    "def escape_tags(text):\n",
+    "    \"\"\"Escape markup tags, but retain their content\"\"\"\n",
+    "\n",
+    "    ESCAPE_TAGS = \"color\", \"quote\", \"anchor\", \"panel\"\n",
+    "    for tag in  ESCAPE_TAGS:\n",
+    "        text = re.sub(\"\\{%s(.*?)\\}\" % tag, \"\", text)\n",
+    "\n",
+    "    return text\n",
+    "\n",
+    "def escape_strings(text):\n",
+    "    \"\"\"Escape line breaks, tabulators, slashes and JIRA heading markup symbols\"\"\"\n",
+    "\n",
+    "    ESCAPE_STRINGS = \"\\\\r\", \"\\\\n\", \"\\\\t\", \"\\\\f\", \"\\\\v\", \"\\\"\", \"\\\\\\\\\", \"h1. \", \"h2. \", \"h3. \", \"h4. \", \"h5. \", \"h6. \"\n",
+    "    for escape_string in ESCAPE_STRINGS:\n",
+    "        text = text.replace(escape_string, \" \")\n",
+    "\n",
+    "    return text\n",
+    "\n",
+    "def escape_links(text):\n",
+    "    \"\"\"Escape external and internal links, recognized by JIRA markup or leading 'http://' or 'https://' \"\"\"\n",
+    "\n",
+    "    LINK_STARTERS = r\"\\#\", r\"\\^\", r\"http\\:\\/\\/\", r\"https\\:\\/\\/\", r\"malto\\:\", r\"file\\:\", r\"\\~\"\n",
+    "    for link_starter in LINK_STARTERS:\n",
+    "        text = re.sub(\"\\[(.*?\\\\|)?%s(.*?)\\]\" % link_starter, \"\", text)\n",
+    "        text = re.sub(r\"\\bhttps?://\\S+\", \"\", text)\n",
+    "\n",
+    "    return text\n",
+    "\n",
+    "def escape_hex_character_codes(text):\n",
+    "    \"\"\"Escape characters outside the latin alphabet which are converted to hex code representation\"\"\"\n",
+    "\n",
+    "    return re.sub(r\"\\\\x\\w\\w\", \"\", text)\n",
+    "\n",
+    "def escape_punctuation_boundaries(text):\n",
+    "    \"\"\"Remove all punctuation marks from the beginning and end of words,\n",
+    "    except for trailing period at the end of words\"\"\"\n",
+    "\n",
+    "    return \" \".join([word.strip(punctuation.replace(\".\", \"\")).lstrip(\".\") for word in text.split()])\n",
+    "\n",
+    "def escape_odd_spaces(text):\n",
+    "    \"\"\"Replace several consequent spaces with one space\n",
+    "    and remove spaces from string start and end\"\"\"\n",
+    "\n",
+    "    text = re.sub(r\"\\s+\", \" \", text)\n",
+    "    text = text.strip()\n",
+    "\n",
+    "    return text"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Criação do Modelo"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['models/tawos/aloy/model_tawos_aloy_tfidf_linear.joblib']"
+      ]
+     },
+     "execution_count": 18,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from sklearn.dummy import DummyRegressor\n",
+    "from nltk.corpus import stopwords\n",
+    "from textblob import TextBlob\n",
+    "import textstat\n",
+    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+    "from sklearn import svm\n",
+    "from sklearn.linear_model import LinearRegression\n",
+    "from sklearn.feature_selection import SelectKBest\n",
+    "import pandas as pd\n",
+    "from joblib import dump\n",
+    "\n",
+    "# carregando os dados\n",
+    "df = pd.read_csv(\"database\\\\tawos\\\\deep\\\\ALOY_deep-se.csv\")\n",
+    "\n",
+    "# criação de uma nova coluna\n",
+    "df[\"context\"] = df[\"title\"] + df[\"description\"]\n",
+    "\n",
+    "# pré-processamento\n",
+    "df[\"context\"] = df[\"context\"].apply(lambda x: escape_tags_and_content(x))\n",
+    "df[\"context\"] = df[\"context\"].apply(lambda x: escape_tags(x))\n",
+    "df[\"context\"] = df[\"context\"].apply(lambda x: escape_strings(x))\n",
+    "df[\"context\"] = df[\"context\"].apply(lambda x: escape_links(x))\n",
+    "df[\"context\"] = df[\"context\"].apply(lambda x: escape_hex_character_codes(x))\n",
+    "df[\"context\"] = df[\"context\"].apply(lambda x: escape_punctuation_boundaries(x))\n",
+    "df[\"context\"] = df[\"context\"].apply(lambda x: escape_odd_spaces(x))\n",
+    "\n",
+    "# removendo stop-words\n",
+    "stop = stopwords.words('english')\n",
+    "df['context'] = df['context'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))\n",
+    "\n",
+    "# renomeando as colunas porque senão dá um problema com a extração de features do NEOSP\n",
+    "df = df.rename(columns={ \"issuekey\": \"issuekey_\", \"created\": \"created_\", \"description\": \"description_\", \"title\": \"title_\", \"context\": \"context_\", \"storypoint\": \"storypoint_\"})\n",
+    "y = df[\"storypoint_\"]\n",
+    "df = df.drop(columns=['storypoint_'])\n",
+    "\n",
+    "# 5º coluna -> extração das features para o neosp\n",
+    "df[\"gunning_fog_\"] = df['context_'].apply(textstat.gunning_fog)\n",
+    "df[\"flesch_reading_ease_\"] = df['context_'].apply(textstat.flesch_reading_ease)\n",
+    "df[\"flesch_kincaid_grade_\"] = df['context_'].apply(textstat.flesch_kincaid_grade)\n",
+    "df[\"smog_index_\"] = df['context_'].apply(textstat.smog_index)\n",
+    "df[\"coleman_liau_index_\"] = df['context_'].apply(textstat.coleman_liau_index)\n",
+    "df[\"automated_readability_index_\"] = df['context_'].apply(textstat.automated_readability_index)\n",
+    "df[\"dale_chall_readability_score_\"] = df['context_'].apply(textstat.dale_chall_readability_score)\n",
+    "df[\"difficult_words_\"] = df['context_'].apply(textstat.difficult_words)\n",
+    "df[\"linsear_write_formula_\"] = df['context_'].apply(textstat.linsear_write_formula)\n",
+    "df[\"polarity_\"] = df[\"context_\"].apply(lambda x: TextBlob(x).sentiment.polarity)\n",
+    "df[\"subjectivity_\"] = df[\"context_\"].apply(lambda x: TextBlob(x).sentiment.subjectivity)\n",
+    "# 16º colunas\n",
+    "\n",
+    "# Extração das features para o TFIDF\n",
+    "vectorizer = TfidfVectorizer()\n",
+    "X_vec = vectorizer.fit_transform(df[\"context_\"])\n",
+    "\n",
+    "dump(vectorizer, \"models/tawos/aloy/vectorizer_tfidf.joblib\")\n",
+    "\n",
+    "df_vec = pd.DataFrame(data = X_vec.toarray(), columns = vectorizer.get_feature_names_out())\n",
+    "\n",
+    "# Juntando as features do neosp com o tfidf\n",
+    "df = df.join(df_vec)\n",
+    "X = df\n",
+    "\n",
+    "############ MbR\n",
+    "\n",
+    "model = DummyRegressor(strategy=\"mean\")\n",
+    "model.fit(X, y)\n",
+    "dump(model, \"models/tawos/aloy/model_tawos_aloy_mbr.joblib\")\n",
+    "\n",
+    "############ Mediana\n",
+    "\n",
+    "model = DummyRegressor(strategy=\"median\")\n",
+    "model.fit(X, y)\n",
+    "dump(model, \"models/tawos/aloy/model_tawos_aloy_median.joblib\")\n",
+    "\n",
+    "########### NEOSP-SVR\n",
+    "\n",
+    "model = svm.SVR()\n",
+    "model.fit(X[X.columns[5:16]], y)\n",
+    "dump(model, \"models/tawos/aloy/model_tawos_aloy_neosp_svr.joblib\")\n",
+    "\n",
+    "########### NEOSP-LR\n",
+    "\n",
+    "model = LinearRegression()\n",
+    "model.fit(X[X.columns[5:16]], y)\n",
+    "dump(model, \"models/tawos/aloy/model_tawos_aloy_neosp_linear.joblib\")\n",
+    "\n",
+    "############ TFIDF-SVM\n",
+    "\n",
+    "model = svm.SVR()\n",
+    "model.fit(X[X.columns[17:]], y)\n",
+    "dump(model, \"models/tawos/aloy/model_tawos_aloy_tfidf_svr.joblib\")\n",
+    "\n",
+    "############ TFIDF-LR\n",
+    "\n",
+    "model = LinearRegression()\n",
+    "model.fit(X[X.columns[17:]], y)\n",
+    "dump(model, \"models/tawos/aloy/model_tawos_aloy_tfidf_linear.joblib\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.11"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

database/tawos/README.md ADDED Viewed

	@@ -0,0 +1,48 @@

+# Tawosi Dataset
+This directory consists of 46 files, two files for each of the 26-3\*=23 projects:
+- 23 files with "\_deep-se" suffix are prepared to be used by Deep-SE.
+- 23 files with "\_tfidf-se" suffix are prepared to be used by TF/IDF-SE.
+\* <sub>One of the repositories including three projects has been removed from the public domain during the time that the manuscript for this study [1] was under revision. Therefore, although the paper reports the results for all 26 projects, the replication package includes 23 projects as we refrain from publishing the data for the three remaining projects in accordance with The General Data Protection Regulation.</sub>
+The data for these 23 projects were collected from 12 open source repositories by Tawosi et al. up until August 2020.
+The files are named after their project key as "[project key]\_[approach].csv", e.g. MESOS_deep-se.csv, which is the set of issues collected from the Mesos project in the Apache repository and contains the features that Deep-SE needs for prediction. The following table lists the projects and the repositories from which each project was collected.
+## Project list
+| Repository   | Project                           | Key        | File for Deep-SE          | File for TF/IDF-SE        |
+|--------------|-----------------------------------|------------|---------------------------|---------------------------|
+| Apache       | Mesos                             | MESOS      | MESOS_deep-se.csv         | MESOS_tfidf-se.csv        |
+| Apache       | Alloy                             | ALOY       | ALOY_deep-se.csv          | ALOY_tfidf-se.csv         |
+| Appcelerator | Appcelerator studio               | TISTUD     | TISTUD_deep-se.csv        | TISTUD_tfidf-se.csv       |
+| Appcelerator | Aptana studio                     | APSTUD     | APSTUD_deep-se.csv        | APSTUD_tfidf-se.csv       |
+| Appcelerator | Command-Line Interface            | CLI        | CLI_deep-se.csv           | CLI_tfidf-se.csv          |
+| Appcelerator | Daemon                            | DAEMON     | DAEMON_deep-se.csv        | DAEMON_tfidf-se.csv       |
+| Appcelerator | Documentation                     | TIDOC      | TIDOC_deep-se.csv         | TIDOC_tfidf-se.csv        |
+| Appcelerator | Titanium                          | TIMOB      | TIMOB_deep-se.csv         | TIMOB_tfidf-se.csv        |
+| Atlassian    | Clover                            | CLOV       | CLOV_deep-se.csv          | CLOV_tfidf-se.csv         |
+| Atlassian    | Confluence Cloud                  | CONFCLOUD  | CONFCLOUD_deep-se.csv     | CONFCLOUD_tfidf-se.csv    |
+| Atlassian    | Confluence Server and Data Center | CONFSERVER | CONFSERVER_deep-se.csv    | CONFSERVER_tfidf-se.csv   |
+| DNNSoftware  | DNN                               | DNN        | DNN_deep-se.csv           | DNN_tfidf-se.csv          |
+| Duraspace    | Duracloud                         | DURACLOUD  | DURACLOUD_deep-se.csv     | DURACLOUD_tfidf-se.csv    |
+| Hyperledger  | Fabric                            | FAB        | FAB_deep-se.csv           | FAB_tfidf-se.csv          |
+| Hyperledger  | Sawtooth                          | STL        | STL_deep-se.csv           | STL_tfidf-se.csv          |
+| Lsstcorp     | Data management                   | DM         | DM_deep-se.csv            | DM_tfidf-se.csv           |
+| MongoDB      | Compass                           | COMPASS    | COMPASS_deep-se.csv       | COMPASS_tfidf-se.csv      |
+| MongoDB      | Core Server                       | SERVER     | SERVER_deep-se.csv        | SERVER_tfidf-se.csv       |
+| MongoDB      | Evergreen                         | EVG        | EVG_deep-se.csv           | EVG_tfidf-se.csv          |
+| Moodle       | Moodle                            | MDL        | MDL_deep-se.csv           | MDL_tfidf-se.csv          |
+| Mulesoft     | Mule                              | MULE       | MULE_deep-se.csv          | MULE_tfidf-se.csv         |
+| Sonatype     | Sonatype’s Nexus                  | NEXUS      | NEXUS_deep-se.csv         | NEXUS_tfidf-se.csv        |
+| Spring       | Spring XD                         | XD         | XD_deep-se.csv            | XD_tfidf-se.csv           |
+## Content of the files
+- Each CSV file for the Deep-SE approach contains 5 columns: *issuekey*, *created*, *title*, *description*, and *storypoint*.
+- Each csv file for TF/IDF-SE approach contains more than 4 columns: starting with *issuekey*, *created*, *storypoint*, *context*, *codesnippet*, and a set of one-hot columns for issue type (header starting with t\_) followed by component(s) (header starting with c\_).
+- The issues are sorted by creation time (i.e. issues listed earlier were created before issues listed later).
+[1] Vali Tawosi, Rebecca Moussa, and Federica Sarro. "Agile Effort Estimation: Have We Solved the Problem Yet? Insights From A Replication Study." IEEE Transactions on Software Engineering, no. TBA (2022): pp. TBA.

database/tawos/deep/ALOY_deep-se.csv ADDED Viewed