giseldo committed on
Commit
00114fc
1 Parent(s): d10a474

latest version

create_APSTUD_model.ipynb ADDED
@@ -0,0 +1,262 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "<class 'pandas.core.frame.DataFrame'>\n",
+ "RangeIndex: 476 entries, 0 to 475\n",
+ "Data columns (total 5 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 issuekey 476 non-null object\n",
+ " 1 created 476 non-null object\n",
+ " 2 title 476 non-null object\n",
+ " 3 description 476 non-null object\n",
+ " 4 storypoint 476 non-null int64 \n",
+ "dtypes: int64(1), object(4)\n",
+ "memory usage: 18.7+ KB\n"
+ ]
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "project_name = \"APSTUD\"\n",
+ "\n",
+ "df = pd.read_csv(\"database\\\\tawos\\\\deep\\\\{}_deep-se.csv\".format(project_name))\n",
+ "\n",
+ "df.info()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Pré-Processamento"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import re\n",
+ "from string import punctuation\n",
+ "\n",
+ "def escape_tags_and_content(text):\n",
+ " \"\"\"Escape tags and their content containing text, which is not written in natural language, such as code snippets\"\"\"\n",
+ "\n",
+ " NO_TEXT_TAGS = \"code\", \"noformat\"\n",
+ " for tag in NO_TEXT_TAGS:\n",
+ " regex_matching_tag = re.compile(\"\\{%s(.*?)\\}(.*?)\\{%s\\}\" % (tag, tag), re.DOTALL)\n",
+ " text = re.sub(regex_matching_tag, \"\", text)\n",
+ "\n",
+ " return text\n",
+ "\n",
+ "def escape_tags(text):\n",
+ " \"\"\"Escape markup tags, but retain their content\"\"\"\n",
+ "\n",
+ " ESCAPE_TAGS = \"color\", \"quote\", \"anchor\", \"panel\"\n",
+ " for tag in ESCAPE_TAGS:\n",
+ " text = re.sub(\"\\{%s(.*?)\\}\" % tag, \"\", text)\n",
+ "\n",
+ " return text\n",
+ "\n",
+ "def escape_strings(text):\n",
+ " \"\"\"Escape line breaks, tabulators, slashes and JIRA heading markup symbols\"\"\"\n",
+ "\n",
+ " ESCAPE_STRINGS = \"\\\\r\", \"\\\\n\", \"\\\\t\", \"\\\\f\", \"\\\\v\", \"\\\"\", \"\\\\\\\\\", \"h1. \", \"h2. \", \"h3. \", \"h4. \", \"h5. \", \"h6. \"\n",
+ " for escape_string in ESCAPE_STRINGS:\n",
+ " text = text.replace(escape_string, \" \")\n",
+ "\n",
+ " return text\n",
+ "\n",
+ "def escape_links(text):\n",
+ " \"\"\"Escape external and internal links, recognized by JIRA markup or leading 'http://' or 'https://' \"\"\"\n",
+ "\n",
+ " LINK_STARTERS = r\"\\#\", r\"\\^\", r\"http\\:\\/\\/\", r\"https\\:\\/\\/\", r\"malto\\:\", r\"file\\:\", r\"\\~\"\n",
+ " for link_starter in LINK_STARTERS:\n",
+ " text = re.sub(\"\\[(.*?\\\\|)?%s(.*?)\\]\" % link_starter, \"\", text)\n",
+ " text = re.sub(r\"\\bhttps?://\\S+\", \"\", text)\n",
+ "\n",
+ " return text\n",
+ "\n",
+ "def escape_hex_character_codes(text):\n",
+ " \"\"\"Escape characters outside the latin alphabet which are converted to hex code representation\"\"\"\n",
+ "\n",
+ " return re.sub(r\"\\\\x\\w\\w\", \"\", text)\n",
+ "\n",
+ "def escape_punctuation_boundaries(text):\n",
+ " \"\"\"Remove all punctuation marks from the beginning and end of words,\n",
+ " except for trailing period at the end of words\"\"\"\n",
+ "\n",
+ " return \" \".join([word.strip(punctuation.replace(\".\", \"\")).lstrip(\".\") for word in text.split()])\n",
+ "\n",
+ "def escape_odd_spaces(text):\n",
+ " \"\"\"Replace several consequent spaces with one space\n",
+ " and remove spaces from string start and end\"\"\"\n",
+ "\n",
+ " text = re.sub(r\"\\s+\", \" \", text)\n",
+ " text = text.strip()\n",
+ "\n",
+ " return text"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Criação do Modelo"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['models/tawos/APSTUD/model_tawos_APSTUD_tfidf_linear.joblib']"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from sklearn.dummy import DummyRegressor\n",
+ "from nltk.corpus import stopwords\n",
+ "from textblob import TextBlob\n",
+ "import textstat\n",
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+ "from sklearn import svm\n",
+ "from sklearn.linear_model import LinearRegression\n",
+ "from sklearn.feature_selection import SelectKBest\n",
+ "import pandas as pd\n",
+ "from joblib import dump\n",
+ "\n",
+ "# carregando os dados\n",
+ "df = pd.read_csv(\"database\\\\tawos\\\\deep\\\\{}_deep-se.csv\".format(project_name))\n",
+ "\n",
+ "# criação de uma nova coluna\n",
+ "df[\"context\"] = df[\"title\"] + df[\"description\"]\n",
+ "\n",
+ "# pré-processamento\n",
+ "df[\"context\"] = df[\"context\"].apply(lambda x: escape_tags_and_content(x))\n",
+ "df[\"context\"] = df[\"context\"].apply(lambda x: escape_tags(x))\n",
+ "df[\"context\"] = df[\"context\"].apply(lambda x: escape_strings(x))\n",
+ "df[\"context\"] = df[\"context\"].apply(lambda x: escape_links(x))\n",
+ "df[\"context\"] = df[\"context\"].apply(lambda x: escape_hex_character_codes(x))\n",
+ "df[\"context\"] = df[\"context\"].apply(lambda x: escape_punctuation_boundaries(x))\n",
+ "df[\"context\"] = df[\"context\"].apply(lambda x: escape_odd_spaces(x))\n",
+ "\n",
+ "# removendo stop-words\n",
+ "stop = stopwords.words('english')\n",
+ "df['context'] = df['context'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))\n",
+ "\n",
+ "# renomeando as colunas porque senão dá um problema com a extração de features do NEOSP\n",
+ "df = df.rename(columns={ \"issuekey\": \"issuekey_\", \"created\": \"created_\", \"description\": \"description_\", \"title\": \"title_\", \"context\": \"context_\", \"storypoint\": \"storypoint_\"})\n",
+ "y = df[\"storypoint_\"]\n",
+ "df = df.drop(columns=['storypoint_'])\n",
+ "\n",
+ "# 5º coluna -> extração das features para o neosp\n",
+ "df[\"gunning_fog_\"] = df['context_'].apply(textstat.gunning_fog)\n",
+ "df[\"flesch_reading_ease_\"] = df['context_'].apply(textstat.flesch_reading_ease)\n",
+ "df[\"flesch_kincaid_grade_\"] = df['context_'].apply(textstat.flesch_kincaid_grade)\n",
+ "df[\"smog_index_\"] = df['context_'].apply(textstat.smog_index)\n",
+ "df[\"coleman_liau_index_\"] = df['context_'].apply(textstat.coleman_liau_index)\n",
+ "df[\"automated_readability_index_\"] = df['context_'].apply(textstat.automated_readability_index)\n",
+ "df[\"dale_chall_readability_score_\"] = df['context_'].apply(textstat.dale_chall_readability_score)\n",
+ "df[\"difficult_words_\"] = df['context_'].apply(textstat.difficult_words)\n",
+ "df[\"linsear_write_formula_\"] = df['context_'].apply(textstat.linsear_write_formula)\n",
+ "df[\"polarity_\"] = df[\"context_\"].apply(lambda x: TextBlob(x).sentiment.polarity)\n",
+ "df[\"subjectivity_\"] = df[\"context_\"].apply(lambda x: TextBlob(x).sentiment.subjectivity)\n",
+ "# 16º colunas\n",
+ "\n",
+ "# Extração das features para o TFIDF\n",
+ "vectorizer = TfidfVectorizer()\n",
+ "X_vec = vectorizer.fit_transform(df[\"context_\"])\n",
+ "#dump(vectorizer, \"vectorizer_tfidf.joblib\")\n",
+ "dump(vectorizer, \"models/tawos/{}/vectorizer_tawos_{}_tfidf.joblib\".format(project_name, project_name))\n",
+ "\n",
+ "df_vec = pd.DataFrame(data = X_vec.toarray(), columns = vectorizer.get_feature_names_out())\n",
+ "\n",
+ "# Juntando as features do neosp com o tfidf\n",
+ "df = df.join(df_vec)\n",
+ "X = df\n",
+ "\n",
+ "############ MbR\n",
+ "\n",
+ "model = DummyRegressor(strategy=\"mean\")\n",
+ "model.fit(X, y)\n",
+ "#dump(model, \"model_tawos_aloy_mbr.joblib\")\n",
+ "dump(model, \"models/tawos/{}/model_tawos_{}_mbr.joblib\".format(project_name, project_name))\n",
+ "\n",
+ "############ Mediana\n",
+ "\n",
+ "model = DummyRegressor(strategy=\"median\")\n",
+ "model.fit(X, y)\n",
+ "#dump(model, \"model_tawos_aloy_median.joblib\")\n",
+ "dump(model, \"models/tawos/{}/model_tawos_{}_median.joblib\".format(project_name, project_name))\n",
+ "\n",
+ "########### NEOSP-SVR\n",
+ "\n",
+ "model = svm.SVR()\n",
+ "model.fit(X[X.columns[5:16]], y)\n",
+ "#dump(model, \"model_tawos_aloy_neosp_svr.joblib\")\n",
+ "dump(model, \"models/tawos/{}/model_tawos_{}_neosp_svr.joblib\".format(project_name, project_name))\n",
+ "\n",
+ "########### NEOSP-LR\n",
+ "\n",
+ "model = LinearRegression()\n",
+ "model.fit(X[X.columns[5:16]], y)\n",
+ "#dump(model, \"model_tawos_aloy_neosp_linear.joblib\")\n",
+ "dump(model, \"models/tawos/{}/model_tawos_{}_neosp_linear.joblib\".format(project_name, project_name))\n",
+ "\n",
+ "############ TFIDF-SVM\n",
+ "\n",
+ "model = svm.SVR()\n",
+ "model.fit(X[X.columns[16:]], y)\n",
+ "#dump(model, \"model_tawos_aloy_tfidf_svr.joblib\")\n",
+ "dump(model, \"models/tawos/{}/model_tawos_{}_tfidf_svr.joblib\".format(project_name, project_name))\n",
+ "\n",
+ "############ TFIDF-LR\n",
+ "\n",
+ "model = LinearRegression()\n",
+ "model.fit(X[X.columns[16:]], y)\n",
+ "#dump(model, \"model_tawos_aloy_tfidf_linear.joblib\")\n",
+ "dump(model, \"models/tawos/{}/model_tawos_{}_tfidf_linear.joblib\".format(project_name, project_name))"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.11"
+ },
+ "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }
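
The notebook above serializes the fitted TF-IDF vectorizer and six regressors with joblib. A minimal sketch of how the TF-IDF linear model might later be loaded for prediction (paths taken from the dump calls above; the issue text is hypothetical, and in practice new text would need the same preprocessing and stop-word removal applied at training time):

from joblib import load

project_name = "APSTUD"
vectorizer = load("models/tawos/{}/vectorizer_tawos_{}_tfidf.joblib".format(project_name, project_name))
model = load("models/tawos/{}/model_tawos_{}_tfidf_linear.joblib".format(project_name, project_name))

new_issue = "App crashes when the debugger disconnects"  # hypothetical issue title + description
X_new = vectorizer.transform([new_issue]).toarray()       # TF-IDF features only, matching X.columns[16:] above
print(model.predict(X_new))                                # estimated story points
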
create_CLI_model.ipynb ADDED
@@ -0,0 +1,262 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "<class 'pandas.core.frame.DataFrame'>\n",
+ "RangeIndex: 293 entries, 0 to 292\n",
+ "Data columns (total 5 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 issuekey 293 non-null object\n",
+ " 1 created 293 non-null object\n",
+ " 2 title 293 non-null object\n",
+ " 3 description 293 non-null object\n",
+ " 4 storypoint 293 non-null int64 \n",
+ "dtypes: int64(1), object(4)\n",
+ "memory usage: 11.6+ KB\n"
+ ]
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "project_name = \"CLI\"\n",
+ "\n",
+ "df = pd.read_csv(\"database\\\\tawos\\\\deep\\\\{}_deep-se.csv\".format(project_name))\n",
+ "\n",
+ "df.info()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Pré-Processamento"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import re\n",
+ "from string import punctuation\n",
+ "\n",
+ "def escape_tags_and_content(text):\n",
+ " \"\"\"Escape tags and their content containing text, which is not written in natural language, such as code snippets\"\"\"\n",
+ "\n",
+ " NO_TEXT_TAGS = \"code\", \"noformat\"\n",
+ " for tag in NO_TEXT_TAGS:\n",
+ " regex_matching_tag = re.compile(\"\\{%s(.*?)\\}(.*?)\\{%s\\}\" % (tag, tag), re.DOTALL)\n",
+ " text = re.sub(regex_matching_tag, \"\", text)\n",
+ "\n",
+ " return text\n",
+ "\n",
+ "def escape_tags(text):\n",
+ " \"\"\"Escape markup tags, but retain their content\"\"\"\n",
+ "\n",
+ " ESCAPE_TAGS = \"color\", \"quote\", \"anchor\", \"panel\"\n",
+ " for tag in ESCAPE_TAGS:\n",
+ " text = re.sub(\"\\{%s(.*?)\\}\" % tag, \"\", text)\n",
+ "\n",
+ " return text\n",
+ "\n",
+ "def escape_strings(text):\n",
+ " \"\"\"Escape line breaks, tabulators, slashes and JIRA heading markup symbols\"\"\"\n",
+ "\n",
+ " ESCAPE_STRINGS = \"\\\\r\", \"\\\\n\", \"\\\\t\", \"\\\\f\", \"\\\\v\", \"\\\"\", \"\\\\\\\\\", \"h1. \", \"h2. \", \"h3. \", \"h4. \", \"h5. \", \"h6. \"\n",
+ " for escape_string in ESCAPE_STRINGS:\n",
+ " text = text.replace(escape_string, \" \")\n",
+ "\n",
+ " return text\n",
+ "\n",
+ "def escape_links(text):\n",
+ " \"\"\"Escape external and internal links, recognized by JIRA markup or leading 'http://' or 'https://' \"\"\"\n",
+ "\n",
+ " LINK_STARTERS = r\"\\#\", r\"\\^\", r\"http\\:\\/\\/\", r\"https\\:\\/\\/\", r\"malto\\:\", r\"file\\:\", r\"\\~\"\n",
+ " for link_starter in LINK_STARTERS:\n",
+ " text = re.sub(\"\\[(.*?\\\\|)?%s(.*?)\\]\" % link_starter, \"\", text)\n",
+ " text = re.sub(r\"\\bhttps?://\\S+\", \"\", text)\n",
+ "\n",
+ " return text\n",
+ "\n",
+ "def escape_hex_character_codes(text):\n",
+ " \"\"\"Escape characters outside the latin alphabet which are converted to hex code representation\"\"\"\n",
+ "\n",
+ " return re.sub(r\"\\\\x\\w\\w\", \"\", text)\n",
+ "\n",
+ "def escape_punctuation_boundaries(text):\n",
+ " \"\"\"Remove all punctuation marks from the beginning and end of words,\n",
+ " except for trailing period at the end of words\"\"\"\n",
+ "\n",
+ " return \" \".join([word.strip(punctuation.replace(\".\", \"\")).lstrip(\".\") for word in text.split()])\n",
+ "\n",
+ "def escape_odd_spaces(text):\n",
+ " \"\"\"Replace several consequent spaces with one space\n",
+ " and remove spaces from string start and end\"\"\"\n",
+ "\n",
+ " text = re.sub(r\"\\s+\", \" \", text)\n",
+ " text = text.strip()\n",
+ "\n",
+ " return text"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Criação do Modelo"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['models/tawos/CLI/model_tawos_CLI_tfidf_linear.joblib']"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from sklearn.dummy import DummyRegressor\n",
+ "from nltk.corpus import stopwords\n",
+ "from textblob import TextBlob\n",
+ "import textstat\n",
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+ "from sklearn import svm\n",
+ "from sklearn.linear_model import LinearRegression\n",
+ "from sklearn.feature_selection import SelectKBest\n",
+ "import pandas as pd\n",
+ "from joblib import dump\n",
+ "\n",
+ "# carregando os dados\n",
+ "df = pd.read_csv(\"database\\\\tawos\\\\deep\\\\{}_deep-se.csv\".format(project_name))\n",
+ "\n",
+ "# criação de uma nova coluna\n",
+ "df[\"context\"] = df[\"title\"] + df[\"description\"]\n",
+ "\n",
+ "# pré-processamento\n",
+ "df[\"context\"] = df[\"context\"].apply(lambda x: escape_tags_and_content(x))\n",
+ "df[\"context\"] = df[\"context\"].apply(lambda x: escape_tags(x))\n",
+ "df[\"context\"] = df[\"context\"].apply(lambda x: escape_strings(x))\n",
+ "df[\"context\"] = df[\"context\"].apply(lambda x: escape_links(x))\n",
+ "df[\"context\"] = df[\"context\"].apply(lambda x: escape_hex_character_codes(x))\n",
+ "df[\"context\"] = df[\"context\"].apply(lambda x: escape_punctuation_boundaries(x))\n",
+ "df[\"context\"] = df[\"context\"].apply(lambda x: escape_odd_spaces(x))\n",
+ "\n",
+ "# removendo stop-words\n",
+ "stop = stopwords.words('english')\n",
+ "df['context'] = df['context'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))\n",
+ "\n",
+ "# renomeando as colunas porque senão dá um problema com a extração de features do NEOSP\n",
+ "df = df.rename(columns={ \"issuekey\": \"issuekey_\", \"created\": \"created_\", \"description\": \"description_\", \"title\": \"title_\", \"context\": \"context_\", \"storypoint\": \"storypoint_\"})\n",
+ "y = df[\"storypoint_\"]\n",
+ "df = df.drop(columns=['storypoint_'])\n",
+ "\n",
+ "# 5º coluna -> extração das features para o neosp\n",
+ "df[\"gunning_fog_\"] = df['context_'].apply(textstat.gunning_fog)\n",
+ "df[\"flesch_reading_ease_\"] = df['context_'].apply(textstat.flesch_reading_ease)\n",
+ "df[\"flesch_kincaid_grade_\"] = df['context_'].apply(textstat.flesch_kincaid_grade)\n",
+ "df[\"smog_index_\"] = df['context_'].apply(textstat.smog_index)\n",
+ "df[\"coleman_liau_index_\"] = df['context_'].apply(textstat.coleman_liau_index)\n",
+ "df[\"automated_readability_index_\"] = df['context_'].apply(textstat.automated_readability_index)\n",
+ "df[\"dale_chall_readability_score_\"] = df['context_'].apply(textstat.dale_chall_readability_score)\n",
+ "df[\"difficult_words_\"] = df['context_'].apply(textstat.difficult_words)\n",
+ "df[\"linsear_write_formula_\"] = df['context_'].apply(textstat.linsear_write_formula)\n",
+ "df[\"polarity_\"] = df[\"context_\"].apply(lambda x: TextBlob(x).sentiment.polarity)\n",
+ "df[\"subjectivity_\"] = df[\"context_\"].apply(lambda x: TextBlob(x).sentiment.subjectivity)\n",
+ "# 16º colunas\n",
+ "\n",
+ "# Extração das features para o TFIDF\n",
+ "vectorizer = TfidfVectorizer()\n",
+ "X_vec = vectorizer.fit_transform(df[\"context_\"])\n",
+ "#dump(vectorizer, \"vectorizer_tfidf.joblib\")\n",
+ "dump(vectorizer, \"models/tawos/{}/vectorizer_tawos_{}_tfidf.joblib\".format(project_name, project_name))\n",
+ "\n",
+ "df_vec = pd.DataFrame(data = X_vec.toarray(), columns = vectorizer.get_feature_names_out())\n",
+ "\n",
+ "# Juntando as features do neosp com o tfidf\n",
+ "df = df.join(df_vec)\n",
+ "X = df\n",
+ "\n",
+ "############ MbR\n",
+ "\n",
+ "model = DummyRegressor(strategy=\"mean\")\n",
+ "model.fit(X, y)\n",
+ "#dump(model, \"model_tawos_aloy_mbr.joblib\")\n",
+ "dump(model, \"models/tawos/{}/model_tawos_{}_mbr.joblib\".format(project_name, project_name))\n",
+ "\n",
+ "############ Mediana\n",
+ "\n",
+ "model = DummyRegressor(strategy=\"median\")\n",
+ "model.fit(X, y)\n",
+ "#dump(model, \"model_tawos_aloy_median.joblib\")\n",
+ "dump(model, \"models/tawos/{}/model_tawos_{}_median.joblib\".format(project_name, project_name))\n",
+ "\n",
+ "########### NEOSP-SVR\n",
+ "\n",
+ "model = svm.SVR()\n",
+ "model.fit(X[X.columns[5:16]], y)\n",
+ "#dump(model, \"model_tawos_aloy_neosp_svr.joblib\")\n",
+ "dump(model, \"models/tawos/{}/model_tawos_{}_neosp_svr.joblib\".format(project_name, project_name))\n",
+ "\n",
+ "########### NEOSP-LR\n",
+ "\n",
+ "model = LinearRegression()\n",
+ "model.fit(X[X.columns[5:16]], y)\n",
+ "#dump(model, \"model_tawos_aloy_neosp_linear.joblib\")\n",
+ "dump(model, \"models/tawos/{}/model_tawos_{}_neosp_linear.joblib\".format(project_name, project_name))\n",
+ "\n",
+ "############ TFIDF-SVM\n",
+ "\n",
+ "model = svm.SVR()\n",
+ "model.fit(X[X.columns[16:]], y)\n",
+ "#dump(model, \"model_tawos_aloy_tfidf_svr.joblib\")\n",
+ "dump(model, \"models/tawos/{}/model_tawos_{}_tfidf_svr.joblib\".format(project_name, project_name))\n",
+ "\n",
+ "############ TFIDF-LR\n",
+ "\n",
+ "model = LinearRegression()\n",
+ "model.fit(X[X.columns[16:]], y)\n",
+ "#dump(model, \"model_tawos_aloy_tfidf_linear.joblib\")\n",
+ "dump(model, \"models/tawos/{}/model_tawos_{}_tfidf_linear.joblib\".format(project_name, project_name))"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.11"
+ },
+ "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }
create_TIMOB_model.ipynb CHANGED
@@ -2,9 +2,28 @@
  "cells": [
  {
  "cell_type": "code",
- "execution_count": null,
+ "execution_count": 1,
  "metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "<class 'pandas.core.frame.DataFrame'>\n",
+ "RangeIndex: 3915 entries, 0 to 3914\n",
+ "Data columns (total 5 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 issuekey 3915 non-null object \n",
+ " 1 created 3915 non-null object \n",
+ " 2 title 3915 non-null object \n",
+ " 3 description 3915 non-null object \n",
+ " 4 storypoint 3915 non-null float64\n",
+ "dtypes: float64(1), object(4)\n",
+ "memory usage: 153.1+ KB\n"
+ ]
+ }
+ ],
  "source": [
  "import pandas as pd\n",
  "project_name = \"TIMOB\"\n",
@@ -23,7 +42,7 @@
  },
  {
  "cell_type": "code",
- "execution_count": null,
+ "execution_count": 2,
  "metadata": {},
  "outputs": [],
  "source": [
@@ -98,9 +117,20 @@
  },
  {
  "cell_type": "code",
- "execution_count": null,
+ "execution_count": 3,
  "metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['models/tawos/TIMOB/model_tawos_TIMOB_tfidf_linear.joblib']"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
  "source": [
  "from sklearn.dummy import DummyRegressor\n",
  "from nltk.corpus import stopwords\n",
create_XD_model.ipynb CHANGED
@@ -2,12 +2,31 @@
  "cells": [
  {
  "cell_type": "code",
- "execution_count": null,
+ "execution_count": 2,
  "metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "<class 'pandas.core.frame.DataFrame'>\n",
+ "RangeIndex: 811 entries, 0 to 810\n",
+ "Data columns (total 5 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 issuekey 811 non-null object\n",
+ " 1 created 811 non-null object\n",
+ " 2 title 811 non-null object\n",
+ " 3 description 811 non-null object\n",
+ " 4 storypoint 811 non-null int64 \n",
+ "dtypes: int64(1), object(4)\n",
+ "memory usage: 31.8+ KB\n"
+ ]
+ }
+ ],
  "source": [
  "import pandas as pd\n",
- "project_name = \"TIMOB\"\n",
+ "project_name = \"XD\"\n",
  "\n",
  "df = pd.read_csv(\"database\\\\tawos\\\\deep\\\\{}_deep-se.csv\".format(project_name))\n",
  "\n",
@@ -23,7 +42,7 @@
  },
  {
  "cell_type": "code",
- "execution_count": null,
+ "execution_count": 3,
  "metadata": {},
  "outputs": [],
  "source": [
@@ -98,9 +117,20 @@
  },
  {
  "cell_type": "code",
- "execution_count": null,
+ "execution_count": 4,
  "metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['models/tawos/XD/model_tawos_XD_tfidf_linear.joblib']"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
  "source": [
  "from sklearn.dummy import DummyRegressor\n",
  "from nltk.corpus import stopwords\n",
create_alloy_model.ipynb CHANGED
@@ -2,9 +2,28 @@
  "cells": [
  {
  "cell_type": "code",
- "execution_count": null,
+ "execution_count": 10,
  "metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "<class 'pandas.core.frame.DataFrame'>\n",
+ "RangeIndex: 241 entries, 0 to 240\n",
+ "Data columns (total 5 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 issuekey 241 non-null object\n",
+ " 1 created 241 non-null object\n",
+ " 2 title 241 non-null object\n",
+ " 3 description 241 non-null object\n",
+ " 4 storypoint 241 non-null int64 \n",
+ "dtypes: int64(1), object(4)\n",
+ "memory usage: 9.5+ KB\n"
+ ]
+ }
+ ],
  "source": [
  "import pandas as pd\n",
  "project_name = \"ALOY\"\n",
@@ -21,7 +40,7 @@
  },
  {
  "cell_type": "code",
- "execution_count": null,
+ "execution_count": 11,
  "metadata": {},
  "outputs": [],
  "source": [
@@ -96,9 +115,20 @@
  },
  {
  "cell_type": "code",
- "execution_count": null,
+ "execution_count": 12,
  "metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['models/tawos/ALOY/model_tawos_ALOY_tfidf_linear.joblib']"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
  "source": [
  "from sklearn.dummy import DummyRegressor\n",
  "from nltk.corpus import stopwords\n",
models/tawos/APSTUD/model_tawos_APSTUD_mbr.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7acac58dc585421fbd6a591c8ec452275e9c4e48ae37c5dd82497c0ab35cc6b3
+ size 383
models/tawos/APSTUD/model_tawos_APSTUD_median.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f6571c714502722b037a8acd8cbf088c366257eb1061179d542a657eea7aba33
+ size 383
models/tawos/APSTUD/model_tawos_APSTUD_neosp_linear.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2e13246fcc693b4894adca7b7bf2eec614fc6a3ab96b58860146471b6b458550
+ size 1280
models/tawos/APSTUD/model_tawos_APSTUD_neosp_svr.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:80ca226d62be4cd400503dbbf0705617bb3b9e9694a2059ce7c41bdeb5ab9a84
+ size 48180
models/tawos/APSTUD/model_tawos_APSTUD_tfidf_linear.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b103fa59d6908c65ba7c761c6a316f207b92a043a4ab5b8b45881bae91971c84
+ size 137848
models/tawos/APSTUD/model_tawos_APSTUD_tfidf_svr.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5d7c47cb8057e26871533f974cc1589166ac35ba35b124f961f02de08a07f207
+ size 19491164
models/tawos/APSTUD/vectorizer_tawos_APSTUD_tfidf.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:61778540a81a78a549cf7c03df66b5eef8cc66202e072d083ab57aef64399649
+ size 155196
models/tawos/CLI/model_tawos_CLI_mbr.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:45703b7823252f7922bc9c977d04b9b120b71304bdce17cc28344caa35fabbbe
+ size 383
models/tawos/CLI/model_tawos_CLI_median.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f33655a67e4d587143f615b1604a45fc5cac5b70b0c8e999b47a953a43511e43
+ size 383
models/tawos/CLI/model_tawos_CLI_neosp_linear.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6a4a3292bebabee20e9e36430e6554f3de8da5fb422571484f6a95c227e81576
+ size 1280
models/tawos/CLI/model_tawos_CLI_neosp_svr.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5f0b063bb04d30f576212a7154e6961c919e75595f6e143c42e4c19bc06a0844
+ size 29171
models/tawos/CLI/model_tawos_CLI_tfidf_linear.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dfdc99bda09e610d6e429b394fae12c28898e7e28f4467c097d0e314522240ba
+ size 60608
models/tawos/CLI/model_tawos_CLI_tfidf_svr.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5f4e2eef9d29a97978d4690b2502a7c3f2249d4d7a9bb5a2332875634919268c
+ size 5148515
models/tawos/CLI/vectorizer_tawos_CLI_tfidf.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bcd970eb344f06bd09a584eceb7237b1705c88452b7a1f0ea2e7b434d9c8dbae
+ size 68396
models/tawos/TIMOB/model_tawos_TIMOB_mbr.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:102c44692f2343dba35472d1eb958558c33726394f70a796bf0b8f4aea4f930e
+ size 383
models/tawos/TIMOB/model_tawos_TIMOB_median.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b953e6d331fe351298ed5347ee1248ed7b925103d17579f22e1af38f9969c6a7
+ size 383
models/tawos/TIMOB/model_tawos_TIMOB_neosp_linear.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:46b123d78db5341847c348fcd4c4717b735e7a56b183d99c4fc2df11ea4cfdfc
+ size 1280
models/tawos/TIMOB/model_tawos_TIMOB_neosp_svr.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:37fc0c3bfcad3e6422f9d764331d739bd99d618449abb1e6c2c8c197d41ae1d4
+ size 392692
models/tawos/TIMOB/model_tawos_TIMOB_tfidf_linear.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f4ade1379f8b7b1322fe6ec3c8b6a858d9a7f3201f7ee14f2d795df33801f108
+ size 427664
models/tawos/TIMOB/model_tawos_TIMOB_tfidf_svr.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5d8509d7867bbc0600873a3b423e76541ed9a93131b85e9b518800a8114bb8b6
+ size 447037100
models/tawos/TIMOB/vectorizer_tawos_TIMOB_tfidf.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bb23410643efcc29e0f566115da31efd5958f03fbfaea577569d2a6c29157110
+ size 455625