ultima versao

Browse files

Files changed (11) hide show

create_TIMOB_model.ipynb +232 -0
create_XD_model.ipynb +232 -0
create_alloy_model.ipynb +22 -56
models/tawos/XD/model_tawos_XD_mbr.joblib +3 -0
models/tawos/XD/model_tawos_XD_median.joblib +3 -0
models/tawos/XD/model_tawos_XD_neosp_linear.joblib +3 -0
models/tawos/XD/model_tawos_XD_neosp_svr.joblib +3 -0
models/tawos/XD/model_tawos_XD_tfidf_linear.joblib +3 -0
models/tawos/XD/model_tawos_XD_tfidf_svr.joblib +3 -0
models/tawos/XD/vectorizer_tawos_XD_tfidf.joblib +3 -0
models/tawos/aloy/{vectorizer_tfidf.joblib → vectorizer_tawos_ALOY_tfidf.joblib} +1 -1

create_TIMOB_model.ipynb ADDED Viewed

	@@ -0,0 +1,232 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "project_name = \"TIMOB\"\n",
+    "\n",
+    "df = pd.read_csv(\"database\\\\tawos\\\\deep\\\\{}_deep-se.csv\".format(project_name))\n",
+    "\n",
+    "df.info()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Pré-Processamento"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import re\n",
+    "from string import punctuation\n",
+    "\n",
+    "def escape_tags_and_content(text):\n",
+    "    \"\"\"Escape tags and their content containing text, which is not written in natural language, such as code snippets\"\"\"\n",
+    "\n",
+    "    NO_TEXT_TAGS = \"code\", \"noformat\"\n",
+    "    for tag in NO_TEXT_TAGS:\n",
+    "        regex_matching_tag = re.compile(\"\\{%s(.*?)\\}(.*?)\\{%s\\}\" % (tag, tag), re.DOTALL)\n",
+    "        text = re.sub(regex_matching_tag, \"\", text)\n",
+    "\n",
+    "    return text\n",
+    "\n",
+    "def escape_tags(text):\n",
+    "    \"\"\"Escape markup tags, but retain their content\"\"\"\n",
+    "\n",
+    "    ESCAPE_TAGS = \"color\", \"quote\", \"anchor\", \"panel\"\n",
+    "    for tag in  ESCAPE_TAGS:\n",
+    "        text = re.sub(\"\\{%s(.*?)\\}\" % tag, \"\", text)\n",
+    "\n",
+    "    return text\n",
+    "\n",
+    "def escape_strings(text):\n",
+    "    \"\"\"Replace line breaks, tabs, double quotes, backslashes and JIRA heading markup (h1.-h6.) with a space\"\"\"\n",
+    "\n",
+    "    ESCAPE_STRINGS = \"\\\\r\", \"\\\\n\", \"\\\\t\", \"\\\\f\", \"\\\\v\", \"\\\"\", \"\\\\\\\\\", \"h1. \", \"h2. \", \"h3. \", \"h4. \", \"h5. \", \"h6. \"\n",
+    "    for escape_string in ESCAPE_STRINGS:\n",
+    "        text = text.replace(escape_string, \" \")\n",
+    "\n",
+    "    return text\n",
+    "\n",
+    "def escape_links(text):\n",
+    "    \"\"\"Escape external and internal links, recognized by JIRA markup or leading 'http://' or 'https://' \"\"\"\n",
+    "\n",
+    "    LINK_STARTERS = r\"\\#\", r\"\\^\", r\"http\\:\\/\\/\", r\"https\\:\\/\\/\", r\"mailto\\:\", r\"file\\:\", r\"\\~\"\n",
+    "    for link_starter in LINK_STARTERS:\n",
+    "        text = re.sub(\"\\[(.*?\\\\|)?%s(.*?)\\]\" % link_starter, \"\", text)\n",
+    "        text = re.sub(r\"\\bhttps?://\\S+\", \"\", text)\n",
+    "\n",
+    "    return text\n",
+    "\n",
+    "def escape_hex_character_codes(text):\n",
+    "    \"\"\"Escape characters outside the latin alphabet which are converted to hex code representation\"\"\"\n",
+    "\n",
+    "    return re.sub(r\"\\\\x\\w\\w\", \"\", text)\n",
+    "\n",
+    "def escape_punctuation_boundaries(text):\n",
+    "    \"\"\"Remove all punctuation marks from the beginning and end of words,\n",
+    "    except for trailing period at the end of words\"\"\"\n",
+    "\n",
+    "    return \" \".join([word.strip(punctuation.replace(\".\", \"\")).lstrip(\".\") for word in text.split()])\n",
+    "\n",
+    "def escape_odd_spaces(text):\n",
+    "    \"\"\"Replace several consecutive whitespace characters with one space\n",
+    "    and remove spaces from string start and end\"\"\"\n",
+    "\n",
+    "    text = re.sub(r\"\\s+\", \" \", text)\n",
+    "    text = text.strip()\n",
+    "\n",
+    "    return text"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Criação do Modelo"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.dummy import DummyRegressor\n",
+    "from nltk.corpus import stopwords\n",
+    "from textblob import TextBlob\n",
+    "import textstat\n",
+    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+    "from sklearn import svm\n",
+    "from sklearn.linear_model import LinearRegression\n",
+    "from sklearn.feature_selection import SelectKBest\n",
+    "import pandas as pd\n",
+    "from joblib import dump\n",
+    "\n",
+    "# carregando os dados\n",
+    "df = pd.read_csv(\"database\\\\tawos\\\\deep\\\\{}_deep-se.csv\".format(project_name))\n",
+    "\n",
+    "# criação de uma nova coluna\n",
+    "df[\"context\"] = df[\"title\"] + df[\"description\"]\n",
+    "\n",
+    "# pré-processamento\n",
+    "df[\"context\"] = df[\"context\"].apply(lambda x: escape_tags_and_content(x))\n",
+    "df[\"context\"] = df[\"context\"].apply(lambda x: escape_tags(x))\n",
+    "df[\"context\"] = df[\"context\"].apply(lambda x: escape_strings(x))\n",
+    "df[\"context\"] = df[\"context\"].apply(lambda x: escape_links(x))\n",
+    "df[\"context\"] = df[\"context\"].apply(lambda x: escape_hex_character_codes(x))\n",
+    "df[\"context\"] = df[\"context\"].apply(lambda x: escape_punctuation_boundaries(x))\n",
+    "df[\"context\"] = df[\"context\"].apply(lambda x: escape_odd_spaces(x))\n",
+    "\n",
+    "# removendo stop-words\n",
+    "stop = stopwords.words('english')\n",
+    "df['context'] = df['context'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))\n",
+    "\n",
+    "# renomeando as colunas porque senão dá um problema com a extração de features do NEOSP\n",
+    "df = df.rename(columns={ \"issuekey\": \"issuekey_\", \"created\": \"created_\", \"description\": \"description_\", \"title\": \"title_\", \"context\": \"context_\", \"storypoint\": \"storypoint_\"})\n",
+    "y = df[\"storypoint_\"]\n",
+    "df = df.drop(columns=['storypoint_'])\n",
+    "\n",
+    "# 5º coluna -> extração das features para o neosp\n",
+    "df[\"gunning_fog_\"] = df['context_'].apply(textstat.gunning_fog)\n",
+    "df[\"flesch_reading_ease_\"] = df['context_'].apply(textstat.flesch_reading_ease)\n",
+    "df[\"flesch_kincaid_grade_\"] = df['context_'].apply(textstat.flesch_kincaid_grade)\n",
+    "df[\"smog_index_\"] = df['context_'].apply(textstat.smog_index)\n",
+    "df[\"coleman_liau_index_\"] = df['context_'].apply(textstat.coleman_liau_index)\n",
+    "df[\"automated_readability_index_\"] = df['context_'].apply(textstat.automated_readability_index)\n",
+    "df[\"dale_chall_readability_score_\"] = df['context_'].apply(textstat.dale_chall_readability_score)\n",
+    "df[\"difficult_words_\"] = df['context_'].apply(textstat.difficult_words)\n",
+    "df[\"linsear_write_formula_\"] = df['context_'].apply(textstat.linsear_write_formula)\n",
+    "df[\"polarity_\"] = df[\"context_\"].apply(lambda x: TextBlob(x).sentiment.polarity)\n",
+    "df[\"subjectivity_\"] = df[\"context_\"].apply(lambda x: TextBlob(x).sentiment.subjectivity)\n",
+    "# 16º colunas\n",
+    "\n",
+    "# Extração das features para o TFIDF\n",
+    "vectorizer = TfidfVectorizer()\n",
+    "X_vec = vectorizer.fit_transform(df[\"context_\"])\n",
+    "#dump(vectorizer, \"vectorizer_tfidf.joblib\")\n",
+    "dump(vectorizer, \"models/tawos/{}/vectorizer_tawos_{}_tfidf.joblib\".format(project_name, project_name))\n",
+    "\n",
+    "df_vec = pd.DataFrame(data = X_vec.toarray(), columns = vectorizer.get_feature_names_out())\n",
+    "\n",
+    "# Juntando as features do neosp com o tfidf\n",
+    "df = df.join(df_vec)\n",
+    "X = df\n",
+    "\n",
+    "############ MbR\n",
+    "\n",
+    "model = DummyRegressor(strategy=\"mean\")\n",
+    "model.fit(X, y)\n",
+    "#dump(model, \"model_tawos_aloy_mbr.joblib\")\n",
+    "dump(model, \"models/tawos/{}/model_tawos_{}_mbr.joblib\".format(project_name, project_name))\n",
+    "\n",
+    "############ Mediana\n",
+    "\n",
+    "model = DummyRegressor(strategy=\"median\")\n",
+    "model.fit(X, y)\n",
+    "#dump(model, \"model_tawos_aloy_median.joblib\")\n",
+    "dump(model, \"models/tawos/{}/model_tawos_{}_median.joblib\".format(project_name, project_name))\n",
+    "\n",
+    "########### NEOSP-SVR\n",
+    "\n",
+    "model = svm.SVR()\n",
+    "model.fit(X[X.columns[5:16]], y)\n",
+    "#dump(model, \"model_tawos_aloy_neosp_svr.joblib\")\n",
+    "dump(model, \"models/tawos/{}/model_tawos_{}_neosp_svr.joblib\".format(project_name, project_name))\n",
+    "\n",
+    "########### NEOSP-LR\n",
+    "\n",
+    "model = LinearRegression()\n",
+    "model.fit(X[X.columns[5:16]], y)\n",
+    "#dump(model, \"model_tawos_aloy_neosp_linear.joblib\")\n",
+    "dump(model, \"models/tawos/{}/model_tawos_{}_neosp_linear.joblib\".format(project_name, project_name))\n",
+    "\n",
+    "############ TFIDF-SVM\n",
+    "\n",
+    "model = svm.SVR()\n",
+    "model.fit(X[X.columns[16:]], y)\n",
+    "#dump(model, \"model_tawos_aloy_tfidf_svr.joblib\")\n",
+    "dump(model, \"models/tawos/{}/model_tawos_{}_tfidf_svr.joblib\".format(project_name, project_name))\n",
+    "\n",
+    "############ TFIDF-LR\n",
+    "\n",
+    "model = LinearRegression()\n",
+    "model.fit(X[X.columns[16:]], y)\n",
+    "#dump(model, \"model_tawos_aloy_tfidf_linear.joblib\")\n",
+    "dump(model, \"models/tawos/{}/model_tawos_{}_tfidf_linear.joblib\".format(project_name, project_name))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.11"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

create_XD_model.ipynb ADDED Viewed

	@@ -0,0 +1,232 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "project_name = \"XD\"\n",
+    "\n",
+    "df = pd.read_csv(\"database\\\\tawos\\\\deep\\\\{}_deep-se.csv\".format(project_name))\n",
+    "\n",
+    "df.info()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Pré-Processamento"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import re\n",
+    "from string import punctuation\n",
+    "\n",
+    "def escape_tags_and_content(text):\n",
+    "    \"\"\"Escape tags and their content containing text, which is not written in natural language, such as code snippets\"\"\"\n",
+    "\n",
+    "    NO_TEXT_TAGS = \"code\", \"noformat\"\n",
+    "    for tag in NO_TEXT_TAGS:\n",
+    "        regex_matching_tag = re.compile(\"\\{%s(.*?)\\}(.*?)\\{%s\\}\" % (tag, tag), re.DOTALL)\n",
+    "        text = re.sub(regex_matching_tag, \"\", text)\n",
+    "\n",
+    "    return text\n",
+    "\n",
+    "def escape_tags(text):\n",
+    "    \"\"\"Escape markup tags, but retain their content\"\"\"\n",
+    "\n",
+    "    ESCAPE_TAGS = \"color\", \"quote\", \"anchor\", \"panel\"\n",
+    "    for tag in  ESCAPE_TAGS:\n",
+    "        text = re.sub(\"\\{%s(.*?)\\}\" % tag, \"\", text)\n",
+    "\n",
+    "    return text\n",
+    "\n",
+    "def escape_strings(text):\n",
+    "    \"\"\"Replace line breaks, tabs, double quotes, backslashes and JIRA heading markup (h1.-h6.) with a space\"\"\"\n",
+    "\n",
+    "    ESCAPE_STRINGS = \"\\\\r\", \"\\\\n\", \"\\\\t\", \"\\\\f\", \"\\\\v\", \"\\\"\", \"\\\\\\\\\", \"h1. \", \"h2. \", \"h3. \", \"h4. \", \"h5. \", \"h6. \"\n",
+    "    for escape_string in ESCAPE_STRINGS:\n",
+    "        text = text.replace(escape_string, \" \")\n",
+    "\n",
+    "    return text\n",
+    "\n",
+    "def escape_links(text):\n",
+    "    \"\"\"Escape external and internal links, recognized by JIRA markup or leading 'http://' or 'https://' \"\"\"\n",
+    "\n",
+    "    LINK_STARTERS = r\"\\#\", r\"\\^\", r\"http\\:\\/\\/\", r\"https\\:\\/\\/\", r\"mailto\\:\", r\"file\\:\", r\"\\~\"\n",
+    "    for link_starter in LINK_STARTERS:\n",
+    "        text = re.sub(\"\\[(.*?\\\\|)?%s(.*?)\\]\" % link_starter, \"\", text)\n",
+    "        text = re.sub(r\"\\bhttps?://\\S+\", \"\", text)\n",
+    "\n",
+    "    return text\n",
+    "\n",
+    "def escape_hex_character_codes(text):\n",
+    "    \"\"\"Escape characters outside the latin alphabet which are converted to hex code representation\"\"\"\n",
+    "\n",
+    "    return re.sub(r\"\\\\x\\w\\w\", \"\", text)\n",
+    "\n",
+    "def escape_punctuation_boundaries(text):\n",
+    "    \"\"\"Remove all punctuation marks from the beginning and end of words,\n",
+    "    except for trailing period at the end of words\"\"\"\n",
+    "\n",
+    "    return \" \".join([word.strip(punctuation.replace(\".\", \"\")).lstrip(\".\") for word in text.split()])\n",
+    "\n",
+    "def escape_odd_spaces(text):\n",
+    "    \"\"\"Replace several consecutive whitespace characters with one space\n",
+    "    and remove spaces from string start and end\"\"\"\n",
+    "\n",
+    "    text = re.sub(r\"\\s+\", \" \", text)\n",
+    "    text = text.strip()\n",
+    "\n",
+    "    return text"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Criação do Modelo"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.dummy import DummyRegressor\n",
+    "from nltk.corpus import stopwords\n",
+    "from textblob import TextBlob\n",
+    "import textstat\n",
+    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+    "from sklearn import svm\n",
+    "from sklearn.linear_model import LinearRegression\n",
+    "from sklearn.feature_selection import SelectKBest\n",
+    "import pandas as pd\n",
+    "from joblib import dump\n",
+    "\n",
+    "# carregando os dados\n",
+    "df = pd.read_csv(\"database\\\\tawos\\\\deep\\\\{}_deep-se.csv\".format(project_name))\n",
+    "\n",
+    "# criação de uma nova coluna\n",
+    "df[\"context\"] = df[\"title\"] + df[\"description\"]\n",
+    "\n",
+    "# pré-processamento\n",
+    "df[\"context\"] = df[\"context\"].apply(lambda x: escape_tags_and_content(x))\n",
+    "df[\"context\"] = df[\"context\"].apply(lambda x: escape_tags(x))\n",
+    "df[\"context\"] = df[\"context\"].apply(lambda x: escape_strings(x))\n",
+    "df[\"context\"] = df[\"context\"].apply(lambda x: escape_links(x))\n",
+    "df[\"context\"] = df[\"context\"].apply(lambda x: escape_hex_character_codes(x))\n",
+    "df[\"context\"] = df[\"context\"].apply(lambda x: escape_punctuation_boundaries(x))\n",
+    "df[\"context\"] = df[\"context\"].apply(lambda x: escape_odd_spaces(x))\n",
+    "\n",
+    "# removendo stop-words\n",
+    "stop = stopwords.words('english')\n",
+    "df['context'] = df['context'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))\n",
+    "\n",
+    "# renomeando as colunas porque senão dá um problema com a extração de features do NEOSP\n",
+    "df = df.rename(columns={ \"issuekey\": \"issuekey_\", \"created\": \"created_\", \"description\": \"description_\", \"title\": \"title_\", \"context\": \"context_\", \"storypoint\": \"storypoint_\"})\n",
+    "y = df[\"storypoint_\"]\n",
+    "df = df.drop(columns=['storypoint_'])\n",
+    "\n",
+    "# 5º coluna -> extração das features para o neosp\n",
+    "df[\"gunning_fog_\"] = df['context_'].apply(textstat.gunning_fog)\n",
+    "df[\"flesch_reading_ease_\"] = df['context_'].apply(textstat.flesch_reading_ease)\n",
+    "df[\"flesch_kincaid_grade_\"] = df['context_'].apply(textstat.flesch_kincaid_grade)\n",
+    "df[\"smog_index_\"] = df['context_'].apply(textstat.smog_index)\n",
+    "df[\"coleman_liau_index_\"] = df['context_'].apply(textstat.coleman_liau_index)\n",
+    "df[\"automated_readability_index_\"] = df['context_'].apply(textstat.automated_readability_index)\n",
+    "df[\"dale_chall_readability_score_\"] = df['context_'].apply(textstat.dale_chall_readability_score)\n",
+    "df[\"difficult_words_\"] = df['context_'].apply(textstat.difficult_words)\n",
+    "df[\"linsear_write_formula_\"] = df['context_'].apply(textstat.linsear_write_formula)\n",
+    "df[\"polarity_\"] = df[\"context_\"].apply(lambda x: TextBlob(x).sentiment.polarity)\n",
+    "df[\"subjectivity_\"] = df[\"context_\"].apply(lambda x: TextBlob(x).sentiment.subjectivity)\n",
+    "# 16º colunas\n",
+    "\n",
+    "# Extração das features para o TFIDF\n",
+    "vectorizer = TfidfVectorizer()\n",
+    "X_vec = vectorizer.fit_transform(df[\"context_\"])\n",
+    "#dump(vectorizer, \"vectorizer_tfidf.joblib\")\n",
+    "dump(vectorizer, \"models/tawos/{}/vectorizer_tawos_{}_tfidf.joblib\".format(project_name, project_name))\n",
+    "\n",
+    "df_vec = pd.DataFrame(data = X_vec.toarray(), columns = vectorizer.get_feature_names_out())\n",
+    "\n",
+    "# Juntando as features do neosp com o tfidf\n",
+    "df = df.join(df_vec)\n",
+    "X = df\n",
+    "\n",
+    "############ MbR\n",
+    "\n",
+    "model = DummyRegressor(strategy=\"mean\")\n",
+    "model.fit(X, y)\n",
+    "#dump(model, \"model_tawos_aloy_mbr.joblib\")\n",
+    "dump(model, \"models/tawos/{}/model_tawos_{}_mbr.joblib\".format(project_name, project_name))\n",
+    "\n",
+    "############ Mediana\n",
+    "\n",
+    "model = DummyRegressor(strategy=\"median\")\n",
+    "model.fit(X, y)\n",
+    "#dump(model, \"model_tawos_aloy_median.joblib\")\n",
+    "dump(model, \"models/tawos/{}/model_tawos_{}_median.joblib\".format(project_name, project_name))\n",
+    "\n",
+    "########### NEOSP-SVR\n",
+    "\n",
+    "model = svm.SVR()\n",
+    "model.fit(X[X.columns[5:16]], y)\n",
+    "#dump(model, \"model_tawos_aloy_neosp_svr.joblib\")\n",
+    "dump(model, \"models/tawos/{}/model_tawos_{}_neosp_svr.joblib\".format(project_name, project_name))\n",
+    "\n",
+    "########### NEOSP-LR\n",
+    "\n",
+    "model = LinearRegression()\n",
+    "model.fit(X[X.columns[5:16]], y)\n",
+    "#dump(model, \"model_tawos_aloy_neosp_linear.joblib\")\n",
+    "dump(model, \"models/tawos/{}/model_tawos_{}_neosp_linear.joblib\".format(project_name, project_name))\n",
+    "\n",
+    "############ TFIDF-SVM\n",
+    "\n",
+    "model = svm.SVR()\n",
+    "model.fit(X[X.columns[16:]], y)\n",
+    "#dump(model, \"model_tawos_aloy_tfidf_svr.joblib\")\n",
+    "dump(model, \"models/tawos/{}/model_tawos_{}_tfidf_svr.joblib\".format(project_name, project_name))\n",
+    "\n",
+    "############ TFIDF-LR\n",
+    "\n",
+    "model = LinearRegression()\n",
+    "model.fit(X[X.columns[16:]], y)\n",
+    "#dump(model, \"model_tawos_aloy_tfidf_linear.joblib\")\n",
+    "dump(model, \"models/tawos/{}/model_tawos_{}_tfidf_linear.joblib\".format(project_name, project_name))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.11"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

create_alloy_model.ipynb CHANGED Viewed

@@ -2,31 +2,13 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 4,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "<class 'pandas.core.frame.DataFrame'>\n",
-      "RangeIndex: 241 entries, 0 to 240\n",
-      "Data columns (total 5 columns):\n",
-      " #   Column       Non-Null Count  Dtype \n",
-      "---  ------       --------------  ----- \n",
-      " 0   issuekey     241 non-null    object\n",
-      " 1   created      241 non-null    object\n",
-      " 2   title        241 non-null    object\n",
-      " 3   description  241 non-null    object\n",
-      " 4   storypoint   241 non-null    int64 \n",
-      "dtypes: int64(1), object(4)\n",
-      "memory usage: 9.5+ KB\n"
-     ]
-    }
-   ],
    "source": [
     "import pandas as pd\n",
-    "df = pd.read_csv(\"database\\\\tawos\\\\deep\\\\ALOY_deep-se.csv\")\n",
     "df.info()"
    ]
   },
@@ -39,7 +21,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -114,20 +96,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "['model_tawos_aloy_tfidf_linear.joblib']"
-      ]
-     },
-     "execution_count": 6,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
    "source": [
     "from sklearn.dummy import DummyRegressor\n",
     "from nltk.corpus import stopwords\n",
@@ -141,7 +112,7 @@
     "from joblib import dump\n",
     "\n",
     "# carregando os dados\n",
-    "df = pd.read_csv(\"database\\\\tawos\\\\deep\\\\ALOY_deep-se.csv\")\n",
     "\n",
     "# criação de uma nova coluna\n",
     "df[\"context\"] = df[\"title\"] + df[\"description\"]\n",
@@ -181,8 +152,8 @@
     "# Extração das features para o TFIDF\n",
     "vectorizer = TfidfVectorizer()\n",
     "X_vec = vectorizer.fit_transform(df[\"context_\"])\n",
-    "dump(vectorizer, \"vectorizer_tfidf.joblib\")\n",
-    "#dump(vectorizer, \"models/tawos/aloy/vectorizer_tfidf.joblib\")\n",
     "\n",
     "df_vec = pd.DataFrame(data = X_vec.toarray(), columns = vectorizer.get_feature_names_out())\n",
     "\n",
@@ -194,49 +165,44 @@
     "\n",
     "model = DummyRegressor(strategy=\"mean\")\n",
     "model.fit(X, y)\n",
-    "dump(model, \"model_tawos_aloy_mbr.joblib\")\n",
-    "#dump(model, \"models/tawos/aloy/model_tawos_aloy_mbr.joblib\")\n",
     "\n",
     "############ Mediana\n",
     "\n",
     "model = DummyRegressor(strategy=\"median\")\n",
     "model.fit(X, y)\n",
-    "dump(model, \"model_tawos_aloy_median.joblib\")\n",
-    "#dump(model, \"models/tawos/aloy/model_tawos_aloy_median.joblib\")\n",
     "\n",
     "########### NEOSP-SVR\n",
     "\n",
     "model = svm.SVR()\n",
     "model.fit(X[X.columns[5:16]], y)\n",
-    "dump(model, \"model_tawos_aloy_neosp_svr.joblib\")\n",
-    "#dump(model, \"models/tawos/aloy/model_tawos_aloy_neosp_svr.joblib\")\n",
     "\n",
     "########### NEOSP-LR\n",
     "\n",
     "model = LinearRegression()\n",
     "model.fit(X[X.columns[5:16]], y)\n",
-    "dump(model, \"model_tawos_aloy_neosp_linear.joblib\")\n",
-    "#dump(model, \"models/tawos/aloy/model_tawos_aloy_neosp_linear.joblib\")\n",
     "\n",
     "############ TFIDF-SVM\n",
     "\n",
     "model = svm.SVR()\n",
     "model.fit(X[X.columns[16:]], y)\n",
-    "dump(model, \"model_tawos_aloy_tfidf_svr.joblib\")\n",
-    "#dump(model, \"models/tawos/aloy/model_tawos_aloy_tfidf_svr.joblib\")\n",
     "\n",
     "############ TFIDF-LR\n",
     "\n",
     "model = LinearRegression()\n",
     "model.fit(X[X.columns[16:]], y)\n",
-    "dump(model, \"model_tawos_aloy_tfidf_linear.joblib\")\n",
-    "#dump(model, \"models/tawos/aloy/model_tawos_aloy_tfidf_linear.joblib\")\n"
    ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": []
   }
  ],
  "metadata": {

  "cells": [
   {
    "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
+   "outputs": [],
    "source": [
     "import pandas as pd\n",
+    "project_name = \"ALOY\"\n",
+    "df = pd.read_csv(\"database\\\\tawos\\\\deep\\\\{}_deep-se.csv\".format(project_name))\n",
     "df.info()"
    ]
   },
   },
   {
    "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
   },
   {
    "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
+   "outputs": [],
    "source": [
     "from sklearn.dummy import DummyRegressor\n",
     "from nltk.corpus import stopwords\n",
     "from joblib import dump\n",
     "\n",
     "# carregando os dados\n",
+    "df = pd.read_csv(\"database\\\\tawos\\\\deep\\\\{}_deep-se.csv\".format(project_name))\n",
     "\n",
     "# criação de uma nova coluna\n",
     "df[\"context\"] = df[\"title\"] + df[\"description\"]\n",
     "# Extração das features para o TFIDF\n",
     "vectorizer = TfidfVectorizer()\n",
     "X_vec = vectorizer.fit_transform(df[\"context_\"])\n",
+    "#dump(vectorizer, \"vectorizer_tfidf.joblib\")\n",
+    "dump(vectorizer, \"models/tawos/{}/vectorizer_tawos_{}_tfidf.joblib\".format(project_name, project_name))\n",
     "\n",
     "df_vec = pd.DataFrame(data = X_vec.toarray(), columns = vectorizer.get_feature_names_out())\n",
     "\n",
     "\n",
     "model = DummyRegressor(strategy=\"mean\")\n",
     "model.fit(X, y)\n",
+    "#dump(model, \"model_tawos_aloy_mbr.joblib\")\n",
+    "dump(model, \"models/tawos/{}/model_tawos_{}_mbr.joblib\".format(project_name, project_name))\n",
     "\n",
     "############ Mediana\n",
     "\n",
     "model = DummyRegressor(strategy=\"median\")\n",
     "model.fit(X, y)\n",
+    "#dump(model, \"model_tawos_aloy_median.joblib\")\n",
+    "dump(model, \"models/tawos/{}/model_tawos_{}_median.joblib\".format(project_name, project_name))\n",
     "\n",
     "########### NEOSP-SVR\n",
     "\n",
     "model = svm.SVR()\n",
     "model.fit(X[X.columns[5:16]], y)\n",
+    "#dump(model, \"model_tawos_aloy_neosp_svr.joblib\")\n",
+    "dump(model, \"models/tawos/{}/model_tawos_{}_neosp_svr.joblib\".format(project_name, project_name))\n",
     "\n",
     "########### NEOSP-LR\n",
     "\n",
     "model = LinearRegression()\n",
     "model.fit(X[X.columns[5:16]], y)\n",
+    "#dump(model, \"model_tawos_aloy_neosp_linear.joblib\")\n",
+    "dump(model, \"models/tawos/{}/model_tawos_{}_neosp_linear.joblib\".format(project_name, project_name))\n",
     "\n",
     "############ TFIDF-SVM\n",
     "\n",
     "model = svm.SVR()\n",
     "model.fit(X[X.columns[16:]], y)\n",
+    "#dump(model, \"model_tawos_aloy_tfidf_svr.joblib\")\n",
+    "dump(model, \"models/tawos/{}/model_tawos_{}_tfidf_svr.joblib\".format(project_name, project_name))\n",
     "\n",
     "############ TFIDF-LR\n",
     "\n",
     "model = LinearRegression()\n",
     "model.fit(X[X.columns[16:]], y)\n",
+    "#dump(model, \"model_tawos_aloy_tfidf_linear.joblib\")\n",
+    "dump(model, \"models/tawos/{}/model_tawos_{}_tfidf_linear.joblib\".format(project_name, project_name))\n"
    ]
   }
  ],
  "metadata": {

models/tawos/XD/model_tawos_XD_mbr.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cfd1f37c3a508425a3d21f5dee3ed029754a752a70be1c1a9c2564c96009df98
+size 383

models/tawos/XD/model_tawos_XD_median.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f33655a67e4d587143f615b1604a45fc5cac5b70b0c8e999b47a953a43511e43
+size 383

models/tawos/XD/model_tawos_XD_neosp_linear.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:aab222c9966e14bf870d7e25b3339e6d86b185557cbb66fc0a8330d6206523bd
+size 1280

models/tawos/XD/model_tawos_XD_neosp_svr.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e764a03294eb8f6461ff716305e35b620fb7c51350946832f99954c45a4fcd3b
+size 86524

models/tawos/XD/model_tawos_XD_tfidf_linear.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1dd5c88b409a9a93b4bf754b50569df165c1fca36fd4f53e9f78e50f58d2f493
+size 170304

models/tawos/XD/model_tawos_XD_tfidf_svr.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8868f069e4ae8533bb5bafef7d8fefd064006414413a971a0a828d8316fa88a0
+size 37738316

models/tawos/XD/vectorizer_tawos_XD_tfidf.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d12ff0ea3dddc7319dd2fea9547ecc96fc5bdbc15a263fa124b2929db436e3bb
+size 189024

models/tawos/aloy/{vectorizer_tfidf.joblib → vectorizer_tawos_ALOY_tfidf.joblib} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0d3eac7515bb5f3fb045d49e665f92bae8d26e259c74c4cdf01acded7a2ea410
 size 68159

 version https://git-lfs.github.com/spec/v1
+oid sha256:6416ad0d2dd928218d5bee8a2f5776d5985b8d464611da8e517cb4b78e4f01c7
 size 68159