giseldo committed on
Commit
7012389
1 Parent(s): b682809

ultima versao

Browse files
create_TIMOB_model.ipynb ADDED
@@ -0,0 +1,232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import pandas as pd\n",
10
+ "project_name = \"TIMOB\"\n",
11
+ "\n",
12
+ "df = pd.read_csv(\"database\\\\tawos\\\\deep\\\\{}_deep-se.csv\".format(project_name))\n",
13
+ "\n",
14
+ "df.info()"
15
+ ]
16
+ },
17
+ {
18
+ "cell_type": "markdown",
19
+ "metadata": {},
20
+ "source": [
21
+ "# Pré-Processamento"
22
+ ]
23
+ },
24
+ {
25
+ "cell_type": "code",
26
+ "execution_count": null,
27
+ "metadata": {},
28
+ "outputs": [],
29
+ "source": [
30
+ "import re\n",
31
+ "from string import punctuation\n",
32
+ "\n",
33
+ "def escape_tags_and_content(text):\n",
34
+ " \"\"\"Escape tags and their content containing text, which is not written in natural language, such as code snippets\"\"\"\n",
35
+ "\n",
36
+ " NO_TEXT_TAGS = \"code\", \"noformat\"\n",
37
+ " for tag in NO_TEXT_TAGS:\n",
38
+ " regex_matching_tag = re.compile(\"\\{%s(.*?)\\}(.*?)\\{%s\\}\" % (tag, tag), re.DOTALL)\n",
39
+ " text = re.sub(regex_matching_tag, \"\", text)\n",
40
+ "\n",
41
+ " return text\n",
42
+ "\n",
43
+ "def escape_tags(text):\n",
44
+ " \"\"\"Escape markup tags, but retain their content\"\"\"\n",
45
+ "\n",
46
+ " ESCAPE_TAGS = \"color\", \"quote\", \"anchor\", \"panel\"\n",
47
+ " for tag in ESCAPE_TAGS:\n",
48
+ " text = re.sub(\"\\{%s(.*?)\\}\" % tag, \"\", text)\n",
49
+ "\n",
50
+ " return text\n",
51
+ "\n",
52
+ "def escape_strings(text):\n",
53
+ " \"\"\"Escape line breaks, tabulators, slashes and JIRA heading markup symbols\"\"\"\n",
54
+ "\n",
55
+ " ESCAPE_STRINGS = \"\\\\r\", \"\\\\n\", \"\\\\t\", \"\\\\f\", \"\\\\v\", \"\\\"\", \"\\\\\\\\\", \"h1. \", \"h2. \", \"h3. \", \"h4. \", \"h5. \", \"h6. \"\n",
56
+ " for escape_string in ESCAPE_STRINGS:\n",
57
+ " text = text.replace(escape_string, \" \")\n",
58
+ "\n",
59
+ " return text\n",
60
+ "\n",
61
+ "def escape_links(text):\n",
62
+ " \"\"\"Escape external and internal links, recognized by JIRA markup or leading 'http://' or 'https://' \"\"\"\n",
63
+ "\n",
64
+ " LINK_STARTERS = r\"\\#\", r\"\\^\", r\"http\\:\\/\\/\", r\"https\\:\\/\\/\", r\"mailto\\:\", r\"file\\:\", r\"\\~\"\n",
65
+ " for link_starter in LINK_STARTERS:\n",
66
+ " text = re.sub(\"\\[(.*?\\\\|)?%s(.*?)\\]\" % link_starter, \"\", text)\n",
67
+ " text = re.sub(r\"\\bhttps?://\\S+\", \"\", text)\n",
68
+ "\n",
69
+ " return text\n",
70
+ "\n",
71
+ "def escape_hex_character_codes(text):\n",
72
+ " \"\"\"Escape characters outside the latin alphabet which are converted to hex code representation\"\"\"\n",
73
+ "\n",
74
+ " return re.sub(r\"\\\\x\\w\\w\", \"\", text)\n",
75
+ "\n",
76
+ "def escape_punctuation_boundaries(text):\n",
77
+ " \"\"\"Remove all punctuation marks from the beginning and end of words,\n",
78
+ " except for trailing period at the end of words\"\"\"\n",
79
+ "\n",
80
+ " return \" \".join([word.strip(punctuation.replace(\".\", \"\")).lstrip(\".\") for word in text.split()])\n",
81
+ "\n",
82
+ "def escape_odd_spaces(text):\n",
83
+ " \"\"\"Replace several consequent spaces with one space\n",
84
+ " and remove spaces from string start and end\"\"\"\n",
85
+ "\n",
86
+ " text = re.sub(r\"\\s+\", \" \", text)\n",
87
+ " text = text.strip()\n",
88
+ "\n",
89
+ " return text"
90
+ ]
91
+ },
92
+ {
93
+ "cell_type": "markdown",
94
+ "metadata": {},
95
+ "source": [
96
+ "# Criação do Modelo"
97
+ ]
98
+ },
99
+ {
100
+ "cell_type": "code",
101
+ "execution_count": null,
102
+ "metadata": {},
103
+ "outputs": [],
104
+ "source": [
105
+ "from sklearn.dummy import DummyRegressor\n",
106
+ "from nltk.corpus import stopwords\n",
107
+ "from textblob import TextBlob\n",
108
+ "import textstat\n",
109
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
110
+ "from sklearn import svm\n",
111
+ "from sklearn.linear_model import LinearRegression\n",
112
+ "from sklearn.feature_selection import SelectKBest\n",
113
+ "import pandas as pd\n",
114
+ "from joblib import dump\n",
115
+ "\n",
116
+ "# carregando os dados\n",
117
+ "df = pd.read_csv(\"database\\\\tawos\\\\deep\\\\{}_deep-se.csv\".format(project_name))\n",
118
+ "\n",
119
+ "# criação de uma nova coluna\n",
120
+ "df[\"context\"] = df[\"title\"] + df[\"description\"]\n",
121
+ "\n",
122
+ "# pré-processamento\n",
123
+ "df[\"context\"] = df[\"context\"].apply(lambda x: escape_tags_and_content(x))\n",
124
+ "df[\"context\"] = df[\"context\"].apply(lambda x: escape_tags(x))\n",
125
+ "df[\"context\"] = df[\"context\"].apply(lambda x: escape_strings(x))\n",
126
+ "df[\"context\"] = df[\"context\"].apply(lambda x: escape_links(x))\n",
127
+ "df[\"context\"] = df[\"context\"].apply(lambda x: escape_hex_character_codes(x))\n",
128
+ "df[\"context\"] = df[\"context\"].apply(lambda x: escape_punctuation_boundaries(x))\n",
129
+ "df[\"context\"] = df[\"context\"].apply(lambda x: escape_odd_spaces(x))\n",
130
+ "\n",
131
+ "# removendo stop-words\n",
132
+ "stop = stopwords.words('english')\n",
133
+ "df['context'] = df['context'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))\n",
134
+ "\n",
135
+ "# renomeando as colunas porque senão dá um problema com a extração de features do NEOSP\n",
136
+ "df = df.rename(columns={ \"issuekey\": \"issuekey_\", \"created\": \"created_\", \"description\": \"description_\", \"title\": \"title_\", \"context\": \"context_\", \"storypoint\": \"storypoint_\"})\n",
137
+ "y = df[\"storypoint_\"]\n",
138
+ "df = df.drop(columns=['storypoint_'])\n",
139
+ "\n",
140
+ "# 5º coluna -> extração das features para o neosp\n",
141
+ "df[\"gunning_fog_\"] = df['context_'].apply(textstat.gunning_fog)\n",
142
+ "df[\"flesch_reading_ease_\"] = df['context_'].apply(textstat.flesch_reading_ease)\n",
143
+ "df[\"flesch_kincaid_grade_\"] = df['context_'].apply(textstat.flesch_kincaid_grade)\n",
144
+ "df[\"smog_index_\"] = df['context_'].apply(textstat.smog_index)\n",
145
+ "df[\"coleman_liau_index_\"] = df['context_'].apply(textstat.coleman_liau_index)\n",
146
+ "df[\"automated_readability_index_\"] = df['context_'].apply(textstat.automated_readability_index)\n",
147
+ "df[\"dale_chall_readability_score_\"] = df['context_'].apply(textstat.dale_chall_readability_score)\n",
148
+ "df[\"difficult_words_\"] = df['context_'].apply(textstat.difficult_words)\n",
149
+ "df[\"linsear_write_formula_\"] = df['context_'].apply(textstat.linsear_write_formula)\n",
150
+ "df[\"polarity_\"] = df[\"context_\"].apply(lambda x: TextBlob(x).sentiment.polarity)\n",
151
+ "df[\"subjectivity_\"] = df[\"context_\"].apply(lambda x: TextBlob(x).sentiment.subjectivity)\n",
152
+ "# 16º colunas\n",
153
+ "\n",
154
+ "# Extração das features para o TFIDF\n",
155
+ "vectorizer = TfidfVectorizer()\n",
156
+ "X_vec = vectorizer.fit_transform(df[\"context_\"])\n",
157
+ "#dump(vectorizer, \"vectorizer_tfidf.joblib\")\n",
158
+ "dump(vectorizer, \"models/tawos/{}/vectorizer_tawos_{}_tfidf.joblib\".format(project_name, project_name))\n",
159
+ "\n",
160
+ "df_vec = pd.DataFrame(data = X_vec.toarray(), columns = vectorizer.get_feature_names_out())\n",
161
+ "\n",
162
+ "# Juntando as features do neosp com o tfidf\n",
163
+ "df = df.join(df_vec)\n",
164
+ "X = df\n",
165
+ "\n",
166
+ "############ MbR\n",
167
+ "\n",
168
+ "model = DummyRegressor(strategy=\"mean\")\n",
169
+ "model.fit(X, y)\n",
170
+ "#dump(model, \"model_tawos_aloy_mbr.joblib\")\n",
171
+ "dump(model, \"models/tawos/{}/model_tawos_{}_mbr.joblib\".format(project_name, project_name))\n",
172
+ "\n",
173
+ "############ Mediana\n",
174
+ "\n",
175
+ "model = DummyRegressor(strategy=\"median\")\n",
176
+ "model.fit(X, y)\n",
177
+ "#dump(model, \"model_tawos_aloy_median.joblib\")\n",
178
+ "dump(model, \"models/tawos/{}/model_tawos_{}_median.joblib\".format(project_name, project_name))\n",
179
+ "\n",
180
+ "########### NEOSP-SVR\n",
181
+ "\n",
182
+ "model = svm.SVR()\n",
183
+ "model.fit(X[X.columns[5:16]], y)\n",
184
+ "#dump(model, \"model_tawos_aloy_neosp_svr.joblib\")\n",
185
+ "dump(model, \"models/tawos/{}/model_tawos_{}_neosp_svr.joblib\".format(project_name, project_name))\n",
186
+ "\n",
187
+ "########### NEOSP-LR\n",
188
+ "\n",
189
+ "model = LinearRegression()\n",
190
+ "model.fit(X[X.columns[5:16]], y)\n",
191
+ "#dump(model, \"model_tawos_aloy_neosp_linear.joblib\")\n",
192
+ "dump(model, \"models/tawos/{}/model_tawos_{}_neosp_linear.joblib\".format(project_name, project_name))\n",
193
+ "\n",
194
+ "############ TFIDF-SVM\n",
195
+ "\n",
196
+ "model = svm.SVR()\n",
197
+ "model.fit(X[X.columns[16:]], y)\n",
198
+ "#dump(model, \"model_tawos_aloy_tfidf_svr.joblib\")\n",
199
+ "dump(model, \"models/tawos/{}/model_tawos_{}_tfidf_svr.joblib\".format(project_name, project_name))\n",
200
+ "\n",
201
+ "############ TFIDF-LR\n",
202
+ "\n",
203
+ "model = LinearRegression()\n",
204
+ "model.fit(X[X.columns[16:]], y)\n",
205
+ "#dump(model, \"model_tawos_aloy_tfidf_linear.joblib\")\n",
206
+ "dump(model, \"models/tawos/{}/model_tawos_{}_tfidf_linear.joblib\".format(project_name, project_name))"
207
+ ]
208
+ }
209
+ ],
210
+ "metadata": {
211
+ "kernelspec": {
212
+ "display_name": "Python 3",
213
+ "language": "python",
214
+ "name": "python3"
215
+ },
216
+ "language_info": {
217
+ "codemirror_mode": {
218
+ "name": "ipython",
219
+ "version": 3
220
+ },
221
+ "file_extension": ".py",
222
+ "mimetype": "text/x-python",
223
+ "name": "python",
224
+ "nbconvert_exporter": "python",
225
+ "pygments_lexer": "ipython3",
226
+ "version": "3.10.11"
227
+ },
228
+ "orig_nbformat": 4
229
+ },
230
+ "nbformat": 4,
231
+ "nbformat_minor": 2
232
+ }
create_XD_model.ipynb ADDED
@@ -0,0 +1,232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import pandas as pd\n",
10
+ "project_name = \"XD\"\n",
11
+ "\n",
12
+ "df = pd.read_csv(\"database\\\\tawos\\\\deep\\\\{}_deep-se.csv\".format(project_name))\n",
13
+ "\n",
14
+ "df.info()"
15
+ ]
16
+ },
17
+ {
18
+ "cell_type": "markdown",
19
+ "metadata": {},
20
+ "source": [
21
+ "# Pré-Processamento"
22
+ ]
23
+ },
24
+ {
25
+ "cell_type": "code",
26
+ "execution_count": null,
27
+ "metadata": {},
28
+ "outputs": [],
29
+ "source": [
30
+ "import re\n",
31
+ "from string import punctuation\n",
32
+ "\n",
33
+ "def escape_tags_and_content(text):\n",
34
+ " \"\"\"Escape tags and their content containing text, which is not written in natural language, such as code snippets\"\"\"\n",
35
+ "\n",
36
+ " NO_TEXT_TAGS = \"code\", \"noformat\"\n",
37
+ " for tag in NO_TEXT_TAGS:\n",
38
+ " regex_matching_tag = re.compile(\"\\{%s(.*?)\\}(.*?)\\{%s\\}\" % (tag, tag), re.DOTALL)\n",
39
+ " text = re.sub(regex_matching_tag, \"\", text)\n",
40
+ "\n",
41
+ " return text\n",
42
+ "\n",
43
+ "def escape_tags(text):\n",
44
+ " \"\"\"Escape markup tags, but retain their content\"\"\"\n",
45
+ "\n",
46
+ " ESCAPE_TAGS = \"color\", \"quote\", \"anchor\", \"panel\"\n",
47
+ " for tag in ESCAPE_TAGS:\n",
48
+ " text = re.sub(\"\\{%s(.*?)\\}\" % tag, \"\", text)\n",
49
+ "\n",
50
+ " return text\n",
51
+ "\n",
52
+ "def escape_strings(text):\n",
53
+ " \"\"\"Escape line breaks, tabulators, slashes and JIRA heading markup symbols\"\"\"\n",
54
+ "\n",
55
+ " ESCAPE_STRINGS = \"\\\\r\", \"\\\\n\", \"\\\\t\", \"\\\\f\", \"\\\\v\", \"\\\"\", \"\\\\\\\\\", \"h1. \", \"h2. \", \"h3. \", \"h4. \", \"h5. \", \"h6. \"\n",
56
+ " for escape_string in ESCAPE_STRINGS:\n",
57
+ " text = text.replace(escape_string, \" \")\n",
58
+ "\n",
59
+ " return text\n",
60
+ "\n",
61
+ "def escape_links(text):\n",
62
+ " \"\"\"Escape external and internal links, recognized by JIRA markup or leading 'http://' or 'https://' \"\"\"\n",
63
+ "\n",
64
+ " LINK_STARTERS = r\"\\#\", r\"\\^\", r\"http\\:\\/\\/\", r\"https\\:\\/\\/\", r\"mailto\\:\", r\"file\\:\", r\"\\~\"\n",
65
+ " for link_starter in LINK_STARTERS:\n",
66
+ " text = re.sub(\"\\[(.*?\\\\|)?%s(.*?)\\]\" % link_starter, \"\", text)\n",
67
+ " text = re.sub(r\"\\bhttps?://\\S+\", \"\", text)\n",
68
+ "\n",
69
+ " return text\n",
70
+ "\n",
71
+ "def escape_hex_character_codes(text):\n",
72
+ " \"\"\"Escape characters outside the latin alphabet which are converted to hex code representation\"\"\"\n",
73
+ "\n",
74
+ " return re.sub(r\"\\\\x\\w\\w\", \"\", text)\n",
75
+ "\n",
76
+ "def escape_punctuation_boundaries(text):\n",
77
+ " \"\"\"Remove all punctuation marks from the beginning and end of words,\n",
78
+ " except for trailing period at the end of words\"\"\"\n",
79
+ "\n",
80
+ " return \" \".join([word.strip(punctuation.replace(\".\", \"\")).lstrip(\".\") for word in text.split()])\n",
81
+ "\n",
82
+ "def escape_odd_spaces(text):\n",
83
+ " \"\"\"Replace several consequent spaces with one space\n",
84
+ " and remove spaces from string start and end\"\"\"\n",
85
+ "\n",
86
+ " text = re.sub(r\"\\s+\", \" \", text)\n",
87
+ " text = text.strip()\n",
88
+ "\n",
89
+ " return text"
90
+ ]
91
+ },
92
+ {
93
+ "cell_type": "markdown",
94
+ "metadata": {},
95
+ "source": [
96
+ "# Criação do Modelo"
97
+ ]
98
+ },
99
+ {
100
+ "cell_type": "code",
101
+ "execution_count": null,
102
+ "metadata": {},
103
+ "outputs": [],
104
+ "source": [
105
+ "from sklearn.dummy import DummyRegressor\n",
106
+ "from nltk.corpus import stopwords\n",
107
+ "from textblob import TextBlob\n",
108
+ "import textstat\n",
109
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
110
+ "from sklearn import svm\n",
111
+ "from sklearn.linear_model import LinearRegression\n",
112
+ "from sklearn.feature_selection import SelectKBest\n",
113
+ "import pandas as pd\n",
114
+ "from joblib import dump\n",
115
+ "\n",
116
+ "# carregando os dados\n",
117
+ "df = pd.read_csv(\"database\\\\tawos\\\\deep\\\\{}_deep-se.csv\".format(project_name))\n",
118
+ "\n",
119
+ "# criação de uma nova coluna\n",
120
+ "df[\"context\"] = df[\"title\"] + df[\"description\"]\n",
121
+ "\n",
122
+ "# pré-processamento\n",
123
+ "df[\"context\"] = df[\"context\"].apply(lambda x: escape_tags_and_content(x))\n",
124
+ "df[\"context\"] = df[\"context\"].apply(lambda x: escape_tags(x))\n",
125
+ "df[\"context\"] = df[\"context\"].apply(lambda x: escape_strings(x))\n",
126
+ "df[\"context\"] = df[\"context\"].apply(lambda x: escape_links(x))\n",
127
+ "df[\"context\"] = df[\"context\"].apply(lambda x: escape_hex_character_codes(x))\n",
128
+ "df[\"context\"] = df[\"context\"].apply(lambda x: escape_punctuation_boundaries(x))\n",
129
+ "df[\"context\"] = df[\"context\"].apply(lambda x: escape_odd_spaces(x))\n",
130
+ "\n",
131
+ "# removendo stop-words\n",
132
+ "stop = stopwords.words('english')\n",
133
+ "df['context'] = df['context'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))\n",
134
+ "\n",
135
+ "# renomeando as colunas porque senão dá um problema com a extração de features do NEOSP\n",
136
+ "df = df.rename(columns={ \"issuekey\": \"issuekey_\", \"created\": \"created_\", \"description\": \"description_\", \"title\": \"title_\", \"context\": \"context_\", \"storypoint\": \"storypoint_\"})\n",
137
+ "y = df[\"storypoint_\"]\n",
138
+ "df = df.drop(columns=['storypoint_'])\n",
139
+ "\n",
140
+ "# 5º coluna -> extração das features para o neosp\n",
141
+ "df[\"gunning_fog_\"] = df['context_'].apply(textstat.gunning_fog)\n",
142
+ "df[\"flesch_reading_ease_\"] = df['context_'].apply(textstat.flesch_reading_ease)\n",
143
+ "df[\"flesch_kincaid_grade_\"] = df['context_'].apply(textstat.flesch_kincaid_grade)\n",
144
+ "df[\"smog_index_\"] = df['context_'].apply(textstat.smog_index)\n",
145
+ "df[\"coleman_liau_index_\"] = df['context_'].apply(textstat.coleman_liau_index)\n",
146
+ "df[\"automated_readability_index_\"] = df['context_'].apply(textstat.automated_readability_index)\n",
147
+ "df[\"dale_chall_readability_score_\"] = df['context_'].apply(textstat.dale_chall_readability_score)\n",
148
+ "df[\"difficult_words_\"] = df['context_'].apply(textstat.difficult_words)\n",
149
+ "df[\"linsear_write_formula_\"] = df['context_'].apply(textstat.linsear_write_formula)\n",
150
+ "df[\"polarity_\"] = df[\"context_\"].apply(lambda x: TextBlob(x).sentiment.polarity)\n",
151
+ "df[\"subjectivity_\"] = df[\"context_\"].apply(lambda x: TextBlob(x).sentiment.subjectivity)\n",
152
+ "# 16º colunas\n",
153
+ "\n",
154
+ "# Extração das features para o TFIDF\n",
155
+ "vectorizer = TfidfVectorizer()\n",
156
+ "X_vec = vectorizer.fit_transform(df[\"context_\"])\n",
157
+ "#dump(vectorizer, \"vectorizer_tfidf.joblib\")\n",
158
+ "dump(vectorizer, \"models/tawos/{}/vectorizer_tawos_{}_tfidf.joblib\".format(project_name, project_name))\n",
159
+ "\n",
160
+ "df_vec = pd.DataFrame(data = X_vec.toarray(), columns = vectorizer.get_feature_names_out())\n",
161
+ "\n",
162
+ "# Juntando as features do neosp com o tfidf\n",
163
+ "df = df.join(df_vec)\n",
164
+ "X = df\n",
165
+ "\n",
166
+ "############ MbR\n",
167
+ "\n",
168
+ "model = DummyRegressor(strategy=\"mean\")\n",
169
+ "model.fit(X, y)\n",
170
+ "#dump(model, \"model_tawos_aloy_mbr.joblib\")\n",
171
+ "dump(model, \"models/tawos/{}/model_tawos_{}_mbr.joblib\".format(project_name, project_name))\n",
172
+ "\n",
173
+ "############ Mediana\n",
174
+ "\n",
175
+ "model = DummyRegressor(strategy=\"median\")\n",
176
+ "model.fit(X, y)\n",
177
+ "#dump(model, \"model_tawos_aloy_median.joblib\")\n",
178
+ "dump(model, \"models/tawos/{}/model_tawos_{}_median.joblib\".format(project_name, project_name))\n",
179
+ "\n",
180
+ "########### NEOSP-SVR\n",
181
+ "\n",
182
+ "model = svm.SVR()\n",
183
+ "model.fit(X[X.columns[5:16]], y)\n",
184
+ "#dump(model, \"model_tawos_aloy_neosp_svr.joblib\")\n",
185
+ "dump(model, \"models/tawos/{}/model_tawos_{}_neosp_svr.joblib\".format(project_name, project_name))\n",
186
+ "\n",
187
+ "########### NEOSP-LR\n",
188
+ "\n",
189
+ "model = LinearRegression()\n",
190
+ "model.fit(X[X.columns[5:16]], y)\n",
191
+ "#dump(model, \"model_tawos_aloy_neosp_linear.joblib\")\n",
192
+ "dump(model, \"models/tawos/{}/model_tawos_{}_neosp_linear.joblib\".format(project_name, project_name))\n",
193
+ "\n",
194
+ "############ TFIDF-SVM\n",
195
+ "\n",
196
+ "model = svm.SVR()\n",
197
+ "model.fit(X[X.columns[16:]], y)\n",
198
+ "#dump(model, \"model_tawos_aloy_tfidf_svr.joblib\")\n",
199
+ "dump(model, \"models/tawos/{}/model_tawos_{}_tfidf_svr.joblib\".format(project_name, project_name))\n",
200
+ "\n",
201
+ "############ TFIDF-LR\n",
202
+ "\n",
203
+ "model = LinearRegression()\n",
204
+ "model.fit(X[X.columns[16:]], y)\n",
205
+ "#dump(model, \"model_tawos_aloy_tfidf_linear.joblib\")\n",
206
+ "dump(model, \"models/tawos/{}/model_tawos_{}_tfidf_linear.joblib\".format(project_name, project_name))"
207
+ ]
208
+ }
209
+ ],
210
+ "metadata": {
211
+ "kernelspec": {
212
+ "display_name": "Python 3",
213
+ "language": "python",
214
+ "name": "python3"
215
+ },
216
+ "language_info": {
217
+ "codemirror_mode": {
218
+ "name": "ipython",
219
+ "version": 3
220
+ },
221
+ "file_extension": ".py",
222
+ "mimetype": "text/x-python",
223
+ "name": "python",
224
+ "nbconvert_exporter": "python",
225
+ "pygments_lexer": "ipython3",
226
+ "version": "3.10.11"
227
+ },
228
+ "orig_nbformat": 4
229
+ },
230
+ "nbformat": 4,
231
+ "nbformat_minor": 2
232
+ }
create_alloy_model.ipynb CHANGED
@@ -2,31 +2,13 @@
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
- "execution_count": 4,
6
  "metadata": {},
7
- "outputs": [
8
- {
9
- "name": "stdout",
10
- "output_type": "stream",
11
- "text": [
12
- "<class 'pandas.core.frame.DataFrame'>\n",
13
- "RangeIndex: 241 entries, 0 to 240\n",
14
- "Data columns (total 5 columns):\n",
15
- " # Column Non-Null Count Dtype \n",
16
- "--- ------ -------------- ----- \n",
17
- " 0 issuekey 241 non-null object\n",
18
- " 1 created 241 non-null object\n",
19
- " 2 title 241 non-null object\n",
20
- " 3 description 241 non-null object\n",
21
- " 4 storypoint 241 non-null int64 \n",
22
- "dtypes: int64(1), object(4)\n",
23
- "memory usage: 9.5+ KB\n"
24
- ]
25
- }
26
- ],
27
  "source": [
28
  "import pandas as pd\n",
29
- "df = pd.read_csv(\"database\\\\tawos\\\\deep\\\\ALOY_deep-se.csv\")\n",
 
30
  "df.info()"
31
  ]
32
  },
@@ -39,7 +21,7 @@
39
  },
40
  {
41
  "cell_type": "code",
42
- "execution_count": 5,
43
  "metadata": {},
44
  "outputs": [],
45
  "source": [
@@ -114,20 +96,9 @@
114
  },
115
  {
116
  "cell_type": "code",
117
- "execution_count": 6,
118
  "metadata": {},
119
- "outputs": [
120
- {
121
- "data": {
122
- "text/plain": [
123
- "['model_tawos_aloy_tfidf_linear.joblib']"
124
- ]
125
- },
126
- "execution_count": 6,
127
- "metadata": {},
128
- "output_type": "execute_result"
129
- }
130
- ],
131
  "source": [
132
  "from sklearn.dummy import DummyRegressor\n",
133
  "from nltk.corpus import stopwords\n",
@@ -141,7 +112,7 @@
141
  "from joblib import dump\n",
142
  "\n",
143
  "# carregando os dados\n",
144
- "df = pd.read_csv(\"database\\\\tawos\\\\deep\\\\ALOY_deep-se.csv\")\n",
145
  "\n",
146
  "# criação de uma nova coluna\n",
147
  "df[\"context\"] = df[\"title\"] + df[\"description\"]\n",
@@ -181,8 +152,8 @@
181
  "# Extração das features para o TFIDF\n",
182
  "vectorizer = TfidfVectorizer()\n",
183
  "X_vec = vectorizer.fit_transform(df[\"context_\"])\n",
184
- "dump(vectorizer, \"vectorizer_tfidf.joblib\")\n",
185
- "#dump(vectorizer, \"models/tawos/aloy/vectorizer_tfidf.joblib\")\n",
186
  "\n",
187
  "df_vec = pd.DataFrame(data = X_vec.toarray(), columns = vectorizer.get_feature_names_out())\n",
188
  "\n",
@@ -194,49 +165,44 @@
194
  "\n",
195
  "model = DummyRegressor(strategy=\"mean\")\n",
196
  "model.fit(X, y)\n",
197
- "dump(model, \"model_tawos_aloy_mbr.joblib\")\n",
198
- "#dump(model, \"models/tawos/aloy/model_tawos_aloy_mbr.joblib\")\n",
199
  "\n",
200
  "############ Mediana\n",
201
  "\n",
202
  "model = DummyRegressor(strategy=\"median\")\n",
203
  "model.fit(X, y)\n",
204
- "dump(model, \"model_tawos_aloy_median.joblib\")\n",
205
- "#dump(model, \"models/tawos/aloy/model_tawos_aloy_median.joblib\")\n",
206
  "\n",
207
  "########### NEOSP-SVR\n",
208
  "\n",
209
  "model = svm.SVR()\n",
210
  "model.fit(X[X.columns[5:16]], y)\n",
211
- "dump(model, \"model_tawos_aloy_neosp_svr.joblib\")\n",
212
- "#dump(model, \"models/tawos/aloy/model_tawos_aloy_neosp_svr.joblib\")\n",
213
  "\n",
214
  "########### NEOSP-LR\n",
215
  "\n",
216
  "model = LinearRegression()\n",
217
  "model.fit(X[X.columns[5:16]], y)\n",
218
- "dump(model, \"model_tawos_aloy_neosp_linear.joblib\")\n",
219
- "#dump(model, \"models/tawos/aloy/model_tawos_aloy_neosp_linear.joblib\")\n",
220
  "\n",
221
  "############ TFIDF-SVM\n",
222
  "\n",
223
  "model = svm.SVR()\n",
224
  "model.fit(X[X.columns[16:]], y)\n",
225
- "dump(model, \"model_tawos_aloy_tfidf_svr.joblib\")\n",
226
- "#dump(model, \"models/tawos/aloy/model_tawos_aloy_tfidf_svr.joblib\")\n",
227
  "\n",
228
  "############ TFIDF-LR\n",
229
  "\n",
230
  "model = LinearRegression()\n",
231
  "model.fit(X[X.columns[16:]], y)\n",
232
- "dump(model, \"model_tawos_aloy_tfidf_linear.joblib\")\n",
233
- "#dump(model, \"models/tawos/aloy/model_tawos_aloy_tfidf_linear.joblib\")\n"
234
  ]
235
- },
236
- {
237
- "cell_type": "markdown",
238
- "metadata": {},
239
- "source": []
240
  }
241
  ],
242
  "metadata": {
 
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
+ "execution_count": null,
6
  "metadata": {},
7
+ "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  "source": [
9
  "import pandas as pd\n",
10
+ "project_name = \"ALOY\"\n",
11
+ "df = pd.read_csv(\"database\\\\tawos\\\\deep\\\\{}_deep-se.csv\".format(project_name))\n",
12
  "df.info()"
13
  ]
14
  },
 
21
  },
22
  {
23
  "cell_type": "code",
24
+ "execution_count": null,
25
  "metadata": {},
26
  "outputs": [],
27
  "source": [
 
96
  },
97
  {
98
  "cell_type": "code",
99
+ "execution_count": null,
100
  "metadata": {},
101
+ "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
102
  "source": [
103
  "from sklearn.dummy import DummyRegressor\n",
104
  "from nltk.corpus import stopwords\n",
 
112
  "from joblib import dump\n",
113
  "\n",
114
  "# carregando os dados\n",
115
+ "df = pd.read_csv(\"database\\\\tawos\\\\deep\\\\{}_deep-se.csv\".format(project_name))\n",
116
  "\n",
117
  "# criação de uma nova coluna\n",
118
  "df[\"context\"] = df[\"title\"] + df[\"description\"]\n",
 
152
  "# Extração das features para o TFIDF\n",
153
  "vectorizer = TfidfVectorizer()\n",
154
  "X_vec = vectorizer.fit_transform(df[\"context_\"])\n",
155
+ "#dump(vectorizer, \"vectorizer_tfidf.joblib\")\n",
156
+ "dump(vectorizer, \"models/tawos/{}/vectorizer_tawos_{}_tfidf.joblib\".format(project_name, project_name))\n",
157
  "\n",
158
  "df_vec = pd.DataFrame(data = X_vec.toarray(), columns = vectorizer.get_feature_names_out())\n",
159
  "\n",
 
165
  "\n",
166
  "model = DummyRegressor(strategy=\"mean\")\n",
167
  "model.fit(X, y)\n",
168
+ "#dump(model, \"model_tawos_aloy_mbr.joblib\")\n",
169
+ "dump(model, \"models/tawos/{}/model_tawos_{}_mbr.joblib\".format(project_name, project_name))\n",
170
  "\n",
171
  "############ Mediana\n",
172
  "\n",
173
  "model = DummyRegressor(strategy=\"median\")\n",
174
  "model.fit(X, y)\n",
175
+ "#dump(model, \"model_tawos_aloy_median.joblib\")\n",
176
+ "dump(model, \"models/tawos/{}/model_tawos_{}_median.joblib\".format(project_name, project_name))\n",
177
  "\n",
178
  "########### NEOSP-SVR\n",
179
  "\n",
180
  "model = svm.SVR()\n",
181
  "model.fit(X[X.columns[5:16]], y)\n",
182
+ "#dump(model, \"model_tawos_aloy_neosp_svr.joblib\")\n",
183
+ "dump(model, \"models/tawos/{}/model_tawos_{}_neosp_svr.joblib\".format(project_name, project_name))\n",
184
  "\n",
185
  "########### NEOSP-LR\n",
186
  "\n",
187
  "model = LinearRegression()\n",
188
  "model.fit(X[X.columns[5:16]], y)\n",
189
+ "#dump(model, \"model_tawos_aloy_neosp_linear.joblib\")\n",
190
+ "dump(model, \"models/tawos/{}/model_tawos_{}_neosp_linear.joblib\".format(project_name, project_name))\n",
191
  "\n",
192
  "############ TFIDF-SVM\n",
193
  "\n",
194
  "model = svm.SVR()\n",
195
  "model.fit(X[X.columns[16:]], y)\n",
196
+ "#dump(model, \"model_tawos_aloy_tfidf_svr.joblib\")\n",
197
+ "dump(model, \"models/tawos/{}/model_tawos_{}_tfidf_svr.joblib\".format(project_name, project_name))\n",
198
  "\n",
199
  "############ TFIDF-LR\n",
200
  "\n",
201
  "model = LinearRegression()\n",
202
  "model.fit(X[X.columns[16:]], y)\n",
203
+ "#dump(model, \"model_tawos_aloy_tfidf_linear.joblib\")\n",
204
+ "dump(model, \"models/tawos/{}/model_tawos_{}_tfidf_linear.joblib\".format(project_name, project_name))\n"
205
  ]
 
 
 
 
 
206
  }
207
  ],
208
  "metadata": {
models/tawos/XD/model_tawos_XD_mbr.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cfd1f37c3a508425a3d21f5dee3ed029754a752a70be1c1a9c2564c96009df98
3
+ size 383
models/tawos/XD/model_tawos_XD_median.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f33655a67e4d587143f615b1604a45fc5cac5b70b0c8e999b47a953a43511e43
3
+ size 383
models/tawos/XD/model_tawos_XD_neosp_linear.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aab222c9966e14bf870d7e25b3339e6d86b185557cbb66fc0a8330d6206523bd
3
+ size 1280
models/tawos/XD/model_tawos_XD_neosp_svr.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e764a03294eb8f6461ff716305e35b620fb7c51350946832f99954c45a4fcd3b
3
+ size 86524
models/tawos/XD/model_tawos_XD_tfidf_linear.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1dd5c88b409a9a93b4bf754b50569df165c1fca36fd4f53e9f78e50f58d2f493
3
+ size 170304
models/tawos/XD/model_tawos_XD_tfidf_svr.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8868f069e4ae8533bb5bafef7d8fefd064006414413a971a0a828d8316fa88a0
3
+ size 37738316
models/tawos/XD/vectorizer_tawos_XD_tfidf.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d12ff0ea3dddc7319dd2fea9547ecc96fc5bdbc15a263fa124b2929db436e3bb
3
+ size 189024
models/tawos/aloy/{vectorizer_tfidf.joblib → vectorizer_tawos_ALOY_tfidf.joblib} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0d3eac7515bb5f3fb045d49e665f92bae8d26e259c74c4cdf01acded7a2ea410
3
  size 68159
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6416ad0d2dd928218d5bee8a2f5776d5985b8d464611da8e517cb4b78e4f01c7
3
  size 68159