giseldo committed on
Commit
00114fc
1 Parent(s): d10a474

latest version

create_APSTUD_model.ipynb ADDED
@@ -0,0 +1,262 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "<class 'pandas.core.frame.DataFrame'>\n",
+ "RangeIndex: 476 entries, 0 to 475\n",
+ "Data columns (total 5 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 issuekey 476 non-null object\n",
+ " 1 created 476 non-null object\n",
+ " 2 title 476 non-null object\n",
+ " 3 description 476 non-null object\n",
+ " 4 storypoint 476 non-null int64 \n",
+ "dtypes: int64(1), object(4)\n",
+ "memory usage: 18.7+ KB\n"
+ ]
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "project_name = \"APSTUD\"\n",
+ "\n",
+ "df = pd.read_csv(\"database\\\\tawos\\\\deep\\\\{}_deep-se.csv\".format(project_name))\n",
+ "\n",
+ "df.info()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Pré-Processamento"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import re\n",
+ "from string import punctuation\n",
+ "\n",
+ "def escape_tags_and_content(text):\n",
+ " \"\"\"Escape tags and their content containing text, which is not written in natural language, such as code snippets\"\"\"\n",
+ "\n",
+ " NO_TEXT_TAGS = \"code\", \"noformat\"\n",
+ " for tag in NO_TEXT_TAGS:\n",
+ " regex_matching_tag = re.compile(\"\\{%s(.*?)\\}(.*?)\\{%s\\}\" % (tag, tag), re.DOTALL)\n",
+ " text = re.sub(regex_matching_tag, \"\", text)\n",
+ "\n",
+ " return text\n",
+ "\n",
+ "def escape_tags(text):\n",
+ " \"\"\"Escape markup tags, but retain their content\"\"\"\n",
+ "\n",
+ " ESCAPE_TAGS = \"color\", \"quote\", \"anchor\", \"panel\"\n",
+ " for tag in ESCAPE_TAGS:\n",
+ " text = re.sub(\"\\{%s(.*?)\\}\" % tag, \"\", text)\n",
+ "\n",
+ " return text\n",
+ "\n",
+ "def escape_strings(text):\n",
+ " \"\"\"Escape line breaks, tabulators, slashes and JIRA heading markup symbols\"\"\"\n",
+ "\n",
+ " ESCAPE_STRINGS = \"\\\\r\", \"\\\\n\", \"\\\\t\", \"\\\\f\", \"\\\\v\", \"\\\"\", \"\\\\\\\\\", \"h1. \", \"h2. \", \"h3. \", \"h4. \", \"h5. \", \"h6. \"\n",
+ " for escape_string in ESCAPE_STRINGS:\n",
+ " text = text.replace(escape_string, \" \")\n",
+ "\n",
+ " return text\n",
+ "\n",
+ "def escape_links(text):\n",
+ " \"\"\"Escape external and internal links, recognized by JIRA markup or leading 'http://' or 'https://' \"\"\"\n",
+ "\n",
+ " LINK_STARTERS = r\"\\#\", r\"\\^\", r\"http\\:\\/\\/\", r\"https\\:\\/\\/\", r\"malto\\:\", r\"file\\:\", r\"\\~\"\n",
+ " for link_starter in LINK_STARTERS:\n",
+ " text = re.sub(\"\\[(.*?\\\\|)?%s(.*?)\\]\" % link_starter, \"\", text)\n",
+ " text = re.sub(r\"\\bhttps?://\\S+\", \"\", text)\n",
+ "\n",
+ " return text\n",
+ "\n",
+ "def escape_hex_character_codes(text):\n",
+ " \"\"\"Escape characters outside the latin alphabet which are converted to hex code representation\"\"\"\n",
+ "\n",
+ " return re.sub(r\"\\\\x\\w\\w\", \"\", text)\n",
+ "\n",
+ "def escape_punctuation_boundaries(text):\n",
+ " \"\"\"Remove all punctuation marks from the beginning and end of words,\n",
+ " except for trailing period at the end of words\"\"\"\n",
+ "\n",
+ " return \" \".join([word.strip(punctuation.replace(\".\", \"\")).lstrip(\".\") for word in text.split()])\n",
+ "\n",
+ "def escape_odd_spaces(text):\n",
+ " \"\"\"Replace several consequent spaces with one space\n",
+ " and remove spaces from string start and end\"\"\"\n",
+ "\n",
+ " text = re.sub(r\"\\s+\", \" \", text)\n",
+ " text = text.strip()\n",
+ "\n",
+ " return text"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Criação do Modelo"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['models/tawos/APSTUD/model_tawos_APSTUD_tfidf_linear.joblib']"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from sklearn.dummy import DummyRegressor\n",
+ "from nltk.corpus import stopwords\n",
+ "from textblob import TextBlob\n",
+ "import textstat\n",
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+ "from sklearn import svm\n",
+ "from sklearn.linear_model import LinearRegression\n",
+ "from sklearn.feature_selection import SelectKBest\n",
+ "import pandas as pd\n",
+ "from joblib import dump\n",
+ "\n",
+ "# carregando os dados\n",
+ "df = pd.read_csv(\"database\\\\tawos\\\\deep\\\\{}_deep-se.csv\".format(project_name))\n",
+ "\n",
+ "# criação de uma nova coluna\n",
+ "df[\"context\"] = df[\"title\"] + df[\"description\"]\n",
+ "\n",
+ "# pré-processamento\n",
+ "df[\"context\"] = df[\"context\"].apply(lambda x: escape_tags_and_content(x))\n",
+ "df[\"context\"] = df[\"context\"].apply(lambda x: escape_tags(x))\n",
+ "df[\"context\"] = df[\"context\"].apply(lambda x: escape_strings(x))\n",
+ "df[\"context\"] = df[\"context\"].apply(lambda x: escape_links(x))\n",
+ "df[\"context\"] = df[\"context\"].apply(lambda x: escape_hex_character_codes(x))\n",
+ "df[\"context\"] = df[\"context\"].apply(lambda x: escape_punctuation_boundaries(x))\n",
+ "df[\"context\"] = df[\"context\"].apply(lambda x: escape_odd_spaces(x))\n",
+ "\n",
+ "# removendo stop-words\n",
+ "stop = stopwords.words('english')\n",
+ "df['context'] = df['context'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))\n",
+ "\n",
+ "# renomeando as colunas porque senão dá um problema com a extração de features do NEOSP\n",
+ "df = df.rename(columns={ \"issuekey\": \"issuekey_\", \"created\": \"created_\", \"description\": \"description_\", \"title\": \"title_\", \"context\": \"context_\", \"storypoint\": \"storypoint_\"})\n",
+ "y = df[\"storypoint_\"]\n",
+ "df = df.drop(columns=['storypoint_'])\n",
+ "\n",
+ "# 5º coluna -> extração das features para o neosp\n",
+ "df[\"gunning_fog_\"] = df['context_'].apply(textstat.gunning_fog)\n",
+ "df[\"flesch_reading_ease_\"] = df['context_'].apply(textstat.flesch_reading_ease)\n",
+ "df[\"flesch_kincaid_grade_\"] = df['context_'].apply(textstat.flesch_kincaid_grade)\n",
+ "df[\"smog_index_\"] = df['context_'].apply(textstat.smog_index)\n",
+ "df[\"coleman_liau_index_\"] = df['context_'].apply(textstat.coleman_liau_index)\n",
+ "df[\"automated_readability_index_\"] = df['context_'].apply(textstat.automated_readability_index)\n",
+ "df[\"dale_chall_readability_score_\"] = df['context_'].apply(textstat.dale_chall_readability_score)\n",
+ "df[\"difficult_words_\"] = df['context_'].apply(textstat.difficult_words)\n",
+ "df[\"linsear_write_formula_\"] = df['context_'].apply(textstat.linsear_write_formula)\n",
+ "df[\"polarity_\"] = df[\"context_\"].apply(lambda x: TextBlob(x).sentiment.polarity)\n",
+ "df[\"subjectivity_\"] = df[\"context_\"].apply(lambda x: TextBlob(x).sentiment.subjectivity)\n",
+ "# 16º colunas\n",
+ "\n",
+ "# Extração das features para o TFIDF\n",
+ "vectorizer = TfidfVectorizer()\n",
+ "X_vec = vectorizer.fit_transform(df[\"context_\"])\n",
+ "#dump(vectorizer, \"vectorizer_tfidf.joblib\")\n",
+ "dump(vectorizer, \"models/tawos/{}/vectorizer_tawos_{}_tfidf.joblib\".format(project_name, project_name))\n",
+ "\n",
+ "df_vec = pd.DataFrame(data = X_vec.toarray(), columns = vectorizer.get_feature_names_out())\n",
+ "\n",
+ "# Juntando as features do neosp com o tfidf\n",
+ "df = df.join(df_vec)\n",
+ "X = df\n",
+ "\n",
+ "############ MbR\n",
+ "\n",
+ "model = DummyRegressor(strategy=\"mean\")\n",
+ "model.fit(X, y)\n",
+ "#dump(model, \"model_tawos_aloy_mbr.joblib\")\n",
+ "dump(model, \"models/tawos/{}/model_tawos_{}_mbr.joblib\".format(project_name, project_name))\n",
+ "\n",
+ "############ Mediana\n",
+ "\n",
+ "model = DummyRegressor(strategy=\"median\")\n",
+ "model.fit(X, y)\n",
+ "#dump(model, \"model_tawos_aloy_median.joblib\")\n",
+ "dump(model, \"models/tawos/{}/model_tawos_{}_median.joblib\".format(project_name, project_name))\n",
+ "\n",
+ "########### NEOSP-SVR\n",
+ "\n",
+ "model = svm.SVR()\n",
+ "model.fit(X[X.columns[5:16]], y)\n",
+ "#dump(model, \"model_tawos_aloy_neosp_svr.joblib\")\n",
+ "dump(model, \"models/tawos/{}/model_tawos_{}_neosp_svr.joblib\".format(project_name, project_name))\n",
+ "\n",
+ "########### NEOSP-LR\n",
+ "\n",
+ "model = LinearRegression()\n",
+ "model.fit(X[X.columns[5:16]], y)\n",
+ "#dump(model, \"model_tawos_aloy_neosp_linear.joblib\")\n",
+ "dump(model, \"models/tawos/{}/model_tawos_{}_neosp_linear.joblib\".format(project_name, project_name))\n",
+ "\n",
+ "############ TFIDF-SVM\n",
+ "\n",
+ "model = svm.SVR()\n",
+ "model.fit(X[X.columns[16:]], y)\n",
+ "#dump(model, \"model_tawos_aloy_tfidf_svr.joblib\")\n",
+ "dump(model, \"models/tawos/{}/model_tawos_{}_tfidf_svr.joblib\".format(project_name, project_name))\n",
+ "\n",
+ "############ TFIDF-LR\n",
+ "\n",
+ "model = LinearRegression()\n",
+ "model.fit(X[X.columns[16:]], y)\n",
+ "#dump(model, \"model_tawos_aloy_tfidf_linear.joblib\")\n",
+ "dump(model, \"models/tawos/{}/model_tawos_{}_tfidf_linear.joblib\".format(project_name, project_name))"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.11"
+ },
+ "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }
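
The notebook above serializes the fitted TF-IDF vectorizer and six regressors with joblib. A minimal sketch of how the TF-IDF linear model might later be loaded for prediction (paths taken from the dump calls above; the issue text is hypothetical, and in practice new text would need the same preprocessing and stop-word removal applied at training time):

from joblib import load

project_name = "APSTUD"
vectorizer = load("models/tawos/{}/vectorizer_tawos_{}_tfidf.joblib".format(project_name, project_name))
model = load("models/tawos/{}/model_tawos_{}_tfidf_linear.joblib".format(project_name, project_name))

new_issue = "App crashes when the debugger disconnects"  # hypothetical issue title + description
X_new = vectorizer.transform([new_issue]).toarray()       # TF-IDF features only, matching X.columns[16:] above
print(model.predict(X_new))                                # estimated story points
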
create_CLI_model.ipynb ADDED
@@ -0,0 +1,262 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "<class 'pandas.core.frame.DataFrame'>\n",
+ "RangeIndex: 293 entries, 0 to 292\n",
+ "Data columns (total 5 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 issuekey 293 non-null object\n",
+ " 1 created 293 non-null object\n",
+ " 2 title 293 non-null object\n",
+ " 3 description 293 non-null object\n",
+ " 4 storypoint 293 non-null int64 \n",
+ "dtypes: int64(1), object(4)\n",
+ "memory usage: 11.6+ KB\n"
+ ]
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "project_name = \"CLI\"\n",
+ "\n",
+ "df = pd.read_csv(\"database\\\\tawos\\\\deep\\\\{}_deep-se.csv\".format(project_name))\n",
+ "\n",
+ "df.info()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Pré-Processamento"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import re\n",
+ "from string import punctuation\n",
+ "\n",
+ "def escape_tags_and_content(text):\n",
+ " \"\"\"Escape tags and their content containing text, which is not written in natural language, such as code snippets\"\"\"\n",
+ "\n",
+ " NO_TEXT_TAGS = \"code\", \"noformat\"\n",
+ " for tag in NO_TEXT_TAGS:\n",
+ " regex_matching_tag = re.compile(\"\\{%s(.*?)\\}(.*?)\\{%s\\}\" % (tag, tag), re.DOTALL)\n",
+ " text = re.sub(regex_matching_tag, \"\", text)\n",
+ "\n",
+ " return text\n",
+ "\n",
+ "def escape_tags(text):\n",
+ " \"\"\"Escape markup tags, but retain their content\"\"\"\n",
+ "\n",
+ " ESCAPE_TAGS = \"color\", \"quote\", \"anchor\", \"panel\"\n",
+ " for tag in ESCAPE_TAGS:\n",
+ " text = re.sub(\"\\{%s(.*?)\\}\" % tag, \"\", text)\n",
+ "\n",
+ " return text\n",
+ "\n",
+ "def escape_strings(text):\n",
+ " \"\"\"Escape line breaks, tabulators, slashes and JIRA heading markup symbols\"\"\"\n",
+ "\n",
+ " ESCAPE_STRINGS = \"\\\\r\", \"\\\\n\", \"\\\\t\", \"\\\\f\", \"\\\\v\", \"\\\"\", \"\\\\\\\\\", \"h1. \", \"h2. \", \"h3. \", \"h4. \", \"h5. \", \"h6. \"\n",
+ " for escape_string in ESCAPE_STRINGS:\n",
+ " text = text.replace(escape_string, \" \")\n",
+ "\n",
+ " return text\n",
+ "\n",
+ "def escape_links(text):\n",
+ " \"\"\"Escape external and internal links, recognized by JIRA markup or leading 'http://' or 'https://' \"\"\"\n",
+ "\n",
+ " LINK_STARTERS = r\"\\#\", r\"\\^\", r\"http\\:\\/\\/\", r\"https\\:\\/\\/\", r\"malto\\:\", r\"file\\:\", r\"\\~\"\n",
+ " for link_starter in LINK_STARTERS:\n",
+ " text = re.sub(\"\\[(.*?\\\\|)?%s(.*?)\\]\" % link_starter, \"\", text)\n",
+ " text = re.sub(r\"\\bhttps?://\\S+\", \"\", text)\n",
+ "\n",
+ " return text\n",
+ "\n",
+ "def escape_hex_character_codes(text):\n",
+ " \"\"\"Escape characters outside the latin alphabet which are converted to hex code representation\"\"\"\n",
+ "\n",
+ " return re.sub(r\"\\\\x\\w\\w\", \"\", text)\n",
+ "\n",
+ "def escape_punctuation_boundaries(text):\n",
+ " \"\"\"Remove all punctuation marks from the beginning and end of words,\n",
+ " except for trailing period at the end of words\"\"\"\n",
+ "\n",
+ " return \" \".join([word.strip(punctuation.replace(\".\", \"\")).lstrip(\".\") for word in text.split()])\n",
+ "\n",
+ "def escape_odd_spaces(text):\n",
+ " \"\"\"Replace several consequent spaces with one space\n",
+ " and remove spaces from string start and end\"\"\"\n",
+ "\n",
+ " text = re.sub(r\"\\s+\", \" \", text)\n",
+ " text = text.strip()\n",
+ "\n",
+ " return text"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Criação do Modelo"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['models/tawos/CLI/model_tawos_CLI_tfidf_linear.joblib']"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from sklearn.dummy import DummyRegressor\n",
+ "from nltk.corpus import stopwords\n",
+ "from textblob import TextBlob\n",
+ "import textstat\n",
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+ "from sklearn import svm\n",
+ "from sklearn.linear_model import LinearRegression\n",
+ "from sklearn.feature_selection import SelectKBest\n",
+ "import pandas as pd\n",
+ "from joblib import dump\n",
+ "\n",
+ "# carregando os dados\n",
+ "df = pd.read_csv(\"database\\\\tawos\\\\deep\\\\{}_deep-se.csv\".format(project_name))\n",
+ "\n",
+ "# criação de uma nova coluna\n",
+ "df[\"context\"] = df[\"title\"] + df[\"description\"]\n",
+ "\n",
+ "# pré-processamento\n",
+ "df[\"context\"] = df[\"context\"].apply(lambda x: escape_tags_and_content(x))\n",
+ "df[\"context\"] = df[\"context\"].apply(lambda x: escape_tags(x))\n",
+ "df[\"context\"] = df[\"context\"].apply(lambda x: escape_strings(x))\n",
+ "df[\"context\"] = df[\"context\"].apply(lambda x: escape_links(x))\n",
+ "df[\"context\"] = df[\"context\"].apply(lambda x: escape_hex_character_codes(x))\n",
+ "df[\"context\"] = df[\"context\"].apply(lambda x: escape_punctuation_boundaries(x))\n",
+ "df[\"context\"] = df[\"context\"].apply(lambda x: escape_odd_spaces(x))\n",
+ "\n",
+ "# removendo stop-words\n",
+ "stop = stopwords.words('english')\n",
+ "df['context'] = df['context'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))\n",
+ "\n",
+ "# renomeando as colunas porque senão dá um problema com a extração de features do NEOSP\n",
+ "df = df.rename(columns={ \"issuekey\": \"issuekey_\", \"created\": \"created_\", \"description\": \"description_\", \"title\": \"title_\", \"context\": \"context_\", \"storypoint\": \"storypoint_\"})\n",
+ "y = df[\"storypoint_\"]\n",
+ "df = df.drop(columns=['storypoint_'])\n",
+ "\n",
+ "# 5º coluna -> extração das features para o neosp\n",
+ "df[\"gunning_fog_\"] = df['context_'].apply(textstat.gunning_fog)\n",
+ "df[\"flesch_reading_ease_\"] = df['context_'].apply(textstat.flesch_reading_ease)\n",
+ "df[\"flesch_kincaid_grade_\"] = df['context_'].apply(textstat.flesch_kincaid_grade)\n",
+ "df[\"smog_index_\"] = df['context_'].apply(textstat.smog_index)\n",
+ "df[\"coleman_liau_index_\"] = df['context_'].apply(textstat.coleman_liau_index)\n",
+ "df[\"automated_readability_index_\"] = df['context_'].apply(textstat.automated_readability_index)\n",
+ "df[\"dale_chall_readability_score_\"] = df['context_'].apply(textstat.dale_chall_readability_score)\n",
+ "df[\"difficult_words_\"] = df['context_'].apply(textstat.difficult_words)\n",
+ "df[\"linsear_write_formula_\"] = df['context_'].apply(textstat.linsear_write_formula)\n",
+ "df[\"polarity_\"] = df[\"context_\"].apply(lambda x: TextBlob(x).sentiment.polarity)\n",
+ "df[\"subjectivity_\"] = df[\"context_\"].apply(lambda x: TextBlob(x).sentiment.subjectivity)\n",
+ "# 16º colunas\n",
+ "\n",
+ "# Extração das features para o TFIDF\n",
+ "vectorizer = TfidfVectorizer()\n",
+ "X_vec = vectorizer.fit_transform(df[\"context_\"])\n",
+ "#dump(vectorizer, \"vectorizer_tfidf.joblib\")\n",
+ "dump(vectorizer, \"models/tawos/{}/vectorizer_tawos_{}_tfidf.joblib\".format(project_name, project_name))\n",
+ "\n",
+ "df_vec = pd.DataFrame(data = X_vec.toarray(), columns = vectorizer.get_feature_names_out())\n",
+ "\n",
+ "# Juntando as features do neosp com o tfidf\n",
+ "df = df.join(df_vec)\n",
+ "X = df\n",
+ "\n",
+ "############ MbR\n",
+ "\n",
+ "model = DummyRegressor(strategy=\"mean\")\n",
+ "model.fit(X, y)\n",
+ "#dump(model, \"model_tawos_aloy_mbr.joblib\")\n",
+ "dump(model, \"models/tawos/{}/model_tawos_{}_mbr.joblib\".format(project_name, project_name))\n",
+ "\n",
+ "############ Mediana\n",
+ "\n",
+ "model = DummyRegressor(strategy=\"median\")\n",
+ "model.fit(X, y)\n",
+ "#dump(model, \"model_tawos_aloy_median.joblib\")\n",
+ "dump(model, \"models/tawos/{}/model_tawos_{}_median.joblib\".format(project_name, project_name))\n",
+ "\n",
+ "########### NEOSP-SVR\n",
+ "\n",
+ "model = svm.SVR()\n",
+ "model.fit(X[X.columns[5:16]], y)\n",
+ "#dump(model, \"model_tawos_aloy_neosp_svr.joblib\")\n",
+ "dump(model, \"models/tawos/{}/model_tawos_{}_neosp_svr.joblib\".format(project_name, project_name))\n",
+ "\n",
+ "########### NEOSP-LR\n",
+ "\n",
+ "model = LinearRegression()\n",
+ "model.fit(X[X.columns[5:16]], y)\n",
+ "#dump(model, \"model_tawos_aloy_neosp_linear.joblib\")\n",
+ "dump(model, \"models/tawos/{}/model_tawos_{}_neosp_linear.joblib\".format(project_name, project_name))\n",
+ "\n",
+ "############ TFIDF-SVM\n",
+ "\n",
+ "model = svm.SVR()\n",
+ "model.fit(X[X.columns[16:]], y)\n",
+ "#dump(model, \"model_tawos_aloy_tfidf_svr.joblib\")\n",
+ "dump(model, \"models/tawos/{}/model_tawos_{}_tfidf_svr.joblib\".format(project_name, project_name))\n",
+ "\n",
+ "############ TFIDF-LR\n",
+ "\n",
+ "model = LinearRegression()\n",
+ "model.fit(X[X.columns[16:]], y)\n",
+ "#dump(model, \"model_tawos_aloy_tfidf_linear.joblib\")\n",
+ "dump(model, \"models/tawos/{}/model_tawos_{}_tfidf_linear.joblib\".format(project_name, project_name))"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.11"
+ },
+ "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }
create_TIMOB_model.ipynb CHANGED
@@ -2,9 +2,28 @@
  "cells": [
  {
  "cell_type": "code",
- "execution_count": null,
+ "execution_count": 1,
  "metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "<class 'pandas.core.frame.DataFrame'>\n",
+ "RangeIndex: 3915 entries, 0 to 3914\n",
+ "Data columns (total 5 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 issuekey 3915 non-null object \n",
+ " 1 created 3915 non-null object \n",
+ " 2 title 3915 non-null object \n",
+ " 3 description 3915 non-null object \n",
+ " 4 storypoint 3915 non-null float64\n",
+ "dtypes: float64(1), object(4)\n",
+ "memory usage: 153.1+ KB\n"
+ ]
+ }
+ ],
  "source": [
  "import pandas as pd\n",
  "project_name = \"TIMOB\"\n",
@@ -23,7 +42,7 @@
  },
  {
  "cell_type": "code",
- "execution_count": null,
+ "execution_count": 2,
  "metadata": {},
  "outputs": [],
  "source": [
@@ -98,9 +117,20 @@
  },
  {
  "cell_type": "code",
- "execution_count": null,
+ "execution_count": 3,
  "metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['models/tawos/TIMOB/model_tawos_TIMOB_tfidf_linear.joblib']"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
  "source": [
  "from sklearn.dummy import DummyRegressor\n",
  "from nltk.corpus import stopwords\n",
create_XD_model.ipynb CHANGED
@@ -2,12 +2,31 @@
  "cells": [
  {
  "cell_type": "code",
- "execution_count": null,
+ "execution_count": 2,
  "metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "<class 'pandas.core.frame.DataFrame'>\n",
+ "RangeIndex: 811 entries, 0 to 810\n",
+ "Data columns (total 5 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 issuekey 811 non-null object\n",
+ " 1 created 811 non-null object\n",
+ " 2 title 811 non-null object\n",
+ " 3 description 811 non-null object\n",
+ " 4 storypoint 811 non-null int64 \n",
+ "dtypes: int64(1), object(4)\n",
+ "memory usage: 31.8+ KB\n"
+ ]
+ }
+ ],
  "source": [
  "import pandas as pd\n",
- "project_name = \"TIMOB\"\n",
+ "project_name = \"XD\"\n",
  "\n",
  "df = pd.read_csv(\"database\\\\tawos\\\\deep\\\\{}_deep-se.csv\".format(project_name))\n",
  "\n",
@@ -23,7 +42,7 @@
  },
  {
  "cell_type": "code",
- "execution_count": null,
+ "execution_count": 3,
  "metadata": {},
  "outputs": [],
  "source": [
@@ -98,9 +117,20 @@
  },
  {
  "cell_type": "code",
- "execution_count": null,
+ "execution_count": 4,
  "metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['models/tawos/XD/model_tawos_XD_tfidf_linear.joblib']"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
  "source": [
  "from sklearn.dummy import DummyRegressor\n",
  "from nltk.corpus import stopwords\n",
create_alloy_model.ipynb CHANGED
@@ -2,9 +2,28 @@
  "cells": [
  {
  "cell_type": "code",
- "execution_count": null,
+ "execution_count": 10,
  "metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "<class 'pandas.core.frame.DataFrame'>\n",
+ "RangeIndex: 241 entries, 0 to 240\n",
+ "Data columns (total 5 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 issuekey 241 non-null object\n",
+ " 1 created 241 non-null object\n",
+ " 2 title 241 non-null object\n",
+ " 3 description 241 non-null object\n",
+ " 4 storypoint 241 non-null int64 \n",
+ "dtypes: int64(1), object(4)\n",
+ "memory usage: 9.5+ KB\n"
+ ]
+ }
+ ],
  "source": [
  "import pandas as pd\n",
  "project_name = \"ALOY\"\n",
@@ -21,7 +40,7 @@
  },
  {
  "cell_type": "code",
- "execution_count": null,
+ "execution_count": 11,
  "metadata": {},
  "outputs": [],
  "source": [
@@ -96,9 +115,20 @@
  },
  {
  "cell_type": "code",
- "execution_count": null,
+ "execution_count": 12,
  "metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['models/tawos/ALOY/model_tawos_ALOY_tfidf_linear.joblib']"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
  "source": [
  "from sklearn.dummy import DummyRegressor\n",
  "from nltk.corpus import stopwords\n",
models/tawos/APSTUD/model_tawos_APSTUD_mbr.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7acac58dc585421fbd6a591c8ec452275e9c4e48ae37c5dd82497c0ab35cc6b3
+ size 383
models/tawos/APSTUD/model_tawos_APSTUD_median.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f6571c714502722b037a8acd8cbf088c366257eb1061179d542a657eea7aba33
+ size 383
models/tawos/APSTUD/model_tawos_APSTUD_neosp_linear.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2e13246fcc693b4894adca7b7bf2eec614fc6a3ab96b58860146471b6b458550
+ size 1280
models/tawos/APSTUD/model_tawos_APSTUD_neosp_svr.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:80ca226d62be4cd400503dbbf0705617bb3b9e9694a2059ce7c41bdeb5ab9a84
+ size 48180
models/tawos/APSTUD/model_tawos_APSTUD_tfidf_linear.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b103fa59d6908c65ba7c761c6a316f207b92a043a4ab5b8b45881bae91971c84
+ size 137848
models/tawos/APSTUD/model_tawos_APSTUD_tfidf_svr.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5d7c47cb8057e26871533f974cc1589166ac35ba35b124f961f02de08a07f207
+ size 19491164
models/tawos/APSTUD/vectorizer_tawos_APSTUD_tfidf.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:61778540a81a78a549cf7c03df66b5eef8cc66202e072d083ab57aef64399649
+ size 155196
models/tawos/CLI/model_tawos_CLI_mbr.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:45703b7823252f7922bc9c977d04b9b120b71304bdce17cc28344caa35fabbbe
+ size 383
models/tawos/CLI/model_tawos_CLI_median.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f33655a67e4d587143f615b1604a45fc5cac5b70b0c8e999b47a953a43511e43
+ size 383
models/tawos/CLI/model_tawos_CLI_neosp_linear.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6a4a3292bebabee20e9e36430e6554f3de8da5fb422571484f6a95c227e81576
+ size 1280
models/tawos/CLI/model_tawos_CLI_neosp_svr.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5f0b063bb04d30f576212a7154e6961c919e75595f6e143c42e4c19bc06a0844
+ size 29171
models/tawos/CLI/model_tawos_CLI_tfidf_linear.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dfdc99bda09e610d6e429b394fae12c28898e7e28f4467c097d0e314522240ba
+ size 60608
models/tawos/CLI/model_tawos_CLI_tfidf_svr.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5f4e2eef9d29a97978d4690b2502a7c3f2249d4d7a9bb5a2332875634919268c
+ size 5148515
models/tawos/CLI/vectorizer_tawos_CLI_tfidf.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bcd970eb344f06bd09a584eceb7237b1705c88452b7a1f0ea2e7b434d9c8dbae
+ size 68396
models/tawos/TIMOB/model_tawos_TIMOB_mbr.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:102c44692f2343dba35472d1eb958558c33726394f70a796bf0b8f4aea4f930e
+ size 383
models/tawos/TIMOB/model_tawos_TIMOB_median.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b953e6d331fe351298ed5347ee1248ed7b925103d17579f22e1af38f9969c6a7
+ size 383
models/tawos/TIMOB/model_tawos_TIMOB_neosp_linear.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:46b123d78db5341847c348fcd4c4717b735e7a56b183d99c4fc2df11ea4cfdfc
+ size 1280
models/tawos/TIMOB/model_tawos_TIMOB_neosp_svr.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:37fc0c3bfcad3e6422f9d764331d739bd99d618449abb1e6c2c8c197d41ae1d4
+ size 392692
models/tawos/TIMOB/model_tawos_TIMOB_tfidf_linear.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f4ade1379f8b7b1322fe6ec3c8b6a858d9a7f3201f7ee14f2d795df33801f108
+ size 427664
models/tawos/TIMOB/model_tawos_TIMOB_tfidf_svr.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5d8509d7867bbc0600873a3b423e76541ed9a93131b85e9b518800a8114bb8b6
+ size 447037100
models/tawos/TIMOB/vectorizer_tawos_TIMOB_tfidf.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bb23410643efcc29e0f566115da31efd5958f03fbfaea577569d2a6c29157110
+ size 455625