giseldo commited on
Commit
0785047
1 Parent(s): 7a2451d

ultima versao

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. create_alloy_model.ipynb +256 -0
  2. database/tawos/README.md +48 -0
  3. database/tawos/deep/ALOY_deep-se.csv +0 -0
  4. database/tawos/deep/APSTUD_deep-se.csv +0 -0
  5. database/tawos/deep/CLI_deep-se.csv +0 -0
  6. database/tawos/deep/CLOV_deep-se.csv +0 -0
  7. database/tawos/deep/COMPASS_deep-se.csv +0 -0
  8. database/tawos/deep/CONFCLOUD_deep-se.csv +0 -0
  9. database/tawos/deep/CONFSERVER_deep-se.csv +0 -0
  10. database/tawos/deep/DAEMON_deep-se.csv +0 -0
  11. database/tawos/deep/DM_deep-se.csv +0 -0
  12. database/tawos/deep/DNN_deep-se.csv +0 -0
  13. database/tawos/deep/DURACLOUD_deep-se.csv +0 -0
  14. database/tawos/deep/EVG_deep-se.csv +0 -0
  15. database/tawos/deep/FAB_deep-se.csv +0 -0
  16. database/tawos/deep/MDL_deep-se.csv +0 -0
  17. database/tawos/deep/MESOS_deep-se.csv +0 -0
  18. database/tawos/deep/MULE_deep-se.csv +0 -0
  19. database/tawos/deep/NEXUS_deep-se.csv +0 -0
  20. database/tawos/deep/SERVER_deep-se.csv +0 -0
  21. database/tawos/deep/STL_deep-se.csv +0 -0
  22. database/tawos/deep/TIDOC_deep-se.csv +0 -0
  23. database/tawos/deep/TIMOB_deep-se.csv +0 -0
  24. database/tawos/deep/TISTUD_deep-se.csv +0 -0
  25. database/tawos/deep/XD_deep-se.csv +0 -0
  26. database/tawos/tfidf/ALOY_tfidf-se.csv +0 -0
  27. database/tawos/tfidf/APSTUD_tfidf-se.csv +0 -0
  28. database/tawos/tfidf/CLI_tfidf-se.csv +0 -0
  29. database/tawos/tfidf/CLOV_tfidf-se.csv +0 -0
  30. database/tawos/tfidf/COMPASS_tfidf-se.csv +0 -0
  31. database/tawos/tfidf/CONFCLOUD_tfidf-se.csv +0 -0
  32. database/tawos/tfidf/CONFSERVER_tfidf-se.csv +0 -0
  33. database/tawos/tfidf/DAEMON_tfidf-se.csv +0 -0
  34. database/tawos/tfidf/DM_tfidf-se.csv +0 -0
  35. database/tawos/tfidf/DNN_tfidf-se.csv +0 -0
  36. database/tawos/tfidf/DURACLOUD_tfidf-se.csv +0 -0
  37. database/tawos/tfidf/EVG_tfidf-se.csv +0 -0
  38. database/tawos/tfidf/FAB_tfidf-se.csv +0 -0
  39. database/tawos/tfidf/MDL_tfidf-se.csv +0 -0
  40. database/tawos/tfidf/MESOS_tfidf-se.csv +0 -0
  41. database/tawos/tfidf/MULE_tfidf-se.csv +0 -0
  42. database/tawos/tfidf/NEXUS_tfidf-se.csv +0 -0
  43. database/tawos/tfidf/SERVER_tfidf-se.csv +0 -0
  44. database/tawos/tfidf/STL_tfidf-se.csv +0 -0
  45. database/tawos/tfidf/TIDOC_tfidf-se.csv +0 -0
  46. database/tawos/tfidf/TIMOB_tfidf-se.csv +0 -0
  47. database/tawos/tfidf/TISTUD_tfidf-se.csv +0 -0
  48. database/tawos/tfidf/XD_tfidf-se.csv +0 -0
  49. model_tawos_aloy_mbr.joblib → models/tawos/aloy/model_tawos_aloy_mbr.joblib +1 -1
  50. model_tawos_aloy_neosp.joblib → models/tawos/aloy/model_tawos_aloy_median.joblib +2 -2
create_alloy_model.ipynb ADDED
@@ -0,0 +1,256 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "ename": "FileNotFoundError",
10
+ "evalue": "[Errno 2] No such file or directory: 'database\\\\tawos\\\\deep\\\\ALOY_deep-se.csv'",
11
+ "output_type": "error",
12
+ "traceback": [
13
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
14
+ "\u001b[1;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
15
+ "\u001b[1;32mc:\\Projetos\\model_effort_tawos\\model_creators\\create_alloy_model.ipynb Cell 1\u001b[0m in \u001b[0;36m2\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Projetos/model_effort_tawos/model_creators/create_alloy_model.ipynb#W0sZmlsZQ%3D%3D?line=0'>1</a>\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mpandas\u001b[39;00m \u001b[39mas\u001b[39;00m \u001b[39mpd\u001b[39;00m\n\u001b[1;32m----> <a href='vscode-notebook-cell:/c%3A/Projetos/model_effort_tawos/model_creators/create_alloy_model.ipynb#W0sZmlsZQ%3D%3D?line=1'>2</a>\u001b[0m df \u001b[39m=\u001b[39m pd\u001b[39m.\u001b[39;49mread_csv(\u001b[39m\"\u001b[39;49m\u001b[39mdatabase\u001b[39;49m\u001b[39m\\\\\u001b[39;49;00m\u001b[39mtawos\u001b[39;49m\u001b[39m\\\\\u001b[39;49;00m\u001b[39mdeep\u001b[39;49m\u001b[39m\\\\\u001b[39;49;00m\u001b[39mALOY_deep-se.csv\u001b[39;49m\u001b[39m\"\u001b[39;49m)\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Projetos/model_effort_tawos/model_creators/create_alloy_model.ipynb#W0sZmlsZQ%3D%3D?line=2'>3</a>\u001b[0m df\u001b[39m.\u001b[39minfo()\n",
16
+ "File \u001b[1;32mc:\\Python310\\lib\\site-packages\\pandas\\io\\parsers\\readers.py:912\u001b[0m, in \u001b[0;36mread_csv\u001b[1;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)\u001b[0m\n\u001b[0;32m 899\u001b[0m kwds_defaults \u001b[39m=\u001b[39m _refine_defaults_read(\n\u001b[0;32m 900\u001b[0m dialect,\n\u001b[0;32m 901\u001b[0m delimiter,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 908\u001b[0m dtype_backend\u001b[39m=\u001b[39mdtype_backend,\n\u001b[0;32m 909\u001b[0m )\n\u001b[0;32m 910\u001b[0m kwds\u001b[39m.\u001b[39mupdate(kwds_defaults)\n\u001b[1;32m--> 912\u001b[0m \u001b[39mreturn\u001b[39;00m _read(filepath_or_buffer, kwds)\n",
17
+ "File \u001b[1;32mc:\\Python310\\lib\\site-packages\\pandas\\io\\parsers\\readers.py:577\u001b[0m, in \u001b[0;36m_read\u001b[1;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[0;32m 574\u001b[0m _validate_names(kwds\u001b[39m.\u001b[39mget(\u001b[39m\"\u001b[39m\u001b[39mnames\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39mNone\u001b[39;00m))\n\u001b[0;32m 576\u001b[0m \u001b[39m# Create the parser.\u001b[39;00m\n\u001b[1;32m--> 577\u001b[0m parser \u001b[39m=\u001b[39m TextFileReader(filepath_or_buffer, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwds)\n\u001b[0;32m 579\u001b[0m \u001b[39mif\u001b[39;00m chunksize \u001b[39mor\u001b[39;00m iterator:\n\u001b[0;32m 580\u001b[0m \u001b[39mreturn\u001b[39;00m parser\n",
18
+ "File \u001b[1;32mc:\\Python310\\lib\\site-packages\\pandas\\io\\parsers\\readers.py:1407\u001b[0m, in \u001b[0;36mTextFileReader.__init__\u001b[1;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[0;32m 1404\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39moptions[\u001b[39m\"\u001b[39m\u001b[39mhas_index_names\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m kwds[\u001b[39m\"\u001b[39m\u001b[39mhas_index_names\u001b[39m\u001b[39m\"\u001b[39m]\n\u001b[0;32m 1406\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mhandles: IOHandles \u001b[39m|\u001b[39m \u001b[39mNone\u001b[39;00m \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m\n\u001b[1;32m-> 1407\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_engine \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_make_engine(f, \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mengine)\n",
19
+ "File \u001b[1;32mc:\\Python310\\lib\\site-packages\\pandas\\io\\parsers\\readers.py:1661\u001b[0m, in \u001b[0;36mTextFileReader._make_engine\u001b[1;34m(self, f, engine)\u001b[0m\n\u001b[0;32m 1659\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39m\"\u001b[39m\u001b[39mb\u001b[39m\u001b[39m\"\u001b[39m \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m mode:\n\u001b[0;32m 1660\u001b[0m mode \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mb\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m-> 1661\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mhandles \u001b[39m=\u001b[39m get_handle(\n\u001b[0;32m 1662\u001b[0m f,\n\u001b[0;32m 1663\u001b[0m mode,\n\u001b[0;32m 1664\u001b[0m encoding\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49moptions\u001b[39m.\u001b[39;49mget(\u001b[39m\"\u001b[39;49m\u001b[39mencoding\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39mNone\u001b[39;49;00m),\n\u001b[0;32m 1665\u001b[0m compression\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49moptions\u001b[39m.\u001b[39;49mget(\u001b[39m\"\u001b[39;49m\u001b[39mcompression\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39mNone\u001b[39;49;00m),\n\u001b[0;32m 1666\u001b[0m memory_map\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49moptions\u001b[39m.\u001b[39;49mget(\u001b[39m\"\u001b[39;49m\u001b[39mmemory_map\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39mFalse\u001b[39;49;00m),\n\u001b[0;32m 1667\u001b[0m is_text\u001b[39m=\u001b[39;49mis_text,\n\u001b[0;32m 1668\u001b[0m errors\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49moptions\u001b[39m.\u001b[39;49mget(\u001b[39m\"\u001b[39;49m\u001b[39mencoding_errors\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39m\"\u001b[39;49m\u001b[39mstrict\u001b[39;49m\u001b[39m\"\u001b[39;49m),\n\u001b[0;32m 1669\u001b[0m 
storage_options\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49moptions\u001b[39m.\u001b[39;49mget(\u001b[39m\"\u001b[39;49m\u001b[39mstorage_options\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39mNone\u001b[39;49;00m),\n\u001b[0;32m 1670\u001b[0m )\n\u001b[0;32m 1671\u001b[0m \u001b[39massert\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mhandles \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m\n\u001b[0;32m 1672\u001b[0m f \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mhandles\u001b[39m.\u001b[39mhandle\n",
20
+ "File \u001b[1;32mc:\\Python310\\lib\\site-packages\\pandas\\io\\common.py:859\u001b[0m, in \u001b[0;36mget_handle\u001b[1;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[0;32m 854\u001b[0m \u001b[39melif\u001b[39;00m \u001b[39misinstance\u001b[39m(handle, \u001b[39mstr\u001b[39m):\n\u001b[0;32m 855\u001b[0m \u001b[39m# Check whether the filename is to be opened in binary mode.\u001b[39;00m\n\u001b[0;32m 856\u001b[0m \u001b[39m# Binary mode does not support 'encoding' and 'newline'.\u001b[39;00m\n\u001b[0;32m 857\u001b[0m \u001b[39mif\u001b[39;00m ioargs\u001b[39m.\u001b[39mencoding \u001b[39mand\u001b[39;00m \u001b[39m\"\u001b[39m\u001b[39mb\u001b[39m\u001b[39m\"\u001b[39m \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m ioargs\u001b[39m.\u001b[39mmode:\n\u001b[0;32m 858\u001b[0m \u001b[39m# Encoding\u001b[39;00m\n\u001b[1;32m--> 859\u001b[0m handle \u001b[39m=\u001b[39m \u001b[39mopen\u001b[39;49m(\n\u001b[0;32m 860\u001b[0m handle,\n\u001b[0;32m 861\u001b[0m ioargs\u001b[39m.\u001b[39;49mmode,\n\u001b[0;32m 862\u001b[0m encoding\u001b[39m=\u001b[39;49mioargs\u001b[39m.\u001b[39;49mencoding,\n\u001b[0;32m 863\u001b[0m errors\u001b[39m=\u001b[39;49merrors,\n\u001b[0;32m 864\u001b[0m newline\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39m\"\u001b[39;49m,\n\u001b[0;32m 865\u001b[0m )\n\u001b[0;32m 866\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m 867\u001b[0m \u001b[39m# Binary mode\u001b[39;00m\n\u001b[0;32m 868\u001b[0m handle \u001b[39m=\u001b[39m \u001b[39mopen\u001b[39m(handle, ioargs\u001b[39m.\u001b[39mmode)\n",
21
+ "\u001b[1;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'database\\\\tawos\\\\deep\\\\ALOY_deep-se.csv'"
22
+ ]
23
+ }
24
+ ],
25
+ "source": [
26
+ "import pandas as pd\n",
27
+ "df = pd.read_csv(\"database\\\\tawos\\\\deep\\\\ALOY_deep-se.csv\")\n",
28
+ "df.info()"
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "markdown",
33
+ "metadata": {},
34
+ "source": [
35
+ "# Pré-Processamento"
36
+ ]
37
+ },
38
+ {
39
+ "cell_type": "code",
40
+ "execution_count": 10,
41
+ "metadata": {},
42
+ "outputs": [],
43
+ "source": [
44
+ "import re\n",
45
+ "from string import punctuation\n",
46
+ "\n",
47
+ "def escape_tags_and_content(text):\n",
48
+ " \"\"\"Escape tags and their content containing text, which is not written in natural language, such as code snippets\"\"\"\n",
49
+ "\n",
50
+ " NO_TEXT_TAGS = \"code\", \"noformat\"\n",
51
+ " for tag in NO_TEXT_TAGS:\n",
52
+ " regex_matching_tag = re.compile(\"\\{%s(.*?)\\}(.*?)\\{%s\\}\" % (tag, tag), re.DOTALL)\n",
53
+ " text = re.sub(regex_matching_tag, \"\", text)\n",
54
+ "\n",
55
+ " return text\n",
56
+ "\n",
57
+ "def escape_tags(text):\n",
58
+ " \"\"\"Escape markup tags, but retain their content\"\"\"\n",
59
+ "\n",
60
+ " ESCAPE_TAGS = \"color\", \"quote\", \"anchor\", \"panel\"\n",
61
+ " for tag in ESCAPE_TAGS:\n",
62
+ " text = re.sub(\"\\{%s(.*?)\\}\" % tag, \"\", text)\n",
63
+ "\n",
64
+ " return text\n",
65
+ "\n",
66
+ "def escape_strings(text):\n",
67
+ " \"\"\"Escape line breaks, tabulators, slashes and JIRA heading markup symbols\"\"\"\n",
68
+ "\n",
69
+ " ESCAPE_STRINGS = \"\\\\r\", \"\\\\n\", \"\\\\t\", \"\\\\f\", \"\\\\v\", \"\\\"\", \"\\\\\\\\\", \"h1. \", \"h2. \", \"h3. \", \"h4. \", \"h5. \", \"h6. \"\n",
70
+ " for escape_string in ESCAPE_STRINGS:\n",
71
+ " text = text.replace(escape_string, \" \")\n",
72
+ "\n",
73
+ " return text\n",
74
+ "\n",
75
+ "def escape_links(text):\n",
76
+ " \"\"\"Escape external and internal links, recognized by JIRA markup or leading 'http://' or 'https://' \"\"\"\n",
77
+ "\n",
78
+ " LINK_STARTERS = r\"\\#\", r\"\\^\", r\"http\\:\\/\\/\", r\"https\\:\\/\\/\", r\"malto\\:\", r\"file\\:\", r\"\\~\"\n",
79
+ " for link_starter in LINK_STARTERS:\n",
80
+ " text = re.sub(\"\\[(.*?\\\\|)?%s(.*?)\\]\" % link_starter, \"\", text)\n",
81
+ " text = re.sub(r\"\\bhttps?://\\S+\", \"\", text)\n",
82
+ "\n",
83
+ " return text\n",
84
+ "\n",
85
+ "def escape_hex_character_codes(text):\n",
86
+ " \"\"\"Escape characters outside the latin alphabet which are converted to hex code representation\"\"\"\n",
87
+ "\n",
88
+ " return re.sub(r\"\\\\x\\w\\w\", \"\", text)\n",
89
+ "\n",
90
+ "def escape_punctuation_boundaries(text):\n",
91
+ " \"\"\"Remove all punctuation marks from the beginning and end of words,\n",
92
+ " except for trailing period at the end of words\"\"\"\n",
93
+ "\n",
94
+ " return \" \".join([word.strip(punctuation.replace(\".\", \"\")).lstrip(\".\") for word in text.split()])\n",
95
+ "\n",
96
+ "def escape_odd_spaces(text):\n",
97
+ " \"\"\"Replace several consequent spaces with one space\n",
98
+ " and remove spaces from string start and end\"\"\"\n",
99
+ "\n",
100
+ " text = re.sub(r\"\\s+\", \" \", text)\n",
101
+ " text = text.strip()\n",
102
+ "\n",
103
+ " return text"
104
+ ]
105
+ },
106
+ {
107
+ "cell_type": "markdown",
108
+ "metadata": {},
109
+ "source": [
110
+ "# Criação do Modelo"
111
+ ]
112
+ },
113
+ {
114
+ "cell_type": "code",
115
+ "execution_count": 18,
116
+ "metadata": {},
117
+ "outputs": [
118
+ {
119
+ "data": {
120
+ "text/plain": [
121
+ "['models/tawos/aloy/model_tawos_aloy_tfidf_linear.joblib']"
122
+ ]
123
+ },
124
+ "execution_count": 18,
125
+ "metadata": {},
126
+ "output_type": "execute_result"
127
+ }
128
+ ],
129
+ "source": [
130
+ "from sklearn.dummy import DummyRegressor\n",
131
+ "from nltk.corpus import stopwords\n",
132
+ "from textblob import TextBlob\n",
133
+ "import textstat\n",
134
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
135
+ "from sklearn import svm\n",
136
+ "from sklearn.linear_model import LinearRegression\n",
137
+ "from sklearn.feature_selection import SelectKBest\n",
138
+ "import pandas as pd\n",
139
+ "from joblib import dump\n",
140
+ "\n",
141
+ "# carregando os dados\n",
142
+ "df = pd.read_csv(\"database\\\\tawos\\\\deep\\\\ALOY_deep-se.csv\")\n",
143
+ "\n",
144
+ "# criação de uma nova coluna\n",
145
+ "df[\"context\"] = df[\"title\"] + df[\"description\"]\n",
146
+ "\n",
147
+ "# pré-processamento\n",
148
+ "df[\"context\"] = df[\"context\"].apply(lambda x: escape_tags_and_content(x))\n",
149
+ "df[\"context\"] = df[\"context\"].apply(lambda x: escape_tags(x))\n",
150
+ "df[\"context\"] = df[\"context\"].apply(lambda x: escape_strings(x))\n",
151
+ "df[\"context\"] = df[\"context\"].apply(lambda x: escape_links(x))\n",
152
+ "df[\"context\"] = df[\"context\"].apply(lambda x: escape_hex_character_codes(x))\n",
153
+ "df[\"context\"] = df[\"context\"].apply(lambda x: escape_punctuation_boundaries(x))\n",
154
+ "df[\"context\"] = df[\"context\"].apply(lambda x: escape_odd_spaces(x))\n",
155
+ "\n",
156
+ "# removendo stop-words\n",
157
+ "stop = stopwords.words('english')\n",
158
+ "df['context'] = df['context'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))\n",
159
+ "\n",
160
+ "# renomeando as colunas porque senão dá um problema com a extração de features do NEOSP\n",
161
+ "df = df.rename(columns={ \"issuekey\": \"issuekey_\", \"created\": \"created_\", \"description\": \"description_\", \"title\": \"title_\", \"context\": \"context_\", \"storypoint\": \"storypoint_\"})\n",
162
+ "y = df[\"storypoint_\"]\n",
163
+ "df = df.drop(columns=['storypoint_'])\n",
164
+ "\n",
165
+ "# 5º coluna -> extração das features para o neosp\n",
166
+ "df[\"gunning_fog_\"] = df['context_'].apply(textstat.gunning_fog)\n",
167
+ "df[\"flesch_reading_ease_\"] = df['context_'].apply(textstat.flesch_reading_ease)\n",
168
+ "df[\"flesch_kincaid_grade_\"] = df['context_'].apply(textstat.flesch_kincaid_grade)\n",
169
+ "df[\"smog_index_\"] = df['context_'].apply(textstat.smog_index)\n",
170
+ "df[\"coleman_liau_index_\"] = df['context_'].apply(textstat.coleman_liau_index)\n",
171
+ "df[\"automated_readability_index_\"] = df['context_'].apply(textstat.automated_readability_index)\n",
172
+ "df[\"dale_chall_readability_score_\"] = df['context_'].apply(textstat.dale_chall_readability_score)\n",
173
+ "df[\"difficult_words_\"] = df['context_'].apply(textstat.difficult_words)\n",
174
+ "df[\"linsear_write_formula_\"] = df['context_'].apply(textstat.linsear_write_formula)\n",
175
+ "df[\"polarity_\"] = df[\"context_\"].apply(lambda x: TextBlob(x).sentiment.polarity)\n",
176
+ "df[\"subjectivity_\"] = df[\"context_\"].apply(lambda x: TextBlob(x).sentiment.subjectivity)\n",
177
+ "# 16º colunas\n",
178
+ "\n",
179
+ "# Extração das features para o TFIDF\n",
180
+ "vectorizer = TfidfVectorizer()\n",
181
+ "X_vec = vectorizer.fit_transform(df[\"context_\"])\n",
182
+ "\n",
183
+ "dump(vectorizer, \"models/tawos/aloy/vectorizer_tfidf.joblib\")\n",
184
+ "\n",
185
+ "df_vec = pd.DataFrame(data = X_vec.toarray(), columns = vectorizer.get_feature_names_out())\n",
186
+ "\n",
187
+ "# Juntando as features do neosp com o tfidf\n",
188
+ "df = df.join(df_vec)\n",
189
+ "X = df\n",
190
+ "\n",
191
+ "############ MbR\n",
192
+ "\n",
193
+ "model = DummyRegressor(strategy=\"mean\")\n",
194
+ "model.fit(X, y)\n",
195
+ "dump(model, \"models/tawos/aloy/model_tawos_aloy_mbr.joblib\")\n",
196
+ "\n",
197
+ "############ Mediana\n",
198
+ "\n",
199
+ "model = DummyRegressor(strategy=\"median\")\n",
200
+ "model.fit(X, y)\n",
201
+ "dump(model, \"models/tawos/aloy/model_tawos_aloy_median.joblib\")\n",
202
+ "\n",
203
+ "########### NEOSP-SVR\n",
204
+ "\n",
205
+ "model = svm.SVR()\n",
206
+ "model.fit(X[X.columns[5:16]], y)\n",
207
+ "dump(model, \"models/tawos/aloy/model_tawos_aloy_neosp_svr.joblib\")\n",
208
+ "\n",
209
+ "########### NEOSP-LR\n",
210
+ "\n",
211
+ "model = LinearRegression()\n",
212
+ "model.fit(X[X.columns[5:16]], y)\n",
213
+ "dump(model, \"models/tawos/aloy/model_tawos_aloy_neosp_linear.joblib\")\n",
214
+ "\n",
215
+ "############ TFIDF-SVM\n",
216
+ "\n",
217
+ "model = svm.SVR()\n",
218
+ "model.fit(X[X.columns[17:]], y)\n",
219
+ "dump(model, \"models/tawos/aloy/model_tawos_aloy_tfidf_svr.joblib\")\n",
220
+ "\n",
221
+ "############ TFIDF-LR\n",
222
+ "\n",
223
+ "model = LinearRegression()\n",
224
+ "model.fit(X[X.columns[17:]], y)\n",
225
+ "dump(model, \"models/tawos/aloy/model_tawos_aloy_tfidf_linear.joblib\")\n"
226
+ ]
227
+ },
228
+ {
229
+ "cell_type": "markdown",
230
+ "metadata": {},
231
+ "source": []
232
+ }
233
+ ],
234
+ "metadata": {
235
+ "kernelspec": {
236
+ "display_name": "Python 3",
237
+ "language": "python",
238
+ "name": "python3"
239
+ },
240
+ "language_info": {
241
+ "codemirror_mode": {
242
+ "name": "ipython",
243
+ "version": 3
244
+ },
245
+ "file_extension": ".py",
246
+ "mimetype": "text/x-python",
247
+ "name": "python",
248
+ "nbconvert_exporter": "python",
249
+ "pygments_lexer": "ipython3",
250
+ "version": "3.10.11"
251
+ },
252
+ "orig_nbformat": 4
253
+ },
254
+ "nbformat": 4,
255
+ "nbformat_minor": 2
256
+ }
database/tawos/README.md ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Tawosi Dataset
2
+
3
+ This directory consists of 46 files, two files for each of the 26-3\*=23 projects:
4
+ - 23 files with "\_deep-se" suffix are prepared to be used by Deep-SE.
5
+ - 23 files with "\_tfidf-se" suffix are prepared to be used by TF/IDF-SE.
6
+
7
+ \* <sub>One of the repositories including three projects has been removed from the public domain during the time that the manuscript for this study [1] was under revision. Therefore, although the paper reports the results for all 26 projects, the replication package includes 23 projects as we refrain from publishing the data for the three remaining projects in accordance with The General Data Protection Regulation.</sub>
8
+
9
+ The data for these 23 projects was collected from 12 open source repositories by Tawosi et al. up until August 2020.
10
+ The files are named after their project key as "[project key]\_[approach].csv", e.g. MESOS_deep-se.csv, which is the set of issues collected from the Mesos project in the Apache repository and contains the features that Deep-SE needs for prediction. The following table shows the list of projects and the repositories from which each project was collected.
11
+
12
+ ## Project list
13
+
14
+ | Repository | Project | Key | File for Deep-SE | File for TF/IDF-SE |
15
+ |--------------|-----------------------------------|------------|---------------------------|---------------------------|
16
+ | Apache | Mesos | MESOS | MESOS_deep-se.csv | MESOS_tfidf-se.csv |
17
+ | Apache | Alloy | ALOY | ALOY_deep-se.csv | ALOY_tfidf-se.csv |
18
+ | Appcelerator | Appcelerator studio | TISTUD | TISTUD_deep-se.csv | TISTUD_tfidf-se.csv |
19
+ | Appcelerator | Aptana studio | APSTUD | APSTUD_deep-se.csv | APSTUD_tfidf-se.csv |
20
+ | Appcelerator | Command-Line Interface | CLI | CLI_deep-se.csv | CLI_tfidf-se.csv |
21
+ | Appcelerator | Daemon | DAEMON | DAEMON_deep-se.csv | DAEMON_tfidf-se.csv |
22
+ | Appcelerator | Documentation | TIDOC | TIDOC_deep-se.csv | TIDOC_tfidf-se.csv |
23
+ | Appcelerator | Titanium | TIMOB | TIMOB_deep-se.csv | TIMOB_tfidf-se.csv |
24
+ | Atlassian | Clover | CLOV | CLOV_deep-se.csv | CLOV_tfidf-se.csv |
25
+ | Atlassian | Confluence Cloud | CONFCLOUD | CONFCLOUD_deep-se.csv | CONFCLOUD_tfidf-se.csv |
26
+ | Atlassian | Confluence Server and Data Center | CONFSERVER | CONFSERVER_deep-se.csv | CONFSERVER_tfidf-se.csv |
27
+ | DNNSoftware | DNN | DNN | DNN_deep-se.csv | DNN_tfidf-se.csv |
28
+ | Duraspace | Duracloud | DURACLOUD | DURACLOUD_deep-se.csv | DURACLOUD_tfidf-se.csv |
29
+ | Hyperledger | Fabric | FAB | FAB_deep-se.csv | FAB_tfidf-se.csv |
30
+ | Hyperledger | Sawtooth | STL | STL_deep-se.csv | STL_tfidf-se.csv |
31
+ | Lsstcorp | Data management | DM | DM_deep-se.csv | DM_tfidf-se.csv |
32
+ | MongoDB | Compass | COMPASS | COMPASS_deep-se.csv | COMPASS_tfidf-se.csv |
33
+ | MongoDB | Core Server | SERVER | SERVER_deep-se.csv | SERVER_tfidf-se.csv |
34
+ | MongoDB | Evergreen | EVG | EVG_deep-se.csv | EVG_tfidf-se.csv |
35
+ | Moodle | Moodle | MDL | MDL_deep-se.csv | MDL_tfidf-se.csv |
36
+ | Mulesoft | Mule | MULE | MULE_deep-se.csv | MULE_tfidf-se.csv |
37
+ | Sonatype | Sonatype’s Nexus | NEXUS | NEXUS_deep-se.csv | NEXUS_tfidf-se.csv |
38
+ | Spring | Spring XD | XD | XD_deep-se.csv | XD_tfidf-se.csv |
39
+
40
+ ## Content of the files
41
+
42
+ - Each csv file for Deep-SE approach contains 5 columns: *issuekey*, *created*, *title*, *description*, and *storypoint*.
43
+
44
+ - Each csv file for TF/IDF-SE approach contains more than 4 columns: starting with *issuekey*, *created*, *storypoint*, *context*, *codesnippet*, and a set of one-hot columns for issue type (header starting with t\_) followed by component(s) (header starting with c\_).
45
+
46
+ - The issues are sorted by creation time (i.e. issues appearing earlier in a file were created before those appearing later).
47
+
48
+ [1] Vali Tawosi, Rebecca Moussa, and Federica Sarro. "Agile Effort Estimation: Have We Solved the Problem Yet? Insights From A Replication Study." IEEE Transactions on Software Engineering, no. TBA (2022): pp. TBA.
database/tawos/deep/ALOY_deep-se.csv ADDED
The diff for this file is too large to render. See raw diff
 
database/tawos/deep/APSTUD_deep-se.csv ADDED
The diff for this file is too large to render. See raw diff
 
database/tawos/deep/CLI_deep-se.csv ADDED
The diff for this file is too large to render. See raw diff
 
database/tawos/deep/CLOV_deep-se.csv ADDED
The diff for this file is too large to render. See raw diff
 
database/tawos/deep/COMPASS_deep-se.csv ADDED
The diff for this file is too large to render. See raw diff
 
database/tawos/deep/CONFCLOUD_deep-se.csv ADDED
The diff for this file is too large to render. See raw diff
 
database/tawos/deep/CONFSERVER_deep-se.csv ADDED
The diff for this file is too large to render. See raw diff
 
database/tawos/deep/DAEMON_deep-se.csv ADDED
The diff for this file is too large to render. See raw diff
 
database/tawos/deep/DM_deep-se.csv ADDED
The diff for this file is too large to render. See raw diff
 
database/tawos/deep/DNN_deep-se.csv ADDED
The diff for this file is too large to render. See raw diff
 
database/tawos/deep/DURACLOUD_deep-se.csv ADDED
The diff for this file is too large to render. See raw diff
 
database/tawos/deep/EVG_deep-se.csv ADDED
The diff for this file is too large to render. See raw diff
 
database/tawos/deep/FAB_deep-se.csv ADDED
The diff for this file is too large to render. See raw diff
 
database/tawos/deep/MDL_deep-se.csv ADDED
The diff for this file is too large to render. See raw diff
 
database/tawos/deep/MESOS_deep-se.csv ADDED
The diff for this file is too large to render. See raw diff
 
database/tawos/deep/MULE_deep-se.csv ADDED
The diff for this file is too large to render. See raw diff
 
database/tawos/deep/NEXUS_deep-se.csv ADDED
The diff for this file is too large to render. See raw diff
 
database/tawos/deep/SERVER_deep-se.csv ADDED
The diff for this file is too large to render. See raw diff
 
database/tawos/deep/STL_deep-se.csv ADDED
The diff for this file is too large to render. See raw diff
 
database/tawos/deep/TIDOC_deep-se.csv ADDED
The diff for this file is too large to render. See raw diff
 
database/tawos/deep/TIMOB_deep-se.csv ADDED
The diff for this file is too large to render. See raw diff
 
database/tawos/deep/TISTUD_deep-se.csv ADDED
The diff for this file is too large to render. See raw diff
 
database/tawos/deep/XD_deep-se.csv ADDED
The diff for this file is too large to render. See raw diff
 
database/tawos/tfidf/ALOY_tfidf-se.csv ADDED
The diff for this file is too large to render. See raw diff
 
database/tawos/tfidf/APSTUD_tfidf-se.csv ADDED
The diff for this file is too large to render. See raw diff
 
database/tawos/tfidf/CLI_tfidf-se.csv ADDED
The diff for this file is too large to render. See raw diff
 
database/tawos/tfidf/CLOV_tfidf-se.csv ADDED
The diff for this file is too large to render. See raw diff
 
database/tawos/tfidf/COMPASS_tfidf-se.csv ADDED
The diff for this file is too large to render. See raw diff
 
database/tawos/tfidf/CONFCLOUD_tfidf-se.csv ADDED
The diff for this file is too large to render. See raw diff
 
database/tawos/tfidf/CONFSERVER_tfidf-se.csv ADDED
The diff for this file is too large to render. See raw diff
 
database/tawos/tfidf/DAEMON_tfidf-se.csv ADDED
The diff for this file is too large to render. See raw diff
 
database/tawos/tfidf/DM_tfidf-se.csv ADDED
The diff for this file is too large to render. See raw diff
 
database/tawos/tfidf/DNN_tfidf-se.csv ADDED
The diff for this file is too large to render. See raw diff
 
database/tawos/tfidf/DURACLOUD_tfidf-se.csv ADDED
The diff for this file is too large to render. See raw diff
 
database/tawos/tfidf/EVG_tfidf-se.csv ADDED
The diff for this file is too large to render. See raw diff
 
database/tawos/tfidf/FAB_tfidf-se.csv ADDED
The diff for this file is too large to render. See raw diff
 
database/tawos/tfidf/MDL_tfidf-se.csv ADDED
The diff for this file is too large to render. See raw diff
 
database/tawos/tfidf/MESOS_tfidf-se.csv ADDED
The diff for this file is too large to render. See raw diff
 
database/tawos/tfidf/MULE_tfidf-se.csv ADDED
The diff for this file is too large to render. See raw diff
 
database/tawos/tfidf/NEXUS_tfidf-se.csv ADDED
The diff for this file is too large to render. See raw diff
 
database/tawos/tfidf/SERVER_tfidf-se.csv ADDED
The diff for this file is too large to render. See raw diff
 
database/tawos/tfidf/STL_tfidf-se.csv ADDED
The diff for this file is too large to render. See raw diff
 
database/tawos/tfidf/TIDOC_tfidf-se.csv ADDED
The diff for this file is too large to render. See raw diff
 
database/tawos/tfidf/TIMOB_tfidf-se.csv ADDED
The diff for this file is too large to render. See raw diff
 
database/tawos/tfidf/TISTUD_tfidf-se.csv ADDED
The diff for this file is too large to render. See raw diff
 
database/tawos/tfidf/XD_tfidf-se.csv ADDED
The diff for this file is too large to render. See raw diff
 
model_tawos_aloy_mbr.joblib → models/tawos/aloy/model_tawos_aloy_mbr.joblib RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:caf24da2287096a50a79a061a89a97b3754e97a73e761b347209441e4f4a8a5d
3
  size 383
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d883f253e395899015acd3f76b9946b652da8237a0878d3f5fef36a45e9d29fd
3
  size 383
model_tawos_aloy_neosp.joblib → models/tawos/aloy/model_tawos_aloy_median.joblib RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:af24bfeaf71ddd9722eba995ddc99afe0c8b106785dac50818218a3d9d963d83
3
- size 22883
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f33655a67e4d587143f615b1604a45fc5cac5b70b0c8e999b47a953a43511e43
3
+ size 383