{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "from pathlib import Path\n",
    "from string import punctuation\n",
    "\n",
    "import pandas as pd\n",
    "import textstat\n",
    "from joblib import dump\n",
    "from nltk.corpus import stopwords\n",
    "from sklearn import svm\n",
    "from sklearn.dummy import DummyRegressor\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.feature_selection import SelectKBest\n",
    "from sklearn.linear_model import LinearRegression\n",
    "from textblob import TextBlob"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Project whose issues are used for training; it is also part of every output file name.\n",
    "project_name = \"ALOY\"\n",
    "\n",
    "# Paths are built with pathlib so they resolve on any OS; a backslash-separated\n",
    "# literal only works on Windows.\n",
    "DATA_PATH = Path(\"database\") / \"tawos\" / \"deep\" / \"{}_deep-se.csv\".format(project_name)\n",
    "MODEL_DIR = Path(\"models\") / \"tawos\" / project_name\n",
    "\n",
    "df = pd.read_csv(DATA_PATH)\n",
    "df.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Pré-Processamento"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def escape_tags_and_content(text):\n",
    "    \"\"\"Remove tags together with their content when that content is not natural language, such as code snippets.\"\"\"\n",
    "\n",
    "    NO_TEXT_TAGS = \"code\", \"noformat\"\n",
    "    for tag in NO_TEXT_TAGS:\n",
    "        # DOTALL lets the enclosed content span several lines.\n",
    "        regex_matching_tag = re.compile(r\"\\{%s(.*?)\\}(.*?)\\{%s\\}\" % (tag, tag), re.DOTALL)\n",
    "        text = regex_matching_tag.sub(\"\", text)\n",
    "\n",
    "    return text\n",
    "\n",
    "\n",
    "def escape_tags(text):\n",
    "    \"\"\"Remove markup tags but keep the text they enclose.\"\"\"\n",
    "\n",
    "    ESCAPE_TAGS = \"color\", \"quote\", \"anchor\", \"panel\"\n",
    "    for tag in ESCAPE_TAGS:\n",
    "        text = re.sub(r\"\\{%s(.*?)\\}\" % tag, \"\", text)\n",
    "\n",
    "    return text\n",
    "\n",
    "\n",
    "def escape_strings(text):\n",
    "    \"\"\"Replace line breaks, tabulators, double quotes, doubled backslashes and JIRA heading markers with a space.\"\"\"\n",
    "\n",
    "    ESCAPE_STRINGS = \"\\r\", \"\\n\", \"\\t\", \"\\f\", \"\\v\", \"\\\"\", \"\\\\\\\\\", \"h1. \", \"h2. \", \"h3. \", \"h4. \", \"h5. \", \"h6. \"\n",
    "    for escape_string in ESCAPE_STRINGS:\n",
    "        text = text.replace(escape_string, \" \")\n",
    "\n",
    "    return text\n",
    "\n",
    "\n",
    "def escape_links(text):\n",
    "    \"\"\"Remove links written in JIRA markup (\"[target]\" or \"[alias|target]\") and bare http(s) URLs.\"\"\"\n",
    "\n",
    "    # Prefixes that mark a link target inside square brackets; mail links use \"mailto:\".\n",
    "    LINK_STARTERS = r\"\\#\", r\"\\^\", r\"http\\:\\/\\/\", r\"https\\:\\/\\/\", r\"mailto\\:\", r\"file\\:\", r\"\\~\"\n",
    "    for link_starter in LINK_STARTERS:\n",
    "        text = re.sub(r\"\\[(.*?\\|)?%s(.*?)\\]\" % link_starter, \"\", text)\n",
    "\n",
    "    # Bare URLs go last: run earlier, this pattern would swallow the closing bracket of\n",
    "    # \"[alias|http://...]\" and leave the alias text behind.\n",
    "    text = re.sub(r\"\\bhttps?://\\S+\", \"\", text)\n",
    "\n",
    "    return text\n",
    "\n",
    "\n",
    "def escape_hex_character_codes(text):\n",
    "    \"\"\"Remove hex-code representations (a backslash, 'x' and two word characters) left for characters outside the Latin alphabet.\"\"\"\n",
    "\n",
    "    return re.sub(r\"\\\\x\\w\\w\", \"\", text)\n",
    "\n",
    "\n",
    "def escape_punctuation_boundaries(text):\n",
    "    \"\"\"Strip punctuation from both ends of every word, keeping only a trailing period.\"\"\"\n",
    "\n",
    "    # Periods are left out of the two-sided strip and then removed from the left side only.\n",
    "    non_period_punctuation = punctuation.replace(\".\", \"\")\n",
    "    return \" \".join(word.strip(non_period_punctuation).lstrip(\".\") for word in text.split())\n",
    "\n",
    "\n",
    "def escape_odd_spaces(text):\n",
    "    \"\"\"Collapse every run of whitespace into one space and trim both ends of the string.\"\"\"\n",
    "\n",
    "    return re.sub(r\"\\s+\", \" \", text).strip()\n",
    "\n",
    "\n",
    "def clean_text(text):\n",
    "    \"\"\"Apply all cleaning steps above to one string, in the order the training pipeline uses.\"\"\"\n",
    "\n",
    "    cleaning_steps = (\n",
    "        escape_tags_and_content,\n",
    "        escape_tags,\n",
    "        escape_strings,\n",
    "        escape_links,\n",
    "        escape_hex_character_codes,\n",
    "        escape_punctuation_boundaries,\n",
    "        escape_odd_spaces,\n",
    "    )\n",
    "    for step in cleaning_steps:\n",
    "        text = step(text)\n",
    "\n",
    "    return text"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Criação do Modelo"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the data again so this cell always starts from the raw CSV, even when re-run.\n",
    "df = pd.read_csv(DATA_PATH)\n",
    "\n",
    "# New column: title and description joined with a space, so the last word of the title\n",
    "# cannot fuse with the first word of the description. Missing text counts as empty.\n",
    "# Code that later feeds text to these models must build it the same way.\n",
    "df[\"context\"] = df[\"title\"].fillna(\"\") + \" \" + df[\"description\"].fillna(\"\")\n",
    "\n",
    "# Pre-processing\n",
    "df[\"context\"] = df[\"context\"].apply(clean_text)\n",
    "\n",
    "# Removing stop words (a set keeps each membership test cheap)\n",
    "stop = set(stopwords.words('english'))\n",
    "df[\"context\"] = df[\"context\"].apply(lambda x: \" \".join(word for word in x.split() if word not in stop))\n",
    "\n",
    "# Renaming the columns, because otherwise there is a problem with the NEOSP feature extraction.\n",
    "# NOTE(review): presumably this also avoids name clashes with TF-IDF tokens in the join below - confirm.\n",
    "df = df.rename(columns={\"issuekey\": \"issuekey_\", \"created\": \"created_\", \"description\": \"description_\", \"title\": \"title_\", \"context\": \"context_\", \"storypoint\": \"storypoint_\"})\n",
    "y = df[\"storypoint_\"]\n",
    "df = df.drop(columns=[\"storypoint_\"])\n",
    "\n",
    "# Readability and sentiment features for the NEOSP models.\n",
    "# The dict order defines the order of the feature columns.\n",
    "NEOSP_FEATURES = {\n",
    "    \"gunning_fog_\": textstat.gunning_fog,\n",
    "    \"flesch_reading_ease_\": textstat.flesch_reading_ease,\n",
    "    \"flesch_kincaid_grade_\": textstat.flesch_kincaid_grade,\n",
    "    \"smog_index_\": textstat.smog_index,\n",
    "    \"coleman_liau_index_\": textstat.coleman_liau_index,\n",
    "    \"automated_readability_index_\": textstat.automated_readability_index,\n",
    "    \"dale_chall_readability_score_\": textstat.dale_chall_readability_score,\n",
    "    \"difficult_words_\": textstat.difficult_words,\n",
    "    \"linsear_write_formula_\": textstat.linsear_write_formula,\n",
    "    \"polarity_\": lambda text: TextBlob(text).sentiment.polarity,\n",
    "    \"subjectivity_\": lambda text: TextBlob(text).sentiment.subjectivity,\n",
    "}\n",
    "for column, extractor in NEOSP_FEATURES.items():\n",
    "    df[column] = df[\"context_\"].apply(extractor)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Feature extraction for TF-IDF\n",
    "vectorizer = TfidfVectorizer()\n",
    "X_vec = vectorizer.fit_transform(df[\"context_\"])\n",
    "\n",
    "# The output folder is created first so that dump() does not fail when it is missing.\n",
    "MODEL_DIR.mkdir(parents=True, exist_ok=True)\n",
    "dump(vectorizer, MODEL_DIR / \"vectorizer_tawos_{}_tfidf.joblib\".format(project_name))\n",
    "\n",
    "df_vec = pd.DataFrame(data=X_vec.toarray(), index=df.index, columns=vectorizer.get_feature_names_out())\n",
    "\n",
    "# Joining the NEOSP features with the TF-IDF features.\n",
    "# The result gets its own name so that re-running this cell does not join twice.\n",
    "X = df.join(df_vec)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Columns are selected by name rather than by position, so adding or removing\n",
    "# a column upstream cannot silently shift the feature sets.\n",
    "all_columns = list(X.columns)\n",
    "neosp_columns = list(NEOSP_FEATURES)\n",
    "tfidf_columns = list(df_vec.columns)\n",
    "\n",
    "# File-name suffix -> (estimator, columns it is trained on)\n",
    "MODEL_SPECS = {\n",
    "    \"mbr\": (DummyRegressor(strategy=\"mean\"), all_columns),\n",
    "    \"median\": (DummyRegressor(strategy=\"median\"), all_columns),\n",
    "    \"neosp_svr\": (svm.SVR(), neosp_columns),\n",
    "    \"neosp_linear\": (LinearRegression(), neosp_columns),\n",
    "    \"tfidf_svr\": (svm.SVR(), tfidf_columns),\n",
    "    \"tfidf_linear\": (LinearRegression(), tfidf_columns),\n",
    "}\n",
    "\n",
    "MODEL_DIR.mkdir(parents=True, exist_ok=True)\n",
    "for suffix, (model, feature_columns) in MODEL_SPECS.items():\n",
    "    model.fit(X[feature_columns], y)\n",
    "    dump(model, MODEL_DIR / \"model_tawos_{}_{}.joblib\".format(project_name, suffix))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.11"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}