File size: 10,211 Bytes

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 3915 entries, 0 to 3914\n",
      "Data columns (total 5 columns):\n",
      " #   Column       Non-Null Count  Dtype  \n",
      "---  ------       --------------  -----  \n",
      " 0   issuekey     3915 non-null   object \n",
      " 1   created      3915 non-null   object \n",
      " 2   title        3915 non-null   object \n",
      " 3   description  3915 non-null   object \n",
      " 4   storypoint   3915 non-null   float64\n",
      "dtypes: float64(1), object(4)\n",
      "memory usage: 153.1+ KB\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "project_name = \"TIMOB\"\n",
    "\n",
    "df = pd.read_csv(\"database\\\\tawos\\\\deep\\\\{}_deep-se.csv\".format(project_name))\n",
    "\n",
    "df.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Pré-Processamento"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "from string import punctuation\n",
    "\n",
    "def escape_tags_and_content(text):\n",
    "    \"\"\"Escape tags and their content containing text, which is not written in natural language, such as code snippets\"\"\"\n",
    "\n",
    "    NO_TEXT_TAGS = \"code\", \"noformat\"\n",
    "    for tag in NO_TEXT_TAGS:\n",
    "        regex_matching_tag = re.compile(\"\\{%s(.*?)\\}(.*?)\\{%s\\}\" % (tag, tag), re.DOTALL)\n",
    "        text = re.sub(regex_matching_tag, \"\", text)\n",
    "\n",
    "    return text\n",
    "\n",
    "def escape_tags(text):\n",
    "    \"\"\"Escape markup tags, but retain their content\"\"\"\n",
    "\n",
    "    ESCAPE_TAGS = \"color\", \"quote\", \"anchor\", \"panel\"\n",
    "    for tag in  ESCAPE_TAGS:\n",
    "        text = re.sub(\"\\{%s(.*?)\\}\" % tag, \"\", text)\n",
    "\n",
    "    return text\n",
    "\n",
    "def escape_strings(text):\n",
    "    \"\"\"Escape line breaks, tabulators, slashes and JIRA heading markup symbols\"\"\"\n",
    "\n",
    "    ESCAPE_STRINGS = \"\\\\r\", \"\\\\n\", \"\\\\t\", \"\\\\f\", \"\\\\v\", \"\\\"\", \"\\\\\\\\\", \"h1. \", \"h2. \", \"h3. \", \"h4. \", \"h5. \", \"h6. \"\n",
    "    for escape_string in ESCAPE_STRINGS:\n",
    "        text = text.replace(escape_string, \" \")\n",
    "\n",
    "    return text\n",
    "\n",
    "def escape_links(text):\n",
    "    \"\"\"Escape external and internal links, recognized by JIRA markup or leading 'http://' or 'https://' \"\"\"\n",
    "\n",
    "    LINK_STARTERS = r\"\\#\", r\"\\^\", r\"http\\:\\/\\/\", r\"https\\:\\/\\/\", r\"malto\\:\", r\"file\\:\", r\"\\~\"\n",
    "    for link_starter in LINK_STARTERS:\n",
    "        text = re.sub(\"\\[(.*?\\\\|)?%s(.*?)\\]\" % link_starter, \"\", text)\n",
    "        text = re.sub(r\"\\bhttps?://\\S+\", \"\", text)\n",
    "\n",
    "    return text\n",
    "\n",
    "def escape_hex_character_codes(text):\n",
    "    \"\"\"Escape characters outside the latin alphabet which are converted to hex code representation\"\"\"\n",
    "\n",
    "    return re.sub(r\"\\\\x\\w\\w\", \"\", text)\n",
    "\n",
    "def escape_punctuation_boundaries(text):\n",
    "    \"\"\"Remove all punctuation marks from the beginning and end of words,\n",
    "    except for trailing period at the end of words\"\"\"\n",
    "\n",
    "    return \" \".join([word.strip(punctuation.replace(\".\", \"\")).lstrip(\".\") for word in text.split()])\n",
    "\n",
    "def escape_odd_spaces(text):\n",
    "    \"\"\"Replace several consequent spaces with one space\n",
    "    and remove spaces from string start and end\"\"\"\n",
    "\n",
    "    text = re.sub(r\"\\s+\", \" \", text)\n",
    "    text = text.strip()\n",
    "\n",
    "    return text"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Criação do Modelo"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['models/tawos/TIMOB/model_tawos_TIMOB_tfidf_linear.joblib']"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.dummy import DummyRegressor\n",
    "from nltk.corpus import stopwords\n",
    "from textblob import TextBlob\n",
    "import textstat\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn import svm\n",
    "from sklearn.linear_model import LinearRegression\n",
    "from sklearn.feature_selection import SelectKBest\n",
    "import pandas as pd\n",
    "from joblib import dump\n",
    "\n",
    "# carregando os dados\n",
    "df = pd.read_csv(\"database\\\\tawos\\\\deep\\\\{}_deep-se.csv\".format(project_name))\n",
    "\n",
    "# criação de uma nova coluna\n",
    "df[\"context\"] = df[\"title\"] + df[\"description\"]\n",
    "\n",
    "# pré-processamento\n",
    "df[\"context\"] = df[\"context\"].apply(lambda x: escape_tags_and_content(x))\n",
    "df[\"context\"] = df[\"context\"].apply(lambda x: escape_tags(x))\n",
    "df[\"context\"] = df[\"context\"].apply(lambda x: escape_strings(x))\n",
    "df[\"context\"] = df[\"context\"].apply(lambda x: escape_links(x))\n",
    "df[\"context\"] = df[\"context\"].apply(lambda x: escape_hex_character_codes(x))\n",
    "df[\"context\"] = df[\"context\"].apply(lambda x: escape_punctuation_boundaries(x))\n",
    "df[\"context\"] = df[\"context\"].apply(lambda x: escape_odd_spaces(x))\n",
    "\n",
    "# removendo stop-words\n",
    "stop = stopwords.words('english')\n",
    "df['context'] = df['context'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))\n",
    "\n",
    "# renomeando as colunas porque senão dá um problema com a extração de features do NEOSP\n",
    "df = df.rename(columns={ \"issuekey\": \"issuekey_\", \"created\": \"created_\", \"description\": \"description_\", \"title\": \"title_\", \"context\": \"context_\", \"storypoint\": \"storypoint_\"})\n",
    "y = df[\"storypoint_\"]\n",
    "df = df.drop(columns=['storypoint_'])\n",
    "\n",
    "# 5º coluna -> extração das features para o neosp\n",
    "df[\"gunning_fog_\"] = df['context_'].apply(textstat.gunning_fog)\n",
    "df[\"flesch_reading_ease_\"] = df['context_'].apply(textstat.flesch_reading_ease)\n",
    "df[\"flesch_kincaid_grade_\"] = df['context_'].apply(textstat.flesch_kincaid_grade)\n",
    "df[\"smog_index_\"] = df['context_'].apply(textstat.smog_index)\n",
    "df[\"coleman_liau_index_\"] = df['context_'].apply(textstat.coleman_liau_index)\n",
    "df[\"automated_readability_index_\"] = df['context_'].apply(textstat.automated_readability_index)\n",
    "df[\"dale_chall_readability_score_\"] = df['context_'].apply(textstat.dale_chall_readability_score)\n",
    "df[\"difficult_words_\"] = df['context_'].apply(textstat.difficult_words)\n",
    "df[\"linsear_write_formula_\"] = df['context_'].apply(textstat.linsear_write_formula)\n",
    "df[\"polarity_\"] = df[\"context_\"].apply(lambda x: TextBlob(x).sentiment.polarity)\n",
    "df[\"subjectivity_\"] = df[\"context_\"].apply(lambda x: TextBlob(x).sentiment.subjectivity)\n",
    "# 16º colunas\n",
    "\n",
    "# Extração das features para o TFIDF\n",
    "vectorizer = TfidfVectorizer()\n",
    "X_vec = vectorizer.fit_transform(df[\"context_\"])\n",
    "#dump(vectorizer, \"vectorizer_tfidf.joblib\")\n",
    "dump(vectorizer, \"models/tawos/{}/vectorizer_tawos_{}_tfidf.joblib\".format(project_name, project_name))\n",
    "\n",
    "df_vec = pd.DataFrame(data = X_vec.toarray(), columns = vectorizer.get_feature_names_out())\n",
    "\n",
    "# Juntando as features do neosp com o tfidf\n",
    "df = df.join(df_vec)\n",
    "X = df\n",
    "\n",
    "############ MbR\n",
    "\n",
    "model = DummyRegressor(strategy=\"mean\")\n",
    "model.fit(X, y)\n",
    "#dump(model, \"model_tawos_aloy_mbr.joblib\")\n",
    "dump(model, \"models/tawos/{}/model_tawos_{}_mbr.joblib\".format(project_name, project_name))\n",
    "\n",
    "############ Mediana\n",
    "\n",
    "model = DummyRegressor(strategy=\"median\")\n",
    "model.fit(X, y)\n",
    "#dump(model, \"model_tawos_aloy_median.joblib\")\n",
    "dump(model, \"models/tawos/{}/model_tawos_{}_median.joblib\".format(project_name, project_name))\n",
    "\n",
    "########### NEOSP-SVR\n",
    "\n",
    "model = svm.SVR()\n",
    "model.fit(X[X.columns[5:16]], y)\n",
    "#dump(model, \"model_tawos_aloy_neosp_svr.joblib\")\n",
    "dump(model, \"models/tawos/{}/model_tawos_{}_neosp_svr.joblib\".format(project_name, project_name))\n",
    "\n",
    "########### NEOSP-LR\n",
    "\n",
    "model = LinearRegression()\n",
    "model.fit(X[X.columns[5:16]], y)\n",
    "#dump(model, \"model_tawos_aloy_neosp_linear.joblib\")\n",
    "dump(model, \"models/tawos/{}/model_tawos_{}_neosp_linear.joblib\".format(project_name, project_name))\n",
    "\n",
    "############ TFIDF-SVM\n",
    "\n",
    "model = svm.SVR()\n",
    "model.fit(X[X.columns[16:]], y)\n",
    "#dump(model, \"model_tawos_aloy_tfidf_svr.joblib\")\n",
    "dump(model, \"models/tawos/{}/model_tawos_{}_tfidf_svr.joblib\".format(project_name, project_name))\n",
    "\n",
    "############ TFIDF-LR\n",
    "\n",
    "model = LinearRegression()\n",
    "model.fit(X[X.columns[16:]], y)\n",
    "#dump(model, \"model_tawos_aloy_tfidf_linear.joblib\")\n",
    "dump(model, \"models/tawos/{}/model_tawos_{}_tfidf_linear.joblib\".format(project_name, project_name))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.11"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}