In [4]:
import pandas as pd
project_name = "APSTUD"

df = pd.read_csv("database\\tawos\\deep\\{}_deep-se.csv".format(project_name))

df.info()


RangeIndex: 476 entries, 0 to 475
Data columns (total 5 columns):
 # Column Non-Null Count Dtype 
--- ------ -------------- ----- 
 0 issuekey 476 non-null object
 1 created 476 non-null object
 2 title 476 non-null object
 3 description 476 non-null object
 4 storypoint 476 non-null int64 
dtypes: int64(1), object(4)
memory usage: 18.7+ KB


# Pré-Processamento

In [5]:
import re
from string import punctuation

def escape_tags_and_content(text):
 """Escape tags and their content containing text, which is not written in natural language, such as code snippets"""

 NO_TEXT_TAGS = "code", "noformat"
 for tag in NO_TEXT_TAGS:
 regex_matching_tag = re.compile("\{%s(.*?)\}(.*?)\{%s\}" % (tag, tag), re.DOTALL)
 text = re.sub(regex_matching_tag, "", text)

 return text

def escape_tags(text):
 """Escape markup tags, but retain their content"""

 ESCAPE_TAGS = "color", "quote", "anchor", "panel"
 for tag in ESCAPE_TAGS:
 text = re.sub("\{%s(.*?)\}" % tag, "", text)

 return text

def escape_strings(text):
 """Escape line breaks, tabulators, slashes and JIRA heading markup symbols"""

 ESCAPE_STRINGS = "\\r", "\\n", "\\t", "\\f", "\\v", "\"", "\\\\", "h1. ", "h2. ", "h3. ", "h4. ", "h5. ", "h6. "
 for escape_string in ESCAPE_STRINGS:
 text = text.replace(escape_string, " ")

 return text

def escape_links(text):
 """Escape external and internal links, recognized by JIRA markup or leading 'http://' or 'https://' """

 LINK_STARTERS = r"\#", r"\^", r"http\:\/\/", r"https\:\/\/", r"malto\:", r"file\:", r"\~"
 for link_starter in LINK_STARTERS:
 text = re.sub("\[(.*?\\|)?%s(.*?)\]" % link_starter, "", text)
 text = re.sub(r"\bhttps?://\S+", "", text)

 return text

def escape_hex_character_codes(text):
 """Escape characters outside the latin alphabet which are converted to hex code representation"""

 return re.sub(r"\\x\w\w", "", text)

def escape_punctuation_boundaries(text):
 """Remove all punctuation marks from the beginning and end of words,
 except for trailing period at the end of words"""

 return " ".join([word.strip(punctuation.replace(".", "")).lstrip(".") for word in text.split()])

def escape_odd_spaces(text):
 """Replace several consequent spaces with one space
 and remove spaces from string start and end"""

 text = re.sub(r"\s+", " ", text)
 text = text.strip()

 return text

# Criação do Modelo

In [6]:
from sklearn.dummy import DummyRegressor
from nltk.corpus import stopwords
from textblob import TextBlob
import textstat
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectKBest
import pandas as pd
from joblib import dump

# carregando os dados
df = pd.read_csv("database\\tawos\\deep\\{}_deep-se.csv".format(project_name))

# criação de uma nova coluna
df["context"] = df["title"] + df["description"]

# pré-processamento
df["context"] = df["context"].apply(lambda x: escape_tags_and_content(x))
df["context"] = df["context"].apply(lambda x: escape_tags(x))
df["context"] = df["context"].apply(lambda x: escape_strings(x))
df["context"] = df["context"].apply(lambda x: escape_links(x))
df["context"] = df["context"].apply(lambda x: escape_hex_character_codes(x))
df["context"] = df["context"].apply(lambda x: escape_punctuation_boundaries(x))
df["context"] = df["context"].apply(lambda x: escape_odd_spaces(x))

# removendo stop-words
stop = stopwords.words('english')
df['context'] = df['context'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

# renomeando as colunas porque senão dá um problema com a extração de features do NEOSP
df = df.rename(columns={ "issuekey": "issuekey_", "created": "created_", "description": "description_", "title": "title_", "context": "context_", "storypoint": "storypoint_"})
y = df["storypoint_"]
df = df.drop(columns=['storypoint_'])

# 5º coluna -> extração das features para o neosp
df["gunning_fog_"] = df['context_'].apply(textstat.gunning_fog)
df["flesch_reading_ease_"] = df['context_'].apply(textstat.flesch_reading_ease)
df["flesch_kincaid_grade_"] = df['context_'].apply(textstat.flesch_kincaid_grade)
df["smog_index_"] = df['context_'].apply(textstat.smog_index)
df["coleman_liau_index_"] = df['context_'].apply(textstat.coleman_liau_index)
df["automated_readability_index_"] = df['context_'].apply(textstat.automated_readability_index)
df["dale_chall_readability_score_"] = df['context_'].apply(textstat.dale_chall_readability_score)
df["difficult_words_"] = df['context_'].apply(textstat.difficult_words)
df["linsear_write_formula_"] = df['context_'].apply(textstat.linsear_write_formula)
df["polarity_"] = df["context_"].apply(lambda x: TextBlob(x).sentiment.polarity)
df["subjectivity_"] = df["context_"].apply(lambda x: TextBlob(x).sentiment.subjectivity)
# 16º colunas

# Extração das features para o TFIDF
vectorizer = TfidfVectorizer()
X_vec = vectorizer.fit_transform(df["context_"])
#dump(vectorizer, "vectorizer_tfidf.joblib")
dump(vectorizer, "models/tawos/{}/vectorizer_tawos_{}_tfidf.joblib".format(project_name, project_name))

df_vec = pd.DataFrame(data = X_vec.toarray(), columns = vectorizer.get_feature_names_out())

# Juntando as features do neosp com o tfidf
df = df.join(df_vec)
X = df

############ MbR

model = DummyRegressor(strategy="mean")
model.fit(X, y)
#dump(model, "model_tawos_aloy_mbr.joblib")
dump(model, "models/tawos/{}/model_tawos_{}_mbr.joblib".format(project_name, project_name))

############ Mediana

model = DummyRegressor(strategy="median")
model.fit(X, y)
#dump(model, "model_tawos_aloy_median.joblib")
dump(model, "models/tawos/{}/model_tawos_{}_median.joblib".format(project_name, project_name))

########### NEOSP-SVR

model = svm.SVR()
model.fit(X[X.columns[5:16]], y)
#dump(model, "model_tawos_aloy_neosp_svr.joblib")
dump(model, "models/tawos/{}/model_tawos_{}_neosp_svr.joblib".format(project_name, project_name))

########### NEOSP-LR

model = LinearRegression()
model.fit(X[X.columns[5:16]], y)
#dump(model, "model_tawos_aloy_neosp_linear.joblib")
dump(model, "models/tawos/{}/model_tawos_{}_neosp_linear.joblib".format(project_name, project_name))

############ TFIDF-SVM

model = svm.SVR()
model.fit(X[X.columns[16:]], y)
#dump(model, "model_tawos_aloy_tfidf_svr.joblib")
dump(model, "models/tawos/{}/model_tawos_{}_tfidf_svr.joblib".format(project_name, project_name))

############ TFIDF-LR

model = LinearRegression()
model.fit(X[X.columns[16:]], y)
#dump(model, "model_tawos_aloy_tfidf_linear.joblib")
dump(model, "models/tawos/{}/model_tawos_{}_tfidf_linear.joblib".format(project_name, project_name))

['models/tawos/APSTUD/model_tawos_APSTUD_tfidf_linear.joblib']