# -*- coding: utf-8 -*-
"""GradioAppTest.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1QhxoNhhM_kcaoQOyz5hsNWLcf2m2L225
"""

# !pip install gradio
# !pip install transformers

import gradio as gr
from transformers import pipeline

"""## JSON"""

# Define the process that the models will be trained for
trainedProcess = "praksa"
trainedProcessJSON = "Praksa"

processes = [
    {
        "name": "Praksa",
        "phases": [
            {
                "name": "Odabir preferencija",
                "alias": ["Prijava prakse", "Odabir zadatka", "Prvi korak"],
                "description": "Odabir preferencija je prvi korak u procesu polaganja prakse. Zahtjeva da student odabere zadatak sa popisa...",
                "duration": "1 mjesec",
            },
            {
                "name": "Ispunjavanje prijavnice",
                "description": "Ispunjavanje prijavnice je drugi korak u procesu polaganja prakse. Student mora ispuniti prijavnicu koja se nalazi na stranici kolegija...",
                "duration": "1 tjedan",
            },
            {
                "name": "Predaja dnevnika prakse",
                "alias": ["Završetak prakse", "Dnevnik"],
                "description": "Predaja dnevnika prakse zadnji je korak u procesu polaganja prakse. S završetkom rada, student predaje dnevnik prakse na stranicu kolegija...",
                "duration": "3 dana",
            },
        ],
        "duration": "2 mjeseca",
    },
    {
        "name": "Izrada završnog rada",
        "phases": [
            {
                "name": "Prijava teme",
                "alias": ["Prvi korak"],
                "description": "Prvi korak u procesu izrade završnog rada je prijava teme. Zahtjeva da student odabere mentora te prijavi temu sa popisa...",
                "duration": "5 dana",
            },
            {
                "name": "Ispuna obrasca",
                "description": "Student ispunjava obrazac sa prijavljenom temom...",
                "duration": "4 dana",
            },
            {
                "name": "Obrana rada",
                "description": "Student brani svoj rad pred komosijom...",
                "duration": "1 sat",
            },
        ],
        "duration": "3 mjeseca",
    },
]

# If a task does not contain the alias property, assign an empty list to it
for process in processes:
    for task in process["phases"]:
        if "alias" not in task:
            task["alias"] = []

"""## User intent recognition model

CPU ~6m

GPU ~3m
"""

# Training configuration: epoch count and number of intent classes
training_epochs = 10
label_size = 6

# Define dataset URL for training
UIDatasetURL = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vSPR-FPTMBcYRynP4JdwYQQ8dAhSx1x8i1LPckUcuIUUlrWT82b5Thqb1bBNnPeGJPxxX1CJAlFSd6F/pub?output=xlsx'

# May require a runtime restart on Google Colab after installing
# !pip install tensorflow_text

# !pip install text-hr

"""### Data loading"""

import tensorflow as tf
import tensorflow_text as tft
import tensorflow_hub as tfh
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Text preprocessor for BERT-based models
preprocessor = tfh.KerasLayer('https://tfhub.dev/google/universal-sentence-encoder-cmlm/multilingual-preprocess/2')

# Language-agnostic BERT sentence encoder (LaBSE)
encoder = tfh.KerasLayer('https://tfhub.dev/google/LaBSE/2')

# Read the data
data = pd.read_excel(UIDatasetURL)

columns = ['text', 'intent', 'process']
data.columns = columns

data = data[data["process"] == trainedProcess].drop(columns="process")

"""#### Category merging"""

# Convert categories to codes
data['intent'] = data['intent'].astype('category')
data['intent_codes'] = data['intent'].cat.codes

# Plot the distribution of intent labels
values = data['intent'].value_counts()
plt.stem(values)

"""#### Normalize data

### Text preprocessing

1. Remove punctuation
2. Lowercase the text
3. Apply tokenization
4. Remove stopwords
5. Apply lemmatizer
"""

import string
import re
import nltk
import text_hr

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

def remove_punctuation(text):
    return "".join([i for i in text if i not in string.punctuation])

def tokenization(text):
    return re.split(r"\s+",text)

stopwords = nltk.corpus.stopwords.words('english')
def remove_stopwords(text):
    return [i for i in text if i not in stopwords]

porter_stemmer = PorterStemmer()
def stemming(text):
    return [porter_stemmer.stem(word) for word in text]

wordnet_lemmatizer = WordNetLemmatizer()
def lemmatizer(text):
    return [wordnet_lemmatizer.lemmatize(word) for word in text]
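
# Quick check of the deterministic string helpers (lemmatization output depends
# on the downloaded NLTK resources, so it is not asserted here):
assert remove_punctuation("Koliko traje praksa?") == "Koliko traje praksa"
assert tokenization("koliko traje praksa") == ["koliko", "traje", "praksa"]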

data['text'] = data['text']\
    .apply(remove_punctuation)\
    .apply(str.lower)\
    .apply(tokenization)\
    .apply(lemmatizer)

stop_words_list_hr = []
for word_base, l_key, cnt, _suff_id, wform_key, wform in text_hr.get_all_std_words():
    if word_base is not None: stop_words_list_hr.append(word_base)
    if wform is not None: stop_words_list_hr.append(wform)

stop_words_list_hr = list(dict.fromkeys(stop_words_list_hr))

def remove_stopwords_hr(text):
    output = [i for i in text if i not in stop_words_list_hr]
    return output

data['text'] = data['text'].apply(remove_stopwords_hr)

data['text'] = data['text'].str.join(" ")

"""### Split validation and training data

Train 75%, validation 25%
"""

codes = data['intent_codes'].unique()

# Lookup table mapping intent codes back to intent labels
CODES_REPR = data[["intent_codes", "intent"]].drop_duplicates().sort_values("intent_codes")


def codeToIntent(prediction) -> str:
    """ Returns the intent of the prediction, not the code """
    return CODES_REPR[CODES_REPR["intent_codes"] == prediction.argmax()].iloc[0]["intent"]
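
# Illustrative: for a softmax output like np.array([0.05, 0.9, 0.05, 0.0, 0.0, 0.0]),
# codeToIntent returns the intent label whose category code is 1 (the argmax).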

preprocessed_validation_data = pd.DataFrame(columns=data.columns)
preprocessed_train_data = pd.DataFrame(columns=data.columns)

for c in codes:
    sample = data[data['intent_codes'] == c]
    sample = sample.sample(frac=1)
    # val = sample.sample(frac=0.25)
    val = sample.sample(frac=0)
    train = pd.concat([sample, val]).drop_duplicates(keep=False)
    # DataFrame.append was removed in pandas 2.0; use pd.concat instead
    preprocessed_validation_data = pd.concat([preprocessed_validation_data, val], ignore_index=True)
    preprocessed_train_data = pd.concat([preprocessed_train_data, train], ignore_index=True)

# Keep only the columns needed for training
train_data_eng = preprocessed_train_data[['text', 'intent_codes']]
validation_data_eng = preprocessed_validation_data[['text', 'intent_codes']]

def df_to_dataset(df, shuffle=True, batch_size=16):
    df = df.copy()
    labels = df.pop('intent_codes')
    labels_cat = tf.keras.utils.to_categorical(labels, label_size)
    dataset = tf.data.Dataset.from_tensor_slices((dict(df), labels_cat))
    if shuffle:
        dataset = dataset.shuffle(buffer_size=len(df))
    dataset = dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return dataset
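
# Each batch yields a (features, labels) pair:
# ({'text': tf.Tensor of shape (batch,)}, one-hot tf.Tensor of shape (batch, label_size))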

# Validation reuses the training frame (see the note above)
_validation = train_data_eng
train_data_eng = df_to_dataset(train_data_eng)

# validation_data_eng = df_to_dataset(validation_data_eng)
validation_data_eng = df_to_dataset(_validation)

"""### Model definition and training

2 epochs training (testing purposes)
"""

# Model builder
def model_build():
    inputs = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
    encoded_input = preprocessor(inputs)
    encoder_outputs = encoder(encoded_input)

    x = encoder_outputs['pooled_output']
    x = tf.keras.layers.Dropout(0.1)(x)
    x = tf.keras.layers.Dense(128, activation='relu')(x)
    x = tf.keras.layers.Dropout(0.7)(x)
    outputs = tf.keras.layers.Dense(label_size, activation='softmax', name='classifier')(x)
    
    return tf.keras.Model(inputs, outputs)

# Build a model with preprocessed data. The output layer already applies
# softmax, so the loss must be computed from probabilities, not logits.
model_eng = model_build()
model_eng.compile(
    optimizer=tf.keras.optimizers.Adam(0.001),
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=False),
    metrics=[tf.keras.metrics.CategoricalAccuracy()],
)

# Batch size is already set in df_to_dataset; Keras rejects an explicit
# batch_size when the input is a tf.data.Dataset
eng_history = model_eng.fit(
    train_data_eng,
    epochs=training_epochs,
    validation_data=validation_data_eng,
)
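
# Optional sketch: plot the training curve (the history key below is the one
# Keras generates for CategoricalAccuracy; adjust if the metric is renamed)
plt.plot(eng_history.history['categorical_accuracy'], label='train accuracy')
plt.legend()
plt.show()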

"""## Data extraction pipeline"""

# !pip install transformers

from transformers import pipeline

pipe = pipeline("token-classification", model="rkrstacic/bpmn-task-extractor")

"""## Sentence similarity"""

# !pip install -U sentence-transformers

import numpy as np
from typing import List, Dict

# Function that extracts the task mention from free text
def predictNER(text: str) -> Dict:
    # Keep only sub-word tokens tagged as part of a task name
    currentString = "".join([x["word"] for x in pipe(text) if x["entity"] != "LABEL_0"])

    # SentencePiece marks word starts with "▁"; restore spaces and drop the leading one
    return {"Task": currentString.replace("▁", " ")[1:]}

from sentence_transformers import SentenceTransformer, util

# Sentence-similarity model used to match a question to a process phase
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

import torch

def getTaskSimilarityIndex(flatIndex: int, tasks) -> int:
    """ Map an index in the flattened task list back to the index of its task """
    for index, task in enumerate(tasks):
        # Each task occupies 1 + len(alias) consecutive slots in the flat list
        if flatIndex <= len(task["alias"]):
            return index

        flatIndex -= len(task["alias"]) + 1

    return -1

def getFlattenTasks(tasks) -> List[str]:
    """ Returns the flattened list of task names, each followed by its aliases """
    resTasks = []

    for task in tasks:
        resTasks.append(task["name"])
        resTasks = resTasks + task["alias"]

    return resTasks
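
# Minimal sanity check of the flatten/index round trip on hypothetical tasks:
_dummy_tasks = [{"name": "A", "alias": ["x", "y"]}, {"name": "B", "alias": []}]
assert getFlattenTasks(_dummy_tasks) == ["A", "x", "y", "B"]
assert getTaskSimilarityIndex(1, _dummy_tasks) == 0  # alias "x" belongs to task 0
assert getTaskSimilarityIndex(3, _dummy_tasks) == 1  # "B" maps back to task 1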

def taskSimilarity(text: str, tasks) -> int:
    """ Returns the index of the task most similar to the text """
    similarities = util.pytorch_cos_sim(
        model.encode(text, convert_to_tensor=True),
        model.encode(getFlattenTasks(tasks), convert_to_tensor=True)
    )
    return getTaskSimilarityIndex(torch.argmax(similarities).item(), tasks)

"""## Using the user intent model"""

def preprocessText(text: str) -> str:
    """ Apply the same preprocessing as was used on the UI model's training data """
    text = remove_punctuation(text)
    text = text.lower()
    text = tokenization(text)
    text = lemmatizer(text)
    text = remove_stopwords_hr(text)

    return " ".join(text)

def predict_intent(text: str) -> str:
    """ Predict the text intent with the model trained above """
    return codeToIntent(model_eng.predict([preprocessText(text)], verbose=False))
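
# Illustrative call (the label depends on the trained weights):
# predict_intent("Koje su faze?")  # expected to map to an intent such as 'P1'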

def getPhases(phases) -> str:
    """ P1: Returns the phase names as a Croatian-style list ("a, b i c") """
    phases = [phase["name"].lower() for phase in phases]
    return ', '.join(phases[:-1]) + ' i ' + phases[-1]
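
# Sanity check on hypothetical phase names ("i" is the Croatian "and"):
assert getPhases([{"name": "A"}, {"name": "B"}, {"name": "C"}]) == "a, b i c"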

# Define functions that handle output text formatting

def getP1String(process) -> str:
    return f"Faze procesa za proces '{process['name']}' su: {getPhases(process['phases'])}"

def getP2String(process) -> str:
    return f"Proces '{process['name']}' traje {process['duration']}"

def getP3String(taskName: str, task) -> str:
    return f"Kratki opis '{taskName}': {task['description']}"

def getP4String(taskName: str, task) -> str:
    return f"Proces '{taskName}' traje {task['duration']}"

def getP5String(taskIndex: int, taskName: str, process) -> str:
    if len(process["phases"]) <= taskIndex + 1:
        return f"'{taskName}' je zadnji korak u procesu '{process['name']}'"
    
    return f"Nakon '{taskName}' je '{process['phases'][taskIndex + 1]['name'].lower()}'"

def getP6String() -> str:
    return "Nažalost, ne razumijem Vaše pitanje"

def print_result(text: str, process) -> str:
    """ Chatbot output messages based on intent """
    intent = predict_intent(text)
    taskIndex = taskSimilarity(text, process["phases"])
    task = process["phases"][taskIndex]
    taskName = task["name"].lower()

    # P1: What are the phases?
    if intent == 'P1':
        return getP1String(process)

    # P2: How long does the whole process take?
    elif intent == 'P2':
        return getP2String(process)

    # P3: How does {task} work? (e.g. "Kako ide odabir preferencija?")
    elif intent == 'P3':
        return getP3String(taskName, task)

    # P4: How long does {task} take?
    elif intent == 'P4':
        return getP4String(taskName, task)

    # P5: What comes after {task}?
    elif intent == 'P5':
        return getP5String(taskIndex, taskName, process)

    # None of the above
    else:
        return getP6String()

def chatbot(input_text) -> str:
    """ By: Rafael Krstačić """
    processName = trainedProcessJSON
    currentProcess = None

    for process in processes:
        if process["name"] == processName:
            currentProcess = process
            break
    else:
        raise KeyError("Process does not exist in processes")

    return print_result(input_text, currentProcess)

"""## Gradio app"""

chatbot("Koliko traje predaja dnevnika prakse")

iface = gr.Interface(
    fn=chatbot,
    inputs="text",
    outputs="text",
    title="Process Chatbot"
)

iface.launch()