davidlee1102 committed
Commit: d35f33e
1 Parent(s): 7bef4b5
First commit
- .gitattributes +1 -0
- constance_data.py +11 -0
- emotion_model.py +28 -0
- model/nlp_surrey_coursework_hunglenhat/fingerprint.pb +3 -0
- model/nlp_surrey_coursework_hunglenhat/keras_metadata.pb +3 -0
- model/nlp_surrey_coursework_hunglenhat/saved_model.pb +3 -0
- model/nlp_surrey_coursework_hunglenhat/variables/variables.index +0 -0
- model/tokenizer.pickle +3 -0
- pre_processing_data.py +75 -0
.gitattributes
CHANGED
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+model/nlp_surrey_coursework_hunglenhat/variables/variables.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text
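The added line routes the large TensorFlow variables shard through Git LFS. For reference, the same attribute line is what a plain `git lfs track` invocation would append to .gitattributes (a sketch, not part of this commit):

    git lfs track "model/nlp_surrey_coursework_hunglenhat/variables/variables.data-00000-of-00001"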
constance_data.py
ADDED
@@ -0,0 +1,11 @@
+emotion_track_list = ['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity',
+                      'desire',
+                      'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude',
+                      'grief', 'joy', 'love',
+                      'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise',
+                      'neutral']
+
+decode_list = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]
+
+decode_cut_list = [4, 6, 8, 10, 14, 15, 17, 18, 20, 21, 22, 25, 26, 27]
+decode_cut_transformed_list = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
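How these constants fit together (my reading of the code, not stated in the commit): the trained model appears to predict over the 14 classes in decode_cut_transformed_list, and decode_cut_list maps each model output index back to a position in the 28-entry emotion_track_list. A minimal sketch:

    from constance_data import emotion_track_list, decode_cut_list

    model_index = 5  # e.g. argmax over the 14 model outputs
    label = emotion_track_list[decode_cut_list[model_index]]
    print(label)     # decode_cut_list[5] == 15 -> 'gratitude'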
emotion_model.py
ADDED
@@ -0,0 +1,28 @@
+import numpy as np
+import tensorflow as tf
+import tensorflow_addons as tfa
+
+from constance_data import emotion_track_list, decode_cut_list
+from pre_processing_data import preprocessing_data, pre_processing_data_2, text_transform
+
+
+def emotion_predict(sentence: str):
+    lr = 1e-3
+    wd = 1e-4 * lr
+    model = tf.keras.models.load_model("model/nlp_surrey_coursework_hunglenhat")
+    model.compile(loss='sparse_categorical_crossentropy',
+                  optimizer=tfa.optimizers.AdamW(learning_rate=lr, weight_decay=wd), metrics=['accuracy'])
+    # Try the complex pipeline first; fall back to the simple one if it returns nothing.
+    processed = pre_processing_data_2(sentence)
+    if not processed:
+        processed = preprocessing_data(sentence)
+
+    padded = text_transform(processed)
+    try:
+        prediction = model.predict(padded)
+    except Exception as e:
+        print(e)
+        return None
+    index_max = np.argmax(prediction)
+    result = emotion_track_list[decode_cut_list[index_max]]
+    return result
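Usage is a single call; a hedged example (it assumes the model directory and model/tokenizer.pickle exist relative to the working directory, with compatible TensorFlow and tensorflow-addons versions installed):

    from emotion_model import emotion_predict

    label = emotion_predict("Thank you so much, this made my day!")
    print(label)  # one of the labels in emotion_track_list, or None if predict fails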
model/nlp_surrey_coursework_hunglenhat/fingerprint.pb
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c4f4c2e72bf0f471efafd4a4f43d81e20d41f1270ffe840f7dad866e71a223e0
+size 53
model/nlp_surrey_coursework_hunglenhat/keras_metadata.pb
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:45149d48f67806c16c962cfa8db094098ac9e49baf9001e1dc105d7cca8a5384
+size 15056
model/nlp_surrey_coursework_hunglenhat/saved_model.pb
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e48ebc2bab3ee1e455e5ca7dbd40c5b9887d425ec000a75b256b54b8ad5d7396
+size 801857
model/nlp_surrey_coursework_hunglenhat/variables/variables.index
ADDED
Binary file (2.38 kB)
model/tokenizer.pickle
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0a2b3fb1cc1be8d44ba122aa41103dcbee6c6137bd5f4f5f8e7ceecea0d00839
+size 241110
pre_processing_data.py
ADDED
@@ -0,0 +1,75 @@
+import re
+import pickle
+
+import contractions
+import spacy
+import nltk
+
+from nltk.corpus import stopwords
+from nltk.tokenize import RegexpTokenizer
+from keras_preprocessing.sequence import pad_sequences
+
+nltk.download('punkt')
+nltk.download('wordnet')
+nltk.download('omw-1.4')
+nltk.download('stopwords')
+nltk.download('averaged_perceptron_tagger')
+
+# python -m spacy download en_core_web_sm
+nlp = spacy.load("en_core_web_sm")
+
+stop_words = set(stopwords.words('english'))
+
+
+def text_transform(string_text):
+    # Turn a pre-processed string into the padded integer sequence the model expects.
+    with open('model/tokenizer.pickle', 'rb') as handle:
+        loaded_tokenizer = pickle.load(handle)
+    sequences = loaded_tokenizer.texts_to_sequences([string_text])
+    padded_sequences = pad_sequences(sequences, maxlen=50, padding='post', truncating='post')
+    return padded_sequences
+
+
+# Keep only verbs, adjectives, adverbs, and related tags, because the
+# emotion of a sentence depends mostly on these word classes.
+def get_main_words(string_text):
+    tokens = nltk.word_tokenize(string_text)
+    pos_tags = nltk.pos_tag(tokens)
+
+    kept_tags = {'JJR', 'VB', 'WP', 'WRB', 'NNS', 'JJS', 'JJ', 'RB', 'MD', 'VBZ', 'VBG', 'VBP'}
+    string_list = [token for token, tag in pos_tags if tag in kept_tags]
+
+    if string_list:
+        return ' '.join(string_list)
+    return None
+
+
+# Complex pre-processing: lowercase, lemmatize, expand contractions, then keep
+# the main words; fall back to plain stop-word removal if nothing survives.
+def pre_processing_data_2(string_text):
+    string_text = string_text.lower()
+    string_output = ' '.join([token.lemma_ for token in nlp(string_text)])
+    string_output = contractions.fix(string_output)
+
+    string_processed = get_main_words(string_output)
+    if string_processed:
+        tokenizer = RegexpTokenizer(r'\w+')
+        string_processed = tokenizer.tokenize(string_processed)
+        return " ".join(string_processed)
+
+    tokenizer = RegexpTokenizer(r'\w+')
+    string_output = tokenizer.tokenize(string_output)
+    string_output = [w for w in string_output if w not in stop_words]
+    return " ".join(string_output)
+
+
+# Simple pre-processing: lowercase, lemmatize, expand contractions, drop stop words.
+def preprocessing_data(string_text):
+    string_text = string_text.lower()
+    string_output = ' '.join([token.lemma_ for token in nlp(string_text)])
+    string_output = contractions.fix(string_output)
+
+    tokenizer = RegexpTokenizer(r'\w+')
+    string_output = tokenizer.tokenize(string_output)
+    string_output = [w for w in string_output if w not in stop_words]
+    return " ".join(string_output)
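To illustrate the two pipelines (outputs are indicative, not captured from a run):

    from pre_processing_data import pre_processing_data_2, preprocessing_data

    s = "I can't believe how wonderful this is!"
    print(pre_processing_data_2(s))  # keeps emotion-bearing POS tags, e.g. "can not believe how wonderful be"
    print(preprocessing_data(s))     # lemmatize + drop stop words, e.g. "believe wonderful"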