Spaces:
Sleeping
Sleeping
Veronika1101
committed on
Commit
•
d15a7ed
1
Parent(s):
f8f4553
Upload 20 files
Browse files- .gitattributes +4 -0
- Data/20182704132259.jpg +0 -0
- Data/c793397a-39df-5ff7-8137-e59568352c11.jpeg +0 -0
- Data/chad_806facbe78804299a9eeeab5fb0a387b_3.png +3 -0
- Data/healthcare_facilities_reviews.csv +3 -0
- Data/healthcare_facilities_reviews.jsonl +3 -0
- Data/maxresdefault.jpg +0 -0
- Models/bert_file.py +21 -0
- Models/bert_strim.py +50 -0
- Models/lstm.py +38 -0
- Models/model_file.py +176 -0
- Models/rnn_preprocessing.py +50 -0
- Models/strim_nlp.py +64 -0
- Models/toxic1.py +44 -0
- Weights/BERTmodel_weights2.pth +3 -0
- Weights/cat_model4.cbm +3 -0
- Weights/final_model_bah.pth +3 -0
- Weights/tfidf_vectorizer.joblib +3 -0
- Weights/vocab_to_int.json +0 -0
- app.py +69 -0
- requirements.txt +24 -0
.gitattributes
CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
Data/chad_806facbe78804299a9eeeab5fb0a387b_3.png filter=lfs diff=lfs merge=lfs -text
|
37 |
+
Data/healthcare_facilities_reviews.csv filter=lfs diff=lfs merge=lfs -text
|
38 |
+
Data/healthcare_facilities_reviews.jsonl filter=lfs diff=lfs merge=lfs -text
|
39 |
+
Weights/cat_model4.cbm filter=lfs diff=lfs merge=lfs -text
|
Data/20182704132259.jpg
ADDED
Data/c793397a-39df-5ff7-8137-e59568352c11.jpeg
ADDED
Data/chad_806facbe78804299a9eeeab5fb0a387b_3.png
ADDED
Git LFS Details
|
Data/healthcare_facilities_reviews.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b329837f76ec5275dc35f7228007a2a55ac62b37f88ad54fef222bd317c8efd3
|
3 |
+
size 79002044
|
Data/healthcare_facilities_reviews.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:74e8cb7f3eb5981b0934f66856123900a0f4c6ca83b1b06704e50deafea2b186
|
3 |
+
size 95300708
|
Data/maxresdefault.jpg
ADDED
Models/bert_file.py
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import AutoModel
|
2 |
+
from torch import nn
|
3 |
+
|
4 |
+
class BERTClassifier(nn.Module):
    """Binary classifier: a frozen pretrained encoder plus a small dense head.

    All encoder parameters have ``requires_grad`` switched off, so only the
    head in ``self.linear`` is trainable. The head ends in a sigmoid, so
    ``forward`` returns values in (0, 1).
    """

    def __init__(self, bert_path="cointegrated/rubert-tiny2"):
        """Load the encoder from *bert_path* and build the classification head."""
        super().__init__()
        self.bert = AutoModel.from_pretrained(bert_path)
        # The encoder is used as a fixed feature extractor.
        for param in self.bert.parameters():
            param.requires_grad = False
        # Read the encoder width from its config instead of hard-coding 312,
        # so a different checkpoint passed as bert_path still fits the head.
        hidden_size = self.bert.config.hidden_size
        # Layer order is kept so saved state-dict keys (linear.0.*, linear.3.*)
        # continue to match.
        self.linear = nn.Sequential(
            nn.Linear(hidden_size, 150),
            nn.Dropout(0.1),
            nn.ReLU(),
            nn.Linear(150, 1),
            nn.Sigmoid()
        )

    def forward(self, x, masks):
        """Return the head's output for token ids *x* and attention mask *masks*.

        The head is fed the vector at sequence position 0 of the encoder's
        first output.
        """
        bert_out = self.bert(x, attention_mask=masks)[0][:, 0, :]
        out = self.linear(bert_out)
        return out
|
Models/bert_strim.py
ADDED
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
from transformers import AutoTokenizer, AutoModel
|
3 |
+
import torch
|
4 |
+
from Models.bert_file import BERTClassifier
|
5 |
+
import numpy as np
|
6 |
+
import time
|
7 |
+
|
8 |
+
tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
|
9 |
+
model = BERTClassifier()
|
10 |
+
device = 'cpu'
|
11 |
+
|
12 |
+
model.load_state_dict(torch.load('Weights/BERTmodel_weights2.pth',map_location=torch.device('cpu')))
|
13 |
+
model.eval()
|
14 |
+
|
15 |
+
@st.cache_data
def predict_sentiment(text):
    """Classify a review with the BERT model and return a display label.

    The text is tokenized to exactly ``MAX_LEN`` tokens (padded or truncated),
    run through the module-level ``model`` without gradient tracking, and the
    rounded output is mapped to one of two Russian label strings. Results are
    cached by Streamlit through ``st.cache_data``.

    Args:
        text: The review text to classify.

    Returns:
        The positive label when the rounded model output equals 1, otherwise
        the negative label.
    """
    MAX_LEN = 100
    # `pad_to_max_length` is deprecated; `padding='max_length'` together with
    # an explicit `truncation=True` states the fixed-length intent directly.
    encoded_review = tokenizer(
        text,
        max_length=MAX_LEN,
        add_special_tokens=True,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    input_ids = encoded_review['input_ids'].to(device)
    attention_mask = encoded_review['attention_mask'].to(device)

    with torch.no_grad():
        output = model(input_ids, attention_mask)
    # A single text is passed in, so the score is read from position [0][0].
    prediction = torch.round(output).cpu().numpy()[0][0]
    if prediction == 1:
        return "Позитивный отзыв 😀"
    else:
        return "Негативный отзыв 😟"
|
37 |
+
|
38 |
+
def bert_model_page():
    """Render the BERT review-classification page.

    Shows a text area and a button. When the button is pressed with non-empty
    text, the prediction and the time it took are written to the page;
    with empty text a prompt asking for input is written instead.
    """
    st.title("Классификация отзывов")
    user_input = st.text_area("Введите отзыв:")

    if not st.button("Классифицировать"):
        return
    if not user_input:
        st.write("Пожалуйста, введите отзыв для классификации.")
        return

    # Time only the prediction call itself.
    started_at = time.time()
    prediction = predict_sentiment(user_input)
    execution_time = time.time() - started_at

    st.write("Результат классификации:", prediction)
    st.write(f'Время предсказания: {execution_time:.4f} секунд')
|
Models/lstm.py
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import torch
|
3 |
+
import re
|
4 |
+
import json
|
5 |
+
import time
|
6 |
+
from nltk.corpus import stopwords
|
7 |
+
from Models.model_file import data_preprocessing, preprocess_single_string, LSTMBahdanauAttention
|
8 |
+
from nltk.corpus import stopwords
|
9 |
+
stop_words = set(stopwords.words('russian'))

# Read the vocabulary with an explicit encoding so loading does not depend on
# the platform's default locale.
with open('Weights/vocab_to_int.json', 'r', encoding='utf-8') as file:
    vocab_to_int = json.load(file)

# Sequence length passed to preprocess_single_string for this model.
SEQ_LEN = 96
model_bah = LSTMBahdanauAttention()
# map_location pins the checkpoint to CPU, matching how the BERT weights are
# loaded, so a checkpoint saved from another device still loads here.
model_bah.load_state_dict(
    torch.load('Weights/final_model_bah.pth', map_location=torch.device('cpu'))
)
model_bah.eval()
|
18 |
+
|
19 |
+
def analyze_sentiment(text):
    """Return the LSTM model's sigmoid score for *text* as a float.

    ``preprocess_single_string`` already runs ``data_preprocessing`` on its
    input, so the raw text is passed straight to it instead of being cleaned
    twice.

    Args:
        text: Raw review text.

    Returns:
        The sigmoid of the model's first output, as a Python float.
    """
    sample = preprocess_single_string(text, SEQ_LEN, vocab_to_int)
    with torch.no_grad():
        # The model returns (output, attention weights); only the output is used.
        logits, _ = model_bah(sample.unsqueeze(0))
    return logits.sigmoid().item()
|
25 |
+
|
26 |
+
def lstm_model_page():
    """Render the LSTM review-classification page.

    Shows a text area and a button. When the button is pressed with non-empty
    text, the review is scored and a positive/negative verdict plus the
    prediction time are written; with empty text a prompt is written instead,
    as on the other classification pages.
    """
    st.title("Классификация отзывов")
    user_input = st.text_area("Введите ваш отзыв:")
    if st.button("Классифицировать"):
        if user_input:
            start_time = time.time()
            probability = analyze_sentiment(user_input)
            end_time = time.time()
            execution_time = end_time - start_time
            if probability > 0.5:
                st.write("Отзыв положительный 🌟")
            else:
                st.write("Отзыв отрицательный 😞")
            # NOTE(review): SOURCE lost its indentation; the timing line is
            # assumed to be reported for both verdicts - confirm.
            st.write(f'Время предсказания: {execution_time:.4f} секунд')
        else:
            st.write("Пожалуйста, введите отзыв для классификации.")
|
Models/model_file.py
ADDED
@@ -0,0 +1,176 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import string
|
3 |
+
import numpy as np
|
4 |
+
import torch
|
5 |
+
import torch.nn as nn
|
6 |
+
from nltk.corpus import stopwords
|
7 |
+
stop_words = set(stopwords.words('russian'))
|
8 |
+
from collections import Counter
|
9 |
+
from gensim.models import Word2Vec
|
10 |
+
import pandas as pd
|
11 |
+
import torch.nn.functional as F
|
12 |
+
|
13 |
+
|
14 |
+
HIDDEN_SIZE = 32
|
15 |
+
SEQ_LEN = 32
|
16 |
+
df = pd.read_json('Data/healthcare_facilities_reviews.jsonl', lines=True)
|
17 |
+
|
18 |
+
def data_preprocessing(text: str) -> str:
    """Normalize a review: lowercase, strip tags, punctuation, stop words, digits.

    Steps, in order: lowercase; remove ``<...>`` spans; remove every character
    listed in ``string.punctuation``; drop words found in the module-level
    ``stop_words`` set; drop words made up only of digits. The remaining
    words are joined with single spaces.
    """
    lowered = text.lower()
    without_tags = re.sub('<.*?>', '', lowered)
    without_punct = without_tags.translate(str.maketrans('', '', string.punctuation))
    kept_words = [
        word
        for word in without_punct.split()
        if word not in stop_words and not word.isdigit()
    ]
    return ' '.join(kept_words)
|
26 |
+
|
27 |
+
contents = df['content'].tolist()
|
28 |
+
preprocessed = [data_preprocessing(content) for content in contents]
|
29 |
+
|
30 |
+
corpus = [word for text in preprocessed for word in text.split()]
|
31 |
+
sorted_words = Counter(corpus).most_common()
|
32 |
+
|
33 |
+
def get_words_by_freq(sorted_words: list[tuple[str, int]], n: int = 10) -> list:
    """Return the ``(word, count)`` pairs whose count is strictly greater than *n*.

    The input order is preserved.
    """
    return [pair for pair in sorted_words if pair[1] > n]
|
35 |
+
|
36 |
+
# Keep only words that occur more than 100 times in the corpus.
sorted_words = get_words_by_freq(sorted_words, 100)

# Word -> integer id. Ids start at 1; padding() fills with zeros, so id 0 is
# never assigned to a word.
vocab_to_int = {w: i + 1 for i, (w, c) in enumerate(sorted_words)}

# Each review as a list of ids, skipping out-of-vocabulary words.
reviews_int = [
    [vocab_to_int[word] for word in text.split() if word in vocab_to_int]
    for text in preprocessed
]

# The same reviews as lists of in-vocabulary words, used to train Word2Vec.
w2v_input = [
    [word for word in review.split() if word in vocab_to_int]
    for review in preprocessed
]

# One extra row for index 0.
VOCAB_SIZE = len(vocab_to_int) + 1

EMBEDDING_DIM = 64

wv = Word2Vec(
    min_count=1,
    vector_size=EMBEDDING_DIM
)
wv.build_vocab(w2v_input)

wv.train(
    corpus_iterable=w2v_input,
    total_examples=wv.corpus_count,
    epochs=10
)
embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))

# Walk over every vocabulary word: if Word2Vec has a vector for it, copy the
# vector into the matrix row for its id; otherwise report the word and move on.
for word, i in vocab_to_int.items():
    try:
        embedding_vector = wv.wv[word]
        embedding_matrix[i] = embedding_vector
    except KeyError as e:
        print(f'{e}: word: {word}')

# Build the pretrained embedding; this layer is not trained in the network.
embedding_layer = nn.Embedding.from_pretrained(torch.FloatTensor(embedding_matrix))
|
83 |
+
|
84 |
+
def data_preprocessing(text: str) -> str:
    """Clean a review string for the vocabulary lookup.

    NOTE(review): this repeats the definition of the same name given earlier
    in this module; being later, it is the one bound after import.

    The text is lowercased, ``<...>`` spans are removed, characters from
    ``string.punctuation`` are dropped, then stop words and digit-only words
    are filtered out. Words are returned joined by single spaces.
    """
    cleaned = re.sub('<.*?>', '', text.lower())  # html tags
    cleaned = ''.join(ch for ch in cleaned if ch not in string.punctuation)
    result = []
    for token in cleaned.split():
        if token in stop_words:
            continue
        if token.isdigit():
            continue
        result.append(token)
    return ' '.join(result)
|
92 |
+
|
93 |
+
|
94 |
+
|
95 |
+
def padding(review_int: list, seq_len: int) -> np.ndarray:
    """Left-pad or truncate each id sequence to exactly *seq_len* entries.

    Args:
        review_int: A list of sequences of integer ids.
        seq_len: Target length of every row.

    Returns:
        An integer array of shape ``(len(review_int), seq_len)``. Sequences
        shorter than *seq_len* are right-aligned with leading zeros; longer
        ones keep only their first *seq_len* ids.
    """
    features = np.zeros((len(review_int), seq_len), dtype=int)
    for i, review in enumerate(review_int):
        trimmed = review[:seq_len]
        # Write the ids into the tail of the already-zeroed row; this avoids
        # building an intermediate list of float zeros.
        if len(trimmed):
            features[i, seq_len - len(trimmed):] = trimmed

    return features
|
106 |
+
|
107 |
+
def preprocess_single_string(
    input_string: str,
    seq_len: int,
    vocab_to_int: dict,
    verbose : bool = False
) -> torch.Tensor:
    """Turn one raw string into a fixed-length tensor of vocabulary ids.

    The string is cleaned with ``data_preprocessing``, each remaining word is
    mapped through *vocab_to_int* (unknown words are skipped), and the id
    list is padded or truncated to *seq_len* with ``padding``.

    Args:
        input_string: Raw text.
        seq_len: Length of the returned tensor.
        vocab_to_int: Mapping from word to integer id.
        verbose: When true, print each word missing from *vocab_to_int*.

    Returns:
        A 1-D tensor of length *seq_len*.
    """
    preprocessed_string = data_preprocessing(input_string)
    result_list = []
    for word in preprocessed_string.split():
        if word in vocab_to_int:
            result_list.append(vocab_to_int[word])
        elif verbose:
            # {word!r} reproduces the quoted form that str(KeyError) printed.
            print(f'{word!r}: not in dictionary!')
    result_padded = padding([result_list], seq_len)[0]

    return torch.tensor(result_padded)
|
125 |
+
|
126 |
+
class BahdanauAttention(nn.Module):
    """Additive attention: scores ``V(tanh(W(query) + U(keys)))`` over a sequence."""

    def __init__(self, hidden_size: int = HIDDEN_SIZE) -> None:
        """Create the three linear projections used to score the keys."""
        super().__init__()
        self.hidden_size = hidden_size
        self.W = nn.Linear(hidden_size, hidden_size)
        self.U = nn.Linear(hidden_size, hidden_size)
        self.V = nn.Linear(hidden_size, 1)
        self.tanh = nn.Tanh()

    def forward(
        self,
        keys: torch.Tensor,   # BATCH_SIZE x SEQ_LEN x HIDDEN_SIZE
        query: torch.Tensor   # BATCH_SIZE x HIDDEN_SIZE
    ):
        """Return ``(context, att_weights)`` for *keys* attended by *query*.

        ``att_weights`` is the softmax of the scores over the sequence axis;
        ``context`` is the weighted sum of *keys* under those weights.
        """
        # BATCH_SIZE x 1 x HIDDEN_SIZE
        projected_query = self.W(query.unsqueeze(1))
        # BATCH_SIZE x SEQ_LEN x HIDDEN_SIZE
        projected_keys = self.U(keys)

        # BATCH_SIZE x SEQ_LEN x 1, then BATCH_SIZE x SEQ_LEN
        energy = torch.tanh(projected_query + projected_keys)
        scores = self.V(energy).squeeze(-1)

        # BATCH_SIZE x SEQ_LEN
        att_weights = F.softmax(scores, dim=1)
        # BATCH_SIZE x HIDDEN_SIZE
        weighted = torch.bmm(att_weights.unsqueeze(1), keys)
        context = weighted.squeeze(1)
        return context, att_weights
|
155 |
+
|
156 |
+
class LSTMBahdanauAttention(nn.Module):
    """LSTM encoder with attention over its outputs and a dense classifier head.

    The embedding is the module-level ``embedding_layer``; the head returns
    one raw (pre-sigmoid) value per sample.
    """

    def __init__(self) -> None:
        """Assemble embedding, LSTM, attention and classifier."""
        super().__init__()

        self.embedding = embedding_layer
        self.lstm = nn.LSTM(EMBEDDING_DIM, HIDDEN_SIZE, batch_first=True)
        self.attn = BahdanauAttention(HIDDEN_SIZE)
        # Layer order is kept so state-dict keys (clf.0.*, clf.3.*) are unchanged.
        head_layers = [
            nn.Linear(HIDDEN_SIZE, 128),
            nn.Dropout(),
            nn.Tanh(),
            nn.Linear(128, 1),
        ]
        self.clf = nn.Sequential(*head_layers)

    def forward(self, x):
        """Return ``(out, att_weights)`` for a batch of id sequences *x*."""
        outputs, (h_n, _) = self.lstm(self.embedding(x))
        # The final hidden state, with its leading axis squeezed, is the query.
        context, att_weights = self.attn(outputs, h_n.squeeze(0))
        return self.clf(context), att_weights
|
Models/rnn_preprocessing.py
ADDED
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import string
|
3 |
+
import numpy as np
|
4 |
+
import torch
|
5 |
+
|
6 |
+
from nltk.corpus import stopwords
|
7 |
+
stop_words = set(stopwords.words('russian'))
|
8 |
+
|
9 |
+
def data_preprocessing(text: str) -> str:
    """Return *text* lowercased and stripped of tags, punctuation, stop words and numbers.

    ``<...>`` spans are removed first, then every character in
    ``string.punctuation``; words in the module-level ``stop_words`` set and
    words consisting only of digits are then dropped. The result is joined
    with single spaces.
    """
    stripped = re.sub('<.*?>', '', text.lower())
    stripped = stripped.translate(str.maketrans('', '', string.punctuation))
    words = (w for w in stripped.split() if w not in stop_words)
    return ' '.join(w for w in words if not w.isdigit())
|
17 |
+
|
18 |
+
def get_words_by_freq(sorted_words: list[tuple[str, int]], n: int = 10) -> list:
    """Keep the ``(word, count)`` entries with ``count > n``, in their original order."""
    frequent = []
    for entry in sorted_words:
        if entry[1] > n:
            frequent.append(entry)
    return frequent
|
20 |
+
|
21 |
+
def padding(review_int: list, seq_len: int) -> np.ndarray:
    """Left-pad or truncate each id sequence to exactly *seq_len* entries.

    Args:
        review_int: A list of sequences of integer ids.
        seq_len: Target length of every row.

    Returns:
        An integer array of shape ``(len(review_int), seq_len)``. Short
        sequences are right-aligned behind leading zeros; long ones are cut
        to their first *seq_len* ids.
    """
    features = np.zeros((len(review_int), seq_len), dtype=int)
    for i, review in enumerate(review_int):
        trimmed = review[:seq_len]
        # The row is already zero-filled, so only the tail needs writing.
        if len(trimmed):
            features[i, seq_len - len(trimmed):] = trimmed

    return features
|
32 |
+
|
33 |
+
def preprocess_single_string(
    input_string: str,
    seq_len: int,
    vocab_to_int: dict,
    verbose : bool = False
) -> torch.Tensor:
    """Convert one raw string into a fixed-length tensor of vocabulary ids.

    The string goes through ``data_preprocessing``; each remaining word found
    in *vocab_to_int* contributes its id, others are skipped. The id list is
    then fitted to *seq_len* by ``padding``.

    Args:
        input_string: Raw text.
        seq_len: Length of the returned tensor.
        vocab_to_int: Mapping from word to integer id.
        verbose: When true, print each word missing from *vocab_to_int*.

    Returns:
        A 1-D tensor of length *seq_len*.
    """
    preprocessed_string = data_preprocessing(input_string)
    result_list = []
    for word in preprocessed_string.split():
        if word in vocab_to_int:
            result_list.append(vocab_to_int[word])
        elif verbose:
            # {word!r} reproduces the quoted form that str(KeyError) printed.
            print(f'{word!r}: not in dictionary!')
    result_padded = padding([result_list], seq_len)[0]

    return torch.tensor(result_padded)
|
Models/strim_nlp.py
ADDED
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import pandas as pd
|
3 |
+
import catboost
|
4 |
+
from catboost import CatBoostClassifier
|
5 |
+
import re
|
6 |
+
import string
|
7 |
+
from nltk.corpus import stopwords
|
8 |
+
from pymystem3 import Mystem
|
9 |
+
from joblib import load
|
10 |
+
import nltk
|
11 |
+
nltk.download('stopwords')
|
12 |
+
import time
|
13 |
+
|
14 |
+
def data_preprocessing(text):
    """Clean a review before lemmatization and vectorization.

    Lowercases the text, deletes ``<...>`` spans, replaces URLs starting with
    ``http``, ``@mentions``, ``#hashtags`` and digit runs with a space,
    removes ``string.punctuation`` characters, and drops Russian stop words.
    """
    stop_words = set(stopwords.words('russian'))
    text = text.lower()
    # Applied in this order; each pattern is replaced by the paired string.
    substitutions = (
        ("<.*?>", ""),
        (r'http\S+', " "),
        (r'@\w+', ' '),
        (r'#\w+', ' '),
        (r'\d+', ' '),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    kept = [word for word in text.split() if word not in stop_words]
    return " ".join(kept)
|
24 |
+
|
25 |
+
def lemmatize_text(text):
    """Lemmatize *text* with Mystem and join the returned lemmas with spaces.

    NOTE(review): a new ``Mystem`` object is constructed on every call.
    """
    return ' '.join(Mystem().lemmatize(text))
|
29 |
+
|
30 |
+
model = CatBoostClassifier()
|
31 |
+
model.load_model('Weights/cat_model4.cbm')
|
32 |
+
|
33 |
+
tfidf_vectorizer = load('Weights/tfidf_vectorizer.joblib')
|
34 |
+
|
35 |
+
def classic_ml_page():
    """Render the CatBoost + TF-IDF review-classification page.

    When the button is pressed with non-empty text, the review is cleaned,
    lemmatized and vectorized, the module-level ``model`` predicts a class,
    and the verdict plus the prediction time are written. With empty text a
    prompt asking for input is written instead.
    """
    st.title("Классификация отзывов")
    user_review = st.text_area("Введите ваш отзыв здесь:")

    if not st.button("Классифицировать"):
        return
    if not user_review:
        st.write("Пожалуйста, введите отзыв для классификации.")
        return

    cleaned = data_preprocessing(user_review)
    features = tfidf_vectorizer.transform([lemmatize_text(cleaned)])

    # Only model.predict is timed; preprocessing is excluded.
    started_at = time.time()
    prediction = model.predict(features)
    execution_time = time.time() - started_at

    verdict = "Позитивный отзыв 😀" if prediction[0] == 1 else "Негативный отзыв 😟"
    st.write(verdict)
    # NOTE(review): SOURCE lost its indentation; the timing line is assumed to
    # be reported for both verdicts - confirm.
    st.write(f'Время предсказания: {execution_time:.4f} секунд')
|
55 |
+
|
56 |
+
|
57 |
+
|
58 |
+
|
59 |
+
|
60 |
+
|
61 |
+
|
62 |
+
|
63 |
+
|
64 |
+
|
Models/toxic1.py
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# toxic.py
|
2 |
+
import streamlit as st
|
3 |
+
import numpy as np
|
4 |
+
import pandas as pd
|
5 |
+
import time
|
6 |
+
import torch
|
7 |
+
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
8 |
+
|
9 |
+
model_t_checkpoint = 'cointegrated/rubert-tiny-toxicity'
|
10 |
+
tokenizer_t = AutoTokenizer.from_pretrained(model_t_checkpoint)
|
11 |
+
model_t = AutoModelForSequenceClassification.from_pretrained(model_t_checkpoint)
|
12 |
+
|
13 |
+
def text2toxicity(text, aggregate=True):
    """Score *text* with the module-level toxicity model.

    Args:
        text: A single string or a list of strings.
        aggregate: When true, collapse the per-label probabilities into one
            score, ``1 - p[first] * (1 - p[last])``; otherwise return the
            per-label sigmoid probabilities.

    Returns:
        For a single string, a scalar (aggregate) or 1-D array; for a list,
        one score or one row per input.
    """
    with torch.no_grad():
        encoded = tokenizer_t(text, return_tensors='pt', truncation=True, padding=True).to('cpu')
        logits = model_t(**encoded).logits
        proba = torch.sigmoid(logits).cpu().numpy()
    # A single string produces a one-row batch; unwrap it.
    if isinstance(text, str):
        proba = proba[0]
    if not aggregate:
        return proba
    # NOTE(review): presumably the first label is "non-toxic" and the last a
    # severity label - confirm against the model card.
    first_label, last_label = proba.T[0], proba.T[-1]
    return 1 - first_label * (1 - last_label)
|
22 |
+
|
23 |
+
def toxicity_page():
    """Render the toxicity-scoring page.

    When the button is pressed with non-empty text, the aggregated toxicity
    score is computed, a verdict (threshold 0.5) and an image are shown, and
    the prediction time is written. With empty text a prompt asking for input
    is written instead, as on the other classification pages.
    """
    st.title("""
Определим токсичный комментарий или нет
""")
    user_text_input = st.text_area('Введите ваш отзыв здесь:')

    if st.button('Предсказать'):
        if user_text_input:
            start_time = time.time()
            proba = text2toxicity(user_text_input, True)
            end_time = time.time()
            prediction_time = end_time - start_time

            if proba >= 0.5:
                st.write(f'Степень токсичности комментария: {round(proba, 2)} – комментарий токсичный.')
                st.image('Data/maxresdefault.jpg')
            else:
                st.write(f'Степень токсичности комментария: {round(proba, 2)} – комментарий не токсичный.')
                st.image('Data/c793397a-39df-5ff7-8137-e59568352c11.jpeg')
            # NOTE(review): SOURCE lost its indentation; the timing line is
            # assumed to be reported for both verdicts - confirm.
            st.write(f'Время предсказания: {prediction_time:.4f} секунд')
        else:
            st.write("Пожалуйста, введите отзыв для классификации.")
|
42 |
+
|
43 |
+
st.markdown("<h3 style='font-size: 18px;'>Ссылка на Токсичный бот</h3>", unsafe_allow_html=True)
|
44 |
+
st.markdown("[Токсичный бот](https://t.me/toxic1101_bot)")
|
Weights/BERTmodel_weights2.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a449261c46bb588503a65b6cb825f996ae1f2e4af24ceacc6b7a94ef9542bdbf
|
3 |
+
size 116986906
|
Weights/cat_model4.cbm
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:27995b1be7aee32a51075d40d154e00d7590e9fec2f2408635cd57d563ac0513
|
3 |
+
size 1135408
|
Weights/final_model_bah.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b4c57591e552f8f7173253da29d0529a8ff5d0875b4fa7017aa111f5e9f87455
|
3 |
+
size 1506113
|
Weights/tfidf_vectorizer.joblib
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:621e7e86acf6a032018e0e5ebf0876579f4f846478a70e782eb3c476298c088f
|
3 |
+
size 1750676
|
Weights/vocab_to_int.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
app.py
ADDED
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import torch
|
3 |
+
import requests
|
4 |
+
import time
|
5 |
+
import numpy as np
|
6 |
+
import os
|
7 |
+
from Models.toxic1 import toxicity_page
|
8 |
+
from Models.strim_nlp import classic_ml_page
|
9 |
+
from Models.lstm import lstm_model_page
|
10 |
+
from Models.bert_strim import bert_model_page
|
11 |
+
import base64
|
12 |
+
import pandas as pd
|
13 |
+
|
14 |
+
background_image = 'Data/chad_806facbe78804299a9eeeab5fb0a387b_3.png'
# Read the image inside a context manager so the file handle is closed.
with open(background_image, "rb") as image_file:
    encoded_background = base64.b64encode(image_file.read()).decode()

# Inject the image as a base64 data URI; the file is a PNG, so the MIME type
# is image/png.
st.markdown(
    f"""
<style>
.reportview-container {{
background: url(data:image/png;base64,{encoded_background});
background-size: cover;
}}
</style>
""", unsafe_allow_html=True
)
|
25 |
+
def app_description_page():
    """Render the landing page: project description, model list, F1 table, image."""
    st.title("Welcome to My App!")
    st.markdown("<h3 style='font-size: 18px;'>This is a Streamlit application where you can explore four different models.</h3>", unsafe_allow_html=True)
    st.markdown("<h3 style='font-size: 18px;'>About the project:</h3>", unsafe_allow_html=True)
    st.markdown("<h3 style='font-size: 18px;'>The task is to train 3 different models on a dataset that contains reviews about the clinic.</h3>", unsafe_allow_html=True)
    st.markdown("<h3 style='font-size: 18px;'>You can write text and the model will classify it as “Negative” or “Positive”</h3>", unsafe_allow_html=True)
    # F1 scores shown in the table below, one per model.
    data = {
        "Model": ["CatBoostClassifier", "LSTM", "Rubert-tiny2", "Rubert-tiny-toxicity"],
        "F1 metric": [0.87, 0.94, 0.90, 0.84]
    }
    df = pd.DataFrame(data)
    st.markdown("<h3 style='font-size: 18px;'>Models:</h3>", unsafe_allow_html=True)
    st.markdown("<h3 style='font-size: 18px;'>1. CatBoostClassifier trained on TF-IDF </h3>", unsafe_allow_html=True)
    st.markdown("<h3 style='font-size: 18px;'>2. LSTM with BahdanauAttention </h3>", unsafe_allow_html=True)
    st.markdown("<h3 style='font-size: 18px;'>3. Rubert-tiny2 </h3>", unsafe_allow_html=True)
    st.markdown("<h3 style='font-size: 18px;'>4. Rubert-tiny-toxicity </h3>", unsafe_allow_html=True)
    st.dataframe(df)
    # The image lives under Data/ like the other assets; the bare filename
    # did not resolve from the app's working directory.
    st.image('Data/20182704132259.jpg', use_column_width=True)
|
43 |
+
|
44 |
+
def model_selection_page():
    """Let the user pick a model in the sidebar and render that model's page.

    After the chosen page is rendered, a line naming the selection is written.
    """
    st.sidebar.title("Model Selection")
    selected_model = st.sidebar.radio("Select a model", ("Classic ML", "LSTM", "BERT"))

    # Radio label -> page renderer.
    renderers = {
        "Classic ML": classic_ml_page,
        "LSTM": lstm_model_page,
        "BERT": bert_model_page,
    }
    render = renderers.get(selected_model)
    if render is not None:
        render()
        st.write(f"You selected {selected_model}.")
|
57 |
+
|
58 |
+
def main():
    """Dispatch to the page chosen in the sidebar's top-level navigation."""
    page = st.sidebar.radio("Go to", ("App Description", "Model Selection", "Toxicity Model"))

    # Navigation label -> page function.
    routes = {
        "App Description": app_description_page,
        "Model Selection": model_selection_page,
        "Toxicity Model": toxicity_page,
    }
    handler = routes.get(page)
    if handler is not None:
        handler()
|
67 |
+
|
68 |
+
if __name__ == "__main__":
|
69 |
+
main()
|
requirements.txt
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
cachetools
|
2 |
+
catboost
|
3 |
+
charset-normalizer
|
4 |
+
cycler
|
5 |
+
gensim
|
6 |
+
GitPython
|
7 |
+
graphviz
|
8 |
+
huggingface-hub
|
9 |
+
joblib
|
10 |
+
markdown-it-py
|
11 |
+
networkx
|
12 |
+
nltk
|
13 |
+
numpy
|
14 |
+
pandas
|
15 |
+
pillow
|
16 |
+
requests
|
17 |
+
scikit-learn
|
18 |
+
streamlit
|
19 |
+
sympy
|
20 |
+
torch
|
21 |
+
tqdm
|
22 |
+
transformers
|
23 |
+
pymystem3
|
24 |
+
scipy==1.10.1
|