Spaces:

Veronika1101
/

Nlp_proj

Sleeping

App Files Files Community

Veronika1101 committed on Apr 12

Commit

d15a7ed

•

1 Parent(s): f8f4553

Upload 20 files

Browse files

Files changed (21) hide show

.gitattributes +4 -0
Data/20182704132259.jpg +0 -0
Data/c793397a-39df-5ff7-8137-e59568352c11.jpeg +0 -0
Data/chad_806facbe78804299a9eeeab5fb0a387b_3.png +3 -0
Data/healthcare_facilities_reviews.csv +3 -0
Data/healthcare_facilities_reviews.jsonl +3 -0
Data/maxresdefault.jpg +0 -0
Models/bert_file.py +21 -0
Models/bert_strim.py +50 -0
Models/lstm.py +38 -0
Models/model_file.py +176 -0
Models/rnn_preprocessing.py +50 -0
Models/strim_nlp.py +64 -0
Models/toxic1.py +44 -0
Weights/BERTmodel_weights2.pth +3 -0
Weights/cat_model4.cbm +3 -0
Weights/final_model_bah.pth +3 -0
Weights/tfidf_vectorizer.joblib +3 -0
Weights/vocab_to_int.json +0 -0
app.py +69 -0
requirements.txt +24 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+Data/chad_806facbe78804299a9eeeab5fb0a387b_3.png filter=lfs diff=lfs merge=lfs -text
+Data/healthcare_facilities_reviews.csv filter=lfs diff=lfs merge=lfs -text
+Data/healthcare_facilities_reviews.jsonl filter=lfs diff=lfs merge=lfs -text
+Weights/cat_model4.cbm filter=lfs diff=lfs merge=lfs -text

Data/20182704132259.jpg ADDED Viewed

Data/c793397a-39df-5ff7-8137-e59568352c11.jpeg ADDED Viewed

Data/chad_806facbe78804299a9eeeab5fb0a387b_3.png ADDED Viewed

Git LFS Details

SHA256: 835384bd33c9055373d50ba319e0d38f9411ae6c9867f69b6fc017fc3aa220f5
Pointer size: 132 Bytes
Size of remote file: 1.14 MB

Data/healthcare_facilities_reviews.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b329837f76ec5275dc35f7228007a2a55ac62b37f88ad54fef222bd317c8efd3
+size 79002044

Data/healthcare_facilities_reviews.jsonl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:74e8cb7f3eb5981b0934f66856123900a0f4c6ca83b1b06704e50deafea2b186
+size 95300708

Data/maxresdefault.jpg ADDED Viewed

Models/bert_file.py ADDED Viewed

	@@ -0,0 +1,21 @@

+from transformers import AutoModel
+from torch import nn
+class BERTClassifier(nn.Module):
+    def __init__(self, bert_path="cointegrated/rubert-tiny2"):
+        super().__init__()
+        self.bert = AutoModel.from_pretrained(bert_path)
+        for param in self.bert.parameters():
+            param.requires_grad = False
+        self.linear = nn.Sequential(
+            nn.Linear(312, 150),
+            nn.Dropout(0.1),
+            nn.ReLU(),
+            nn.Linear(150, 1),
+            nn.Sigmoid()
+        )
+    def forward(self, x, masks):
+        bert_out = self.bert(x, attention_mask=masks)[0][:, 0, :]
+        out = self.linear(bert_out)
+        return out

Models/bert_strim.py ADDED Viewed

	@@ -0,0 +1,50 @@

+import streamlit as st
+from transformers import AutoTokenizer, AutoModel
+import torch
+from Models.bert_file import BERTClassifier
+import numpy as np
+import time
+tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
+model = BERTClassifier()
+device = 'cpu'
+model.load_state_dict(torch.load('Weights/BERTmodel_weights2.pth',map_location=torch.device('cpu')))
+model.eval()
+@st.cache_data
+def predict_sentiment(text):
+    MAX_LEN = 100
+    encoded_review = tokenizer.encode_plus(
+        text,
+        max_length=MAX_LEN,
+        add_special_tokens=True,
+        return_token_type_ids=False,
+        pad_to_max_length=True,
+        return_attention_mask=True,
+        return_tensors='pt',
+    )
+    input_ids = encoded_review['input_ids'].to(device)
+    attention_mask = encoded_review['attention_mask'].to(device)
+    with torch.no_grad():
+        output = model(input_ids, attention_mask)
+        prediction = torch.round(output).cpu().numpy()[0][0]
+    if prediction == 1:
+        return "Позитивный отзыв 😀"
+    else:
+        return "Негативный отзыв 😟"
+def bert_model_page():
+    st.title("Классификация отзывов")
+    user_input = st.text_area("Введите отзыв:")
+    if st.button("Классифицировать"):
+        if user_input:
+            start_time = time.time()
+            prediction = predict_sentiment(user_input)
+            end_time = time.time()
+            execution_time = end_time - start_time
+            st.write("Результат классификации:", prediction)
+            st.write(f'Время предсказания: {execution_time:.4f} секунд')
+        else:
+            st.write("Пожалуйста, введите отзыв для классификации.")

Models/lstm.py ADDED Viewed

	@@ -0,0 +1,38 @@

+import streamlit as st
+import torch
+import re
+import json
+import time
+from nltk.corpus import stopwords
+from Models.model_file import data_preprocessing, preprocess_single_string, LSTMBahdanauAttention
+from nltk.corpus import stopwords
+stop_words = set(stopwords.words('russian'))
+with open('Weights/vocab_to_int.json', 'r') as file:
+    vocab_to_int = json.load(file)
+SEQ_LEN = 96
+model_bah = LSTMBahdanauAttention()
+model_bah.load_state_dict(torch.load('Weights/final_model_bah.pth'))
+model_bah.eval()
+def analyze_sentiment(text):
+    preprocessed_text = data_preprocessing(text)
+    sample = preprocess_single_string(preprocessed_text, SEQ_LEN, vocab_to_int)
+    with torch.no_grad():
+        probability = model_bah(sample.unsqueeze(0))[0].sigmoid().item()
+        return probability
+def lstm_model_page():
+    st.title("Классификация отзывов")
+    user_input = st.text_area("Введите ваш отзыв:")
+    if st.button("Классифицировать"):
+        start_time = time.time()
+        probability = analyze_sentiment(user_input)
+        end_time = time.time()
+        execution_time = end_time - start_time
+        if probability > 0.5:
+            st.write("Отзыв положительный 🌟")
+        else:
+            st.write("Отзыв отрицательный 😞")
+        st.write(f'Время предсказания: {execution_time:.4f} секунд')

Models/model_file.py ADDED Viewed

	@@ -0,0 +1,176 @@

+import re
+import string
+import numpy as np
+import torch
+import torch.nn as nn
+from nltk.corpus import stopwords
+stop_words = set(stopwords.words('russian'))
+from collections import Counter
+from gensim.models import Word2Vec
+import pandas as pd
+import torch.nn.functional as F
+HIDDEN_SIZE = 32
+SEQ_LEN = 32
+df = pd.read_json('Data/healthcare_facilities_reviews.jsonl', lines=True)
+def data_preprocessing(text: str) -> str:
+    text = text.lower()
+    text = re.sub('<.*?>', '', text) # html tags
+    text = ''.join([c for c in text if c not in string.punctuation])# Remove punctuation
+    text = ' '.join([word for word in text.split() if word not in stop_words])
+    text = [word for word in text.split() if not word.isdigit()]
+    text = ' '.join(text)
+    return text
+contents = df['content'].tolist()
+preprocessed = [data_preprocessing(content) for content in contents]
+corpus = [word for text in preprocessed for word in text.split()]
+sorted_words = Counter(corpus).most_common()
+def get_words_by_freq(sorted_words: list[tuple[str, int]], n: int = 10) -> list:
+    return list(filter(lambda x: x[1] > n, sorted_words))
+sorted_words = get_words_by_freq(sorted_words, 100)
+sorted_words[-10:]
+vocab_to_int = {w:i+1 for i, (w,c) in enumerate(sorted_words)}
+reviews_int = []
+for text in preprocessed:
+    r = [vocab_to_int[word] for word in text.split() if vocab_to_int.get(word)]
+    reviews_int.append(r)
+w2v_input = []
+for review in preprocessed:
+    cur_review = []
+    for word in review.split():
+        if vocab_to_int.get(word):
+            cur_review.append(word)
+    w2v_input.append(cur_review)
+VOCAB_SIZE = len(vocab_to_int) + 1
+EMBEDDING_DIM = 64
+wv = Word2Vec(
+    min_count=1,
+    vector_size=EMBEDDING_DIM
+    )
+wv.build_vocab(w2v_input)
+wv.train(
+    corpus_iterable=w2v_input,
+    total_examples=wv.corpus_count,
+    epochs=10
+    )
+embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))
+# Бежим по всем словам словаря: если слово есть, достаем его вектор
+# если слова нет, то распечатываем его и пропускаем
+for word, i in vocab_to_int.items():
+    try:
+        embedding_vector = wv.wv[word]
+        embedding_matrix[i] = embedding_vector
+    except KeyError as e:
+        pass
+        print(f'{e}: word: {word}')
+# Создаем предобученный эмбеддинг – этот слой в нашей сети обучаться не будет
+embedding_layer = nn.Embedding.from_pretrained(torch.FloatTensor(embedding_matrix))
+def data_preprocessing(text: str) -> str:
+    text = text.lower()
+    text = re.sub('<.*?>', '', text) # html tags
+    text = ''.join([c for c in text if c not in string.punctuation])# Remove punctuation
+    text = ' '.join([word for word in text.split() if word not in stop_words])
+    text = [word for word in text.split() if not word.isdigit()]
+    text = ' '.join(text)
+    return text
+def padding(review_int: list, seq_len: int) -> np.array: # type: ignore
+    features = np.zeros((len(review_int), seq_len), dtype = int)
+    for i, review in enumerate(review_int):
+        if len(review) <= seq_len:
+            zeros = list(np.zeros(seq_len - len(review)))
+            new = zeros + review
+        else:
+            new = review[: seq_len]
+        features[i, :] = np.array(new)
+    return features
+def preprocess_single_string(
+    input_string: str,
+    seq_len: int,
+    vocab_to_int: dict,
+    verbose : bool = False
+    ) -> torch.tensor:
+    preprocessed_string = data_preprocessing(input_string)
+    result_list = []
+    for word in preprocessed_string.split():
+        try:
+            result_list.append(vocab_to_int[word])
+        except KeyError as e:
+            if verbose:
+                print(f'{e}: not in dictionary!')
+            pass
+    result_padded = padding([result_list], seq_len)[0]
+    return torch.tensor(result_padded)
+class BahdanauAttention(nn.Module):
+    def __init__(
+    self,
+    hidden_size: int = HIDDEN_SIZE
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.W = nn.Linear(hidden_size, hidden_size)
+        self.U = nn.Linear(hidden_size, hidden_size)
+        self.V = nn.Linear(hidden_size, 1)
+        self.tanh = nn.Tanh()
+    def forward(
+        self,
+        keys: torch.Tensor, # BATCH_SIZE x SEQ_LEN x HIDDEN_SIZE
+        query: torch.Tensor # BATCH_SIZE x HIDDEN_SIZE
+        ):
+        query = query.unsqueeze(1) # BATCH_SIZE x 1 x HIDDEN_SIZE
+        r_query = self.W(query) # BATCH_SIZE x 1 x HIDDEN_SIZE
+        r_keys = self.U(keys) # BATCH_SIZE x SEQ_LEN x HIDDEN_SIZE
+        scores = self.V(torch.tanh(r_query + r_keys)) # BATCH_SIZE x SEQ_LEN x 1
+        scores = scores.squeeze(-1) # BATCH_SIZE x SEQ_LEN
+        att_weights = F.softmax(scores, dim=1) # BATCH_SIZE x SEQ_LEN
+        context = torch.bmm(att_weights.unsqueeze(1), keys).squeeze(1) # BATCH_SIZE x HIDDEN_SIZE
+        return context, att_weights
+class LSTMBahdanauAttention(nn.Module):
+    def __init__(self) -> None:
+        super().__init__()
+        # self.embedding = nn.Embedding(VOCAB_SIZE, EMBEDDING_DIM)
+        self.embedding = embedding_layer
+        self.lstm = nn.LSTM(EMBEDDING_DIM, HIDDEN_SIZE, batch_first=True)
+        self.attn = BahdanauAttention(HIDDEN_SIZE)
+        self.clf = nn.Sequential(
+        nn.Linear(HIDDEN_SIZE, 128),
+        nn.Dropout(),
+        nn.Tanh(),
+        nn.Linear(128, 1)
+        )
+    def forward(self, x):
+        embeddings = self.embedding(x)
+        outputs, (h_n, _) = self.lstm(embeddings)
+        context, att_weights = self.attn(outputs, h_n.squeeze(0))
+        out = self.clf(context)
+        return out, att_weights

Models/rnn_preprocessing.py ADDED Viewed

	@@ -0,0 +1,50 @@

+import re
+import string
+import numpy as np
+import torch
+from nltk.corpus import stopwords
+stop_words = set(stopwords.words('russian'))
+def data_preprocessing(text: str) -> str:
+    text = text.lower()
+    text = re.sub('<.*?>', '', text)
+    text = ''.join([c for c in text if c not in string.punctuation])
+    text = ' '.join([word for word in text.split() if word not in stop_words])
+    text = [word for word in text.split() if not word.isdigit()]
+    text = ' '.join(text)
+    return text
+def get_words_by_freq(sorted_words: list[tuple[str, int]], n: int = 10) -> list:
+    return list(filter(lambda x: x[1] > n, sorted_words))
+def padding(review_int: list, seq_len: int) -> np.array:
+    features = np.zeros((len(review_int), seq_len), dtype = int)
+    for i, review in enumerate(review_int):
+        if len(review) <= seq_len:
+            zeros = list(np.zeros(seq_len - len(review)))
+            new = zeros + review
+        else:
+            new = review[: seq_len]
+        features[i, :] = np.array(new)
+    return features
+def preprocess_single_string(
+    input_string: str,
+    seq_len: int,
+    vocab_to_int: dict,
+    verbose : bool = False
+    ) -> torch.tensor:
+    preprocessed_string = data_preprocessing(input_string)
+    result_list = []
+    for word in preprocessed_string.split():
+        try:
+            result_list.append(vocab_to_int[word])
+        except KeyError as e:
+            if verbose:
+                print(f'{e}: not in dictionary!')
+            pass
+    result_padded = padding([result_list], seq_len)[0]
+    return torch.tensor(result_padded)

Models/strim_nlp.py ADDED Viewed

	@@ -0,0 +1,64 @@

+import streamlit as st
+import pandas as pd
+import catboost
+from catboost import CatBoostClassifier
+import re
+import string
+from nltk.corpus import stopwords
+from pymystem3 import Mystem
+from joblib import load
+import nltk
+nltk.download('stopwords')
+import time
+def data_preprocessing(text):
+    stop_words = set(stopwords.words('russian'))
+    text = text.lower()
+    text = re.sub("<.*?>", "", text)
+    text = re.sub(r'http\S+', " ", text)
+    text = re.sub(r'@\w+', ' ', text)
+    text = re.sub(r'#\w+', ' ', text)
+    text = re.sub(r'\d+', ' ', text)
+    text = "".join([c for c in text if c not in string.punctuation])
+    return " ".join([word for word in text.split() if word not in stop_words])
+def lemmatize_text(text):
+    mystem = Mystem()
+    lemmas = mystem.lemmatize(text)
+    return ' '.join(lemmas)
+# Load the CatBoost model and the TF-IDF vectorizer from Weights/ once, when
+# this module is imported.
+model = CatBoostClassifier()
+model.load_model('Weights/cat_model4.cbm')
+tfidf_vectorizer = load('Weights/tfidf_vectorizer.joblib')
+def classic_ml_page():
+    st.title("Классификация отзывов")
+    user_review = st.text_area("Введите ваш отзыв здесь:")
+    if st.button("Классифицировать"):
+        if user_review:
+            preprocessed_review = data_preprocessing(user_review)
+            lemmatized_review = lemmatize_text(preprocessed_review)
+            vectorized_review = tfidf_vectorizer.transform([lemmatized_review])
+            start_time = time.time()
+            prediction = model.predict(vectorized_review)
+            end_time = time.time()
+            execution_time = end_time - start_time
+            if prediction[0] == 1:
+                st.write("Позитивный отзыв 😀")
+            else:
+                st.write("Негативный отзыв 😟")
+            st.write(f'Время предсказания: {execution_time:.4f} секунд')
+        else:
+            st.write("Пожалуйста, введите отзыв для классификации.")

Models/toxic1.py ADDED Viewed

	@@ -0,0 +1,44 @@

+# toxic.py
+import streamlit as st
+import numpy as np
+import pandas as pd
+import time
+import torch
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+# Checkpoint id shared by the tokenizer and the classifier; both are loaded
+# once, when this module is imported.
+model_t_checkpoint = 'cointegrated/rubert-tiny-toxicity'
+tokenizer_t = AutoTokenizer.from_pretrained(model_t_checkpoint)
+model_t = AutoModelForSequenceClassification.from_pretrained(model_t_checkpoint)
+def text2toxicity(text, aggregate=True):
+    with torch.no_grad():
+        inputs = tokenizer_t(text, return_tensors='pt', truncation=True, padding=True).to('cpu')
+        proba = torch.sigmoid(model_t(**inputs).logits).cpu().numpy()
+    if isinstance(text, str):
+        proba = proba[0]
+    if aggregate:
+        return 1 - proba.T[0] * (1 - proba.T[-1])
+    return proba
+def toxicity_page():
+    st.title("""
+    Определим токсичный комментарий или нет
+    """)
+    user_text_input = st.text_area('Введите ваш отзыв здесь:')
+    if st.button('Предсказать'):
+        start_time = time.time()
+        proba = text2toxicity(user_text_input, True)
+        end_time = time.time()
+        prediction_time = end_time - start_time
+        if proba >= 0.5:
+            st.write(f'Степень токсичности комментария: {round(proba, 2)} – комментарий токсичный.')
+            st.image('Data/maxresdefault.jpg')
+        else:
+            st.write(f'Степень токсичности комментария: {round(proba, 2)} – комментарий не токсичный.')
+            st.image('Data/c793397a-39df-5ff7-8137-e59568352c11.jpeg')
+        st.write(f'Время предсказания: {prediction_time:.4f} секунд')
+    st.markdown("<h3 style='font-size: 18px;'>Ссылка на Токсичный бот</h3>", unsafe_allow_html=True)
+    st.markdown("[Токсичный бот](https://t.me/toxic1101_bot)")

Weights/BERTmodel_weights2.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a449261c46bb588503a65b6cb825f996ae1f2e4af24ceacc6b7a94ef9542bdbf
+size 116986906

Weights/cat_model4.cbm ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:27995b1be7aee32a51075d40d154e00d7590e9fec2f2408635cd57d563ac0513
+size 1135408

Weights/final_model_bah.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b4c57591e552f8f7173253da29d0529a8ff5d0875b4fa7017aa111f5e9f87455
+size 1506113

Weights/tfidf_vectorizer.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:621e7e86acf6a032018e0e5ebf0876579f4f846478a70e782eb3c476298c088f
+size 1750676

Weights/vocab_to_int.json ADDED Viewed

The diff for this file is too large to render. See raw diff

app.py ADDED Viewed

	@@ -0,0 +1,69 @@

+import streamlit as st
+import torch
+import requests
+import time
+import numpy as np
+import os
+from Models.toxic1 import toxicity_page
+from Models.strim_nlp import classic_ml_page
+from Models.lstm import lstm_model_page
+from Models.bert_strim import bert_model_page
+import base64
+import pandas as pd
+background_image = 'Data/chad_806facbe78804299a9eeeab5fb0a387b_3.png'
+st.markdown(
+    f"""
+    <style>
+    .reportview-container {{
+        background: url(data:image/jpeg;base64,{base64.b64encode(open(background_image, "rb").read()).decode()});
+        background-size: cover;
+    }}
+    </style>
+    """, unsafe_allow_html=True
+)
+def app_description_page():
+    st.title("Welcome to My App!")
+    st.markdown("<h3 style='font-size: 18px;'>This is a Streamlit application where you can explore four different models.</h3>", unsafe_allow_html=True)
+    st.markdown("<h3 style='font-size: 18px;'>About the project:</h3>", unsafe_allow_html=True)
+    st.markdown("<h3 style='font-size: 18px;'>The task is to train 3 different models on a dataset that contains reviews about the clinic.</h3>", unsafe_allow_html=True)
+    st.markdown("<h3 style='font-size: 18px;'>You can write text and the model will classify it as “Negative” or “Positive”</h3>", unsafe_allow_html=True)
+    data = {
+    "Model": ["CatBoostClassifier", "LSTM", "Rubert-tiny2", "Rubert-tiny-toxicity"],
+    "F1 metric": [0.87, 0.94, 0.90, 0.84]
+    }
+    df = pd.DataFrame(data)
+    st.markdown("<h3 style='font-size: 18px;'>Models:</h3>", unsafe_allow_html=True)
+    st.markdown("<h3 style='font-size: 18px;'>1. CatBoostClassifier trained on TF-IDF </h3>", unsafe_allow_html=True)
+    st.markdown("<h3 style='font-size: 18px;'>2. LSTM with BahdanauAttention </h3>", unsafe_allow_html=True)
+    st.markdown("<h3 style='font-size: 18px;'>3. Rubert-tiny2 </h3>", unsafe_allow_html=True)
+    st.markdown("<h3 style='font-size: 18px;'>4. Rubert-tiny-toxicity </h3>", unsafe_allow_html=True)
+    st.dataframe(df)
+    st.image('20182704132259.jpg', use_column_width=True)
+def model_selection_page():
+    st.sidebar.title("Model Selection")
+    selected_model = st.sidebar.radio("Select a model", ("Classic ML", "LSTM", "BERT"))
+    if selected_model == "Classic ML":
+        classic_ml_page()
+        st.write("You selected Classic ML.")
+    elif selected_model == "LSTM":
+        lstm_model_page()
+        st.write("You selected LSTM.")
+    elif selected_model == "BERT":
+        bert_model_page()
+        st.write("You selected BERT.")
+def main():
+    page = st.sidebar.radio("Go to", ("App Description", "Model Selection", "Toxicity Model"))
+    if page == "App Description":
+        app_description_page()
+    elif page == "Model Selection":
+        model_selection_page()
+    elif page == "Toxicity Model":
+        toxicity_page()
+if __name__ == "__main__":
+    main()

requirements.txt ADDED Viewed

	@@ -0,0 +1,24 @@

+cachetools
+catboost
+charset-normalizer
+cycler
+gensim
+GitPython
+graphviz
+huggingface-hub
+joblib
+markdown-it-py
+networkx
+nltk
+numpy
+pandas
+pillow
+requests
+scikit-learn
+streamlit
+sympy
+torch
+tqdm
+transformers
+pymystem3
+scipy==1.10.1