Spaces:

Project-nlp
/

nlp-bert-team

Sleeping

App Files Files Community

VerVelVel committed on May 30

Commit

169057e

•

1 Parent(s): 9f7ee75

logreg and toxic bert

Browse files

Files changed (8) hide show

Hello.py +30 -0
images/pipeline_logreg.png +0 -0
images/toxity_metrics.png +0 -0
models/model1/logistic_regression_pipeline.pkl +3 -0
models/model1/model_weights.pth +3 -0
models/sds +0 -0
notebooks/first_ml.ipynb +1539 -0
pages/policlinic.py +15 -0

Hello.py ADDED Viewed

	@@ -0,0 +1,30 @@

+import streamlit as st
+st.set_page_config(
+    page_title="Hello",
+    page_icon="👋",
+)
+st.write("# Добро пожаловать на страничку нашего проекта! 👋")
+st.sidebar.success("Выберите интересующую вас задачу.")
+st.markdown(
+    """
+    **👈 Выберите интересующую вас задачу и наши модели постараются вам помочь!**
+    ### Что можно найти в этом сервисе?
+    - Страницу, позволяющую выполнить классификацию отзыва на поликлиники (при помощи трех различных моделей)
+    - Страницу, позволяющую выполнить оценку степени токсичности пользовательского сообщения с помощью модели rubert-tiny-toxicity
+    - Страницу, позволяющую выполнить генерацию текста GPT-моделью по пользовательскому prompt
+    - Страницу с информацией о:
+    - - процессе обучения модели: кривые обучения и метрик
+    - - времени обучения
+    - - значениях метрик
+    ### Над проектом трудились:
+    - [Даша](https://github.com/Dasha0203)
+    - [Вера](https://github.com/VerVelVel)
+"""
+)

images/pipeline_logreg.png ADDED Viewed

images/toxity_metrics.png ADDED Viewed

models/model1/logistic_regression_pipeline.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e522e0db3ea799a291336149ab421d2ec56a6ea03e402bd438bec16b92a49dfb
+size 5705593

models/model1/model_weights.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:da7fd2151d6a5446fc178462ff93ee61c24f98cb0aa41343e2e8c36802e2170b
+size 47712485

models/sds ADDED Viewed

File without changes

notebooks/first_ml.ipynb ADDED Viewed

	@@ -0,0 +1,1539 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# TF-IDF"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 50,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import re\n",
+    "import string\n",
+    "from collections import defaultdict\n",
+    "from sklearn import metrics\n",
+    "from time import time\n",
+    "from nltk.corpus import stopwords\n",
+    "from nltk.stem import WordNetLemmatizer\n",
+    "from nltk.tokenize import RegexpTokenizer\n",
+    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+    "from sklearn.cluster import KMeans\n",
+    "from sklearn.datasets import fetch_20newsgroups\n",
+    "from sklearn.decomposition import TruncatedSVD\n",
+    "from sklearn.pipeline import make_pipeline\n",
+    "from sklearn.preprocessing import Normalizer\n",
+    "import pymorphy2\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.linear_model import LogisticRegression\n",
+    "from sklearn.metrics import classification_report, accuracy_score, f1_score"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Загрузка данных"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.read_json('data/healthcare_facilities_reviews.jsonl', lines=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>review_id</th>\n",
+       "      <th>category</th>\n",
+       "      <th>title</th>\n",
+       "      <th>content</th>\n",
+       "      <th>sentiment</th>\n",
+       "      <th>source_url</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>0</td>\n",
+       "      <td>Поликлиники стоматологические</td>\n",
+       "      <td>Классный мастер</td>\n",
+       "      <td>Огромное спасибо за чудесное удаление двух зуб...</td>\n",
+       "      <td>positive</td>\n",
+       "      <td>http://www.spr.ru/forum_vyvod.php?id_tema=2727539</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>1</td>\n",
+       "      <td>Поликлиники стоматологические</td>\n",
+       "      <td>Замечательный врач</td>\n",
+       "      <td>Хочу выразить особую благодарность замечательн...</td>\n",
+       "      <td>positive</td>\n",
+       "      <td>http://www.spr.ru/forum_vyvod.php?id_tema=2302877</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>2</td>\n",
+       "      <td>Поликлиники стоматологические</td>\n",
+       "      <td>Благодарность работникам рентгена</td>\n",
+       "      <td>Добрый вечер! Хотелось бы поблагодарить сотруд...</td>\n",
+       "      <td>positive</td>\n",
+       "      <td>http://www.spr.ru/forum_vyvod.php?id_tema=2815031</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>3</td>\n",
+       "      <td>Поликлиники стоматологические</td>\n",
+       "      <td>Доктор Рабинович</td>\n",
+       "      <td>Женщины советского образца в регистратуре не и...</td>\n",
+       "      <td>negative</td>\n",
+       "      <td>http://www.spr.ru/forum_vyvod.php?id_tema=3443161</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>4</td>\n",
+       "      <td>Поликлиники стоматологические</td>\n",
+       "      <td>Есть кому сказать спасибо</td>\n",
+       "      <td>У меня с детства очень плохие зубы (тонкая и х...</td>\n",
+       "      <td>positive</td>\n",
+       "      <td>http://www.spr.ru/forum_vyvod.php?id_tema=2592430</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>70592</th>\n",
+       "      <td>70592</td>\n",
+       "      <td>Водительские комиссии</td>\n",
+       "      <td>Хуже районной поликлиники</td>\n",
+       "      <td>Заведение ужасное. Врачи делят 1 кабинет на 2х...</td>\n",
+       "      <td>negative</td>\n",
+       "      <td>http://www.spr.ru/forum_vyvod.php?id_tema=273326</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>70593</th>\n",
+       "      <td>70593</td>\n",
+       "      <td>Водительские комиссии</td>\n",
+       "      <td>Справки</td>\n",
+       "      <td>Люди, не обращайтесь в эту фирму! Муж проходил...</td>\n",
+       "      <td>negative</td>\n",
+       "      <td>http://www.spr.ru/forum_vyvod.php?id_tema=3401583</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>70594</th>\n",
+       "      <td>70594</td>\n",
+       "      <td>Водительские комиссии</td>\n",
+       "      <td>Мед-Альфа - это наше будущее</td>\n",
+       "      <td>Дорогие посетители медицинского центра ООО \"Ме...</td>\n",
+       "      <td>positive</td>\n",
+       "      <td>http://www.spr.ru/forum_vyvod.php?id_tema=326078</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>70595</th>\n",
+       "      <td>70595</td>\n",
+       "      <td>Водительские комиссии</td>\n",
+       "      <td>Хамское поведение</td>\n",
+       "      <td>В регистратуре сидит хамка, такое отношение и ...</td>\n",
+       "      <td>negative</td>\n",
+       "      <td>http://www.spr.ru/forum_vyvod.php?id_tema=3171911</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>70596</th>\n",
+       "      <td>70596</td>\n",
+       "      <td>Водительские комиссии</td>\n",
+       "      <td>Только хорошие впечатления</td>\n",
+       "      <td>Хочу поблагодарить весь персонал \"МедАльфаПроф...</td>\n",
+       "      <td>positive</td>\n",
+       "      <td>http://www.spr.ru/forum_vyvod.php?id_tema=3391562</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>70597 rows × 6 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "       review_id                       category  \\\n",
+       "0              0  Поликлиники стоматологические   \n",
+       "1              1  Поликлиники стоматологические   \n",
+       "2              2  Поликлиники стоматологические   \n",
+       "3              3  Поликлиники стоматологические   \n",
+       "4              4  Поликлиники стоматологические   \n",
+       "...          ...                            ...   \n",
+       "70592      70592          Водительские комиссии   \n",
+       "70593      70593          Водительские комиссии   \n",
+       "70594      70594          Водительские комиссии   \n",
+       "70595      70595          Водительские комиссии   \n",
+       "70596      70596          Водительские комиссии   \n",
+       "\n",
+       "                                   title  \\\n",
+       "0                        Классный мастер   \n",
+       "1                     Замечательный врач   \n",
+       "2      Благодарность работникам рентгена   \n",
+       "3                       Доктор Рабинович   \n",
+       "4              Есть кому сказать спасибо   \n",
+       "...                                  ...   \n",
+       "70592          Хуже районной поликлиники   \n",
+       "70593                            Справки   \n",
+       "70594       Мед-Альфа - это наше будущее   \n",
+       "70595                  Хамское поведение   \n",
+       "70596         Только хорошие впечатления   \n",
+       "\n",
+       "                                                 content sentiment  \\\n",
+       "0      Огромное спасибо за чудесное удаление двух зуб...  positive   \n",
+       "1      Хочу выразить особую благодарность замечательн...  positive   \n",
+       "2      Добрый вечер! Хотелось бы поблагодарить сотруд...  positive   \n",
+       "3      Женщины советского образца в регистратуре не и...  negative   \n",
+       "4      У меня с детства очень плохие зубы (тонкая и х...  positive   \n",
+       "...                                                  ...       ...   \n",
+       "70592  Заведение ужасное. Врачи делят 1 кабинет на 2х...  negative   \n",
+       "70593  Люди, не обращайтесь в эту фирму! Муж проходил...  negative   \n",
+       "70594  Дорогие посетители медицинского центра ООО \"Ме...  positive   \n",
+       "70595  В регистратуре сидит хамка, такое отношение и ...  negative   \n",
+       "70596  Хочу поблагодарить весь персонал \"МедАльфаПроф...  positive   \n",
+       "\n",
+       "                                              source_url  \n",
+       "0      http://www.spr.ru/forum_vyvod.php?id_tema=2727539  \n",
+       "1      http://www.spr.ru/forum_vyvod.php?id_tema=2302877  \n",
+       "2      http://www.spr.ru/forum_vyvod.php?id_tema=2815031  \n",
+       "3      http://www.spr.ru/forum_vyvod.php?id_tema=3443161  \n",
+       "4      http://www.spr.ru/forum_vyvod.php?id_tema=2592430  \n",
+       "...                                                  ...  \n",
+       "70592   http://www.spr.ru/forum_vyvod.php?id_tema=273326  \n",
+       "70593  http://www.spr.ru/forum_vyvod.php?id_tema=3401583  \n",
+       "70594   http://www.spr.ru/forum_vyvod.php?id_tema=326078  \n",
+       "70595  http://www.spr.ru/forum_vyvod.php?id_tema=3171911  \n",
+       "70596  http://www.spr.ru/forum_vyvod.php?id_tema=3391562  \n",
+       "\n",
+       "[70597 rows x 6 columns]"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = df[['sentiment', 'content']]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>sentiment</th>\n",
+       "      <th>content</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>positive</td>\n",
+       "      <td>Огромное спасибо за чудесное удаление двух зуб...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>positive</td>\n",
+       "      <td>Хочу выразить особую благодарность замечательн...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>positive</td>\n",
+       "      <td>Добрый вечер! Хотелось бы поблагодарить сотруд...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>negative</td>\n",
+       "      <td>Женщины советского образца в регистратуре не и...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>positive</td>\n",
+       "      <td>У меня с детства очень плохие зубы (тонкая и х...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>70592</th>\n",
+       "      <td>negative</td>\n",
+       "      <td>Заведение ужасное. Врачи делят 1 кабинет на 2х...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>70593</th>\n",
+       "      <td>negative</td>\n",
+       "      <td>Люди, не обращайтесь в эту фирму! Муж проходил...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>70594</th>\n",
+       "      <td>positive</td>\n",
+       "      <td>Дорогие посетители медицинского центра ООО \"Ме...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>70595</th>\n",
+       "      <td>negative</td>\n",
+       "      <td>В регистратуре сидит хамка, такое отношение и ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>70596</th>\n",
+       "      <td>positive</td>\n",
+       "      <td>Хочу поблагодарить весь персонал \"МедАльфаПроф...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>70597 rows × 2 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "      sentiment                                            content\n",
+       "0      positive  Огромное спасибо за чудесное удаление двух зуб...\n",
+       "1      positive  Хочу выразить особую благодарность замечательн...\n",
+       "2      positive  Добрый вечер! Хотелось бы поблагодарить сотруд...\n",
+       "3      negative  Женщины советского образца в регистратуре не и...\n",
+       "4      positive  У меня с детства очень плохие зубы (тонкая и х...\n",
+       "...         ...                                                ...\n",
+       "70592  negative  Заведение ужасное. Врачи делят 1 кабинет на 2х...\n",
+       "70593  negative  Люди, не обращайтесь в эту фирму! Муж проходил...\n",
+       "70594  positive  Дорогие посетители медицинского центра ООО \"Ме...\n",
+       "70595  negative  В регистратуре сидит хамка, такое отношение и ...\n",
+       "70596  positive  Хочу поблагодарить весь персонал \"МедАльфаПроф...\n",
+       "\n",
+       "[70597 rows x 2 columns]"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'Добрый вечер! Хотелось бы поблагодарить сотрудников рентгена! Протезируюсь, отношусь к поликлинике № 189. Там меня отфутболили! Подходила к Кочину, зам. гл. врачу, заведующей просто сделать 3 снимка (пол-ка рядом с домом)- мне грубо отказали! А сотрудник рентгена просто сидела кроссворд разгадывала! Они видите ли, не принимают с протезирования! Сказали, где протезируетесь, там и делайте, а я говорю, мне у Вас удобно. Побоялись они! Первый раз попала к молодой девушке, она меня выслушала и сделала 1 снимок, а потом записала на другие дни, мне это удобно. Конечно, народу полно было! Бедные сотрудники. Все, кто читает отзыв (особенно жители Люблино 189 пол-ки), давайте жаловаться в департамент! Спасибо еще раз, за рентген (слышала в очереди, что народу у Вас было много и вы уже перебрали с нормой). Спасибо.'"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df['content'][2]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Очистка текста"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "morph = pymorphy2.MorphAnalyzer()\n",
+    "russian_stopwords = stopwords.words(\"russian\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/tmp/ipykernel_75887/650983554.py:12: SettingWithCopyWarning: \n",
+      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+      "Try using .loc[row_indexer,col_indexer] = value instead\n",
+      "\n",
+      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+      "  df['cleaned_text'] = df['content'].apply(clean_text)\n"
+     ]
+    }
+   ],
+   "source": [
+    "def clean_text(text):\n",
+    "    # Удаление всего, что не является буквами или знаками препинания\n",
+    "    clean_pattern = re.compile(r'[^a-zA-Zа-яА-ЯёЁ0-9.,!?;:\\s]')\n",
+    "    text = clean_pattern.sub('', text)\n",
+    "    url_pattern = re.compile(r'http\\S+|www\\S+|https\\S+')\n",
+    "    text = url_pattern.sub(r'', text)\n",
+    "    text = text.translate(str.maketrans('', '', string.punctuation))\n",
+    "    text = text.lower()\n",
+    "    lemmatized_text =  ' '.join([morph.parse(word)[0].normal_form for word in text.split() if word not in russian_stopwords])\n",
+    "    return lemmatized_text\n",
+    "\n",
+    "df['cleaned_text'] = df['content'].apply(clean_text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>sentiment</th>\n",
+       "      <th>content</th>\n",
+       "      <th>cleaned_text</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>positive</td>\n",
+       "      <td>Огромное спасибо за чудесное удаление двух зуб...</td>\n",
+       "      <td>огромный спасибо чудесный удаление два зуб муд...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>positive</td>\n",
+       "      <td>Хочу выразить особую благодарность замечательн...</td>\n",
+       "      <td>хотеть выразить особый благодарность замечател...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>positive</td>\n",
+       "      <td>Добрый вечер! Хотелось бы поблагодарить сотруд...</td>\n",
+       "      <td>добрый вечер хотеться поблагодарить сотрудник ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>negative</td>\n",
+       "      <td>Женщины советского образца в регистратуре не и...</td>\n",
+       "      <td>женщина советский образец регистратура иметь п...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>positive</td>\n",
+       "      <td>У меня с детства очень плохие зубы (тонкая и х...</td>\n",
+       "      <td>детство очень плохой зуб тонкий хрупкий эмаль ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>70592</th>\n",
+       "      <td>negative</td>\n",
+       "      <td>Заведение ужасное. Врачи делят 1 кабинет на 2х...</td>\n",
+       "      <td>заведение ужасный врач делить 1 кабинет 2х спе...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>70593</th>\n",
+       "      <td>negative</td>\n",
+       "      <td>Люди, не обращайтесь в эту фирму! Муж проходил...</td>\n",
+       "      <td>человек обращаться фирма муж проходить анализ ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>70594</th>\n",
+       "      <td>positive</td>\n",
+       "      <td>Дорогие посетители медицинского центра ООО \"Ме...</td>\n",
+       "      <td>дорогой посетитель медицинский центр ооо медал...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>70595</th>\n",
+       "      <td>negative</td>\n",
+       "      <td>В регистратуре сидит хамка, такое отношение и ...</td>\n",
+       "      <td>регистратура сидеть хамка такой отношение мане...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>70596</th>\n",
+       "      <td>positive</td>\n",
+       "      <td>Хочу поблагодарить весь персонал \"МедАльфаПроф...</td>\n",
+       "      <td>хотеть поблагодарить весь персонал медальфапро...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>70597 rows × 3 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "      sentiment                                            content  \\\n",
+       "0      positive  Огромное спасибо за чудесное удаление двух зуб...   \n",
+       "1      positive  Хочу выразить особую благодарность замечательн...   \n",
+       "2      positive  Добрый вечер! Хотелось бы поблагодарить сотруд...   \n",
+       "3      negative  Женщины советского образца в регистратуре не и...   \n",
+       "4      positive  У меня с детства очень плохие зубы (тонкая и х...   \n",
+       "...         ...                                                ...   \n",
+       "70592  negative  Заведение ужасное. Врачи делят 1 кабинет на 2х...   \n",
+       "70593  negative  Люди, не обращайтесь в эту фирму! Муж проходил...   \n",
+       "70594  positive  Дорогие посетители медицинского центра ООО \"Ме...   \n",
+       "70595  negative  В регистратуре сидит хамка, такое отношение и ...   \n",
+       "70596  positive  Хочу поблагодарить весь персонал \"МедАльфаПроф...   \n",
+       "\n",
+       "                                            cleaned_text  \n",
+       "0      огромный спасибо чудесный удаление два зуб муд...  \n",
+       "1      хотеть выразить особый благодарность замечател...  \n",
+       "2      добрый вечер хотеться поблагодарить сотрудник ...  \n",
+       "3      женщина советский образец регистратура иметь п...  \n",
+       "4      детство очень плохой зуб тонкий хрупкий эмаль ...  \n",
+       "...                                                  ...  \n",
+       "70592  заведение ужасный врач делить 1 кабинет 2х спе...  \n",
+       "70593  человек обращаться фирма муж проходить анализ ...  \n",
+       "70594  дорогой посетитель медицинский центр ооо медал...  \n",
+       "70595  регистратура сидеть хамка такой отношение мане...  \n",
+       "70596  хотеть поблагодарить весь персонал медальфапро...  \n",
+       "\n",
+       "[70597 rows x 3 columns]"
+      ]
+     },
+     "execution_count": 28,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'добрый вечер хотеться поблагодарить сотрудник рентген протезироваться относиться поликлиника 189 отфутболить подходить кочин зам гл врач заведовать просто сделать 3 снимок полка рядом дом грубо отказать сотрудник рентген просто сидеть кроссворд разгадывать видеть принимать протезирование сказать протезироваться делать говорить удобно побояться первый попасть молодой девушка выслушать сделать 1 снимка записать другой день это удобно народ полно бедный сотрудник читать отзыв особенно житель люблино 189 полка давать жаловаться департамент спасибо рентген слышать очередь народ перебрать норма спасибо'"
+      ]
+     },
+     "execution_count": 29,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df['cleaned_text'][2]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/tmp/ipykernel_75887/3526150694.py:1: SettingWithCopyWarning: \n",
+      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+      "Try using .loc[row_indexer,col_indexer] = value instead\n",
+      "\n",
+      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+      "  df['sentiment'] = df['sentiment'].apply(lambda x: 1 if x == 'negative' else 0)\n"
+     ]
+    }
+   ],
+   "source": [
+    "df['sentiment'] = df['sentiment'].apply(lambda x: 1 if x == 'negative' else 0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>sentiment</th>\n",
+       "      <th>content</th>\n",
+       "      <th>cleaned_text</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>0</td>\n",
+       "      <td>Огромное спасибо за чудесное удаление двух зуб...</td>\n",
+       "      <td>огромный спасибо чудесный удаление два зуб муд...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>0</td>\n",
+       "      <td>Хочу выразить особую благодарность замечательн...</td>\n",
+       "      <td>хотеть выразить особый благодарность замечател...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>0</td>\n",
+       "      <td>Добрый вечер! Хотелось бы поблагодарить сотруд...</td>\n",
+       "      <td>добрый вечер хотеться поблагодарить сотрудник ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>1</td>\n",
+       "      <td>Женщины советского образца в регистратуре не и...</td>\n",
+       "      <td>женщина советский образец регистратура иметь п...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>0</td>\n",
+       "      <td>У меня с детства очень плохие зубы (тонкая и х...</td>\n",
+       "      <td>детство очень плохой зуб тонкий хрупкий эмаль ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>70592</th>\n",
+       "      <td>1</td>\n",
+       "      <td>Заведение ужасное. Врачи делят 1 кабинет на 2х...</td>\n",
+       "      <td>заведение ужасный врач делить 1 кабинет 2х спе...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>70593</th>\n",
+       "      <td>1</td>\n",
+       "      <td>Люди, не обращайтесь в эту фирму! Муж проходил...</td>\n",
+       "      <td>человек обращаться фирма муж проходить анализ ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>70594</th>\n",
+       "      <td>0</td>\n",
+       "      <td>Дорогие посетители медицинского центра ООО \"Ме...</td>\n",
+       "      <td>дорогой посетитель медицинский центр ооо медал...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>70595</th>\n",
+       "      <td>1</td>\n",
+       "      <td>В регистратуре сидит хамка, такое отношение и ...</td>\n",
+       "      <td>регистратура сидеть хамка такой отношение мане...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>70596</th>\n",
+       "      <td>0</td>\n",
+       "      <td>Хочу поблагодарить весь персонал \"МедАльфаПроф...</td>\n",
+       "      <td>хотеть поблагодарить весь персонал медальфапро...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>70597 rows × 3 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "       sentiment                                            content  \\\n",
+       "0              0  Огромное спасибо за чудесное удаление двух зуб...   \n",
+       "1              0  Хочу выразить особую благодарность замечательн...   \n",
+       "2              0  Добрый вечер! Хотелось бы поблагодарить сотруд...   \n",
+       "3              1  Женщины советского образца в регистратуре не и...   \n",
+       "4              0  У меня с детства очень плохие зубы (тонкая и х...   \n",
+       "...          ...                                                ...   \n",
+       "70592          1  Заведение ужасное. Врачи делят 1 кабинет на 2х...   \n",
+       "70593          1  Люди, не обращайтесь в эту фирму! Муж проходил...   \n",
+       "70594          0  Дорогие посетители медицинского центра ООО \"Ме...   \n",
+       "70595          1  В регистратуре сидит хамка, такое отношение и ...   \n",
+       "70596          0  Хочу поблагодарить весь персонал \"МедАльфаПроф...   \n",
+       "\n",
+       "                                            cleaned_text  \n",
+       "0      огромный спасибо чудесный удаление два зуб муд...  \n",
+       "1      хотеть выразить особый благодарность замечател...  \n",
+       "2      добрый вечер хотеться поблагодарить сотрудник ...  \n",
+       "3      женщина советский образец регистратура иметь п...  \n",
+       "4      детство очень плохой зуб тонкий хрупкий эмаль ...  \n",
+       "...                                                  ...  \n",
+       "70592  заведение ужасный врач делить 1 кабинет 2х спе...  \n",
+       "70593  человек обращаться фирма муж проходить анализ ...  \n",
+       "70594  дорогой посетитель медицинский центр ооо медал...  \n",
+       "70595  регистратура сидеть хамка такой отношение мане...  \n",
+       "70596  хотеть поблагодарить весь персонал медальфапро...  \n",
+       "\n",
+       "[70597 rows x 3 columns]"
+      ]
+     },
+     "execution_count": 35,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 46,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X_train, X_test, y_train, y_test = train_test_split(df['cleaned_text'], df['sentiment'], test_size=0.2, random_state=42)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Векторизация и сжатие"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 47,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "vectorization done in 4.084 s\n",
+      "n_samples train: 56477, n_features: 1010\n",
+      "n_samples test: 14120, n_features: 1010\n"
+     ]
+    }
+   ],
+   "source": [
+    "vectorizer = TfidfVectorizer(\n",
+    "    max_df=0.9,\n",
+    "    min_df=500,\n",
+    "    # ngram_range=(1, 2),  # Использование униграмм и биграмм\n",
+    "    # max_features=5000,\n",
+    "    stop_words=stopwords.words('russian'),\n",
+    ")\n",
+    "t0 = time()\n",
+    "X_train_tfidf = vectorizer.fit_transform(X_train)\n",
+    "X_test_tfidf = vectorizer.transform(X_test)\n",
+    "\n",
+    "print(f\"vectorization done in {time() - t0:.3f} s\")\n",
+    "print(f\"n_samples train: {X_train_tfidf.shape[0]}, n_features: {X_train_tfidf.shape[1]}\")\n",
+    "print(f\"n_samples test: {X_test_tfidf.shape[0]}, n_features: {X_test_tfidf.shape[1]}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 48,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "LSA done in 14.485 s\n",
+      "Explained variance of the SVD step: 74.3%\n"
+     ]
+    }
+   ],
+   "source": [
+    "lsa = make_pipeline(TruncatedSVD(n_components=500), Normalizer(copy=False))\n",
+    "t0 = time()\n",
+    "X_train_lsa = lsa.fit_transform(X_train_tfidf)\n",
+    "\n",
+    "# Применение обученной модели LSA к тестовым данным\n",
+    "X_test_lsa = lsa.transform(X_test_tfidf)\n",
+    "explained_variance = lsa[0].explained_variance_ratio_.sum()\n",
+    "\n",
+    "print(f\"LSA done in {time() - t0:.3f} s\")\n",
+    "print(f\"Explained variance of the SVD step: {explained_variance * 100:.1f}%\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Логистическая регрессия"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 51,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "              precision    recall  f1-score   support\n",
+      "\n",
+      "           0       0.94      0.94      0.94      8342\n",
+      "           1       0.91      0.92      0.91      5778\n",
+      "\n",
+      "    accuracy                           0.93     14120\n",
+      "   macro avg       0.92      0.93      0.93     14120\n",
+      "weighted avg       0.93      0.93      0.93     14120\n",
+      "\n",
+      "Accuracy: 0.9277620396600567\n",
+      "F1 score: 0.9120689655172414\n"
+     ]
+    }
+   ],
+   "source": [
+    "model = LogisticRegression()\n",
+    "\n",
+    "# Обучение модели\n",
+    "model.fit(X_train_lsa, y_train)\n",
+    "\n",
+    "# Прогнозирование на тестовой выборке\n",
+    "y_pred = model.predict(X_test_lsa)\n",
+    "\n",
+    "# Вывод результатов\n",
+    "print(classification_report(y_test, y_pred))\n",
+    "print(f'Accuracy: {accuracy_score(y_test, y_pred)}')\n",
+    "print(f'F1 score: {f1_score(y_test, y_pred)}')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Создание пайплайна"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 54,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[nltk_data] Downloading package stopwords to /home/vera/nltk_data...\n",
+      "[nltk_data]   Package stopwords is already up-to-date!\n",
+      "[nltk_data] Downloading package punkt to /home/vera/nltk_data...\n",
+      "[nltk_data]   Package punkt is already up-to-date!\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<style>#sk-container-id-1 {\n",
+       "  /* Definition of color scheme common for light and dark mode */\n",
+       "  --sklearn-color-text: black;\n",
+       "  --sklearn-color-line: gray;\n",
+       "  /* Definition of color scheme for unfitted estimators */\n",
+       "  --sklearn-color-unfitted-level-0: #fff5e6;\n",
+       "  --sklearn-color-unfitted-level-1: #f6e4d2;\n",
+       "  --sklearn-color-unfitted-level-2: #ffe0b3;\n",
+       "  --sklearn-color-unfitted-level-3: chocolate;\n",
+       "  /* Definition of color scheme for fitted estimators */\n",
+       "  --sklearn-color-fitted-level-0: #f0f8ff;\n",
+       "  --sklearn-color-fitted-level-1: #d4ebff;\n",
+       "  --sklearn-color-fitted-level-2: #b3dbfd;\n",
+       "  --sklearn-color-fitted-level-3: cornflowerblue;\n",
+       "\n",
+       "  /* Specific color for light theme */\n",
+       "  --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
+       "  --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, white)));\n",
+       "  --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
+       "  --sklearn-color-icon: #696969;\n",
+       "\n",
+       "  @media (prefers-color-scheme: dark) {\n",
+       "    /* Redefinition of color scheme for dark theme */\n",
+       "    --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
+       "    --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, #111)));\n",
+       "    --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
+       "    --sklearn-color-icon: #878787;\n",
+       "  }\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 {\n",
+       "  color: var(--sklearn-color-text);\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 pre {\n",
+       "  padding: 0;\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 input.sk-hidden--visually {\n",
+       "  border: 0;\n",
+       "  clip: rect(1px 1px 1px 1px);\n",
+       "  clip: rect(1px, 1px, 1px, 1px);\n",
+       "  height: 1px;\n",
+       "  margin: -1px;\n",
+       "  overflow: hidden;\n",
+       "  padding: 0;\n",
+       "  position: absolute;\n",
+       "  width: 1px;\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 div.sk-dashed-wrapped {\n",
+       "  border: 1px dashed var(--sklearn-color-line);\n",
+       "  margin: 0 0.4em 0.5em 0.4em;\n",
+       "  box-sizing: border-box;\n",
+       "  padding-bottom: 0.4em;\n",
+       "  background-color: var(--sklearn-color-background);\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 div.sk-container {\n",
+       "  /* jupyter's `normalize.less` sets `[hidden] { display: none; }`\n",
+       "     but bootstrap.min.css set `[hidden] { display: none !important; }`\n",
+       "     so we also need the `!important` here to be able to override the\n",
+       "     default hidden behavior on the sphinx rendered scikit-learn.org.\n",
+       "     See: https://github.com/scikit-learn/scikit-learn/issues/21755 */\n",
+       "  display: inline-block !important;\n",
+       "  position: relative;\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 div.sk-text-repr-fallback {\n",
+       "  display: none;\n",
+       "}\n",
+       "\n",
+       "div.sk-parallel-item,\n",
+       "div.sk-serial,\n",
+       "div.sk-item {\n",
+       "  /* draw centered vertical line to link estimators */\n",
+       "  background-image: linear-gradient(var(--sklearn-color-text-on-default-background), var(--sklearn-color-text-on-default-background));\n",
+       "  background-size: 2px 100%;\n",
+       "  background-repeat: no-repeat;\n",
+       "  background-position: center center;\n",
+       "}\n",
+       "\n",
+       "/* Parallel-specific style estimator block */\n",
+       "\n",
+       "#sk-container-id-1 div.sk-parallel-item::after {\n",
+       "  content: \"\";\n",
+       "  width: 100%;\n",
+       "  border-bottom: 2px solid var(--sklearn-color-text-on-default-background);\n",
+       "  flex-grow: 1;\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 div.sk-parallel {\n",
+       "  display: flex;\n",
+       "  align-items: stretch;\n",
+       "  justify-content: center;\n",
+       "  background-color: var(--sklearn-color-background);\n",
+       "  position: relative;\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 div.sk-parallel-item {\n",
+       "  display: flex;\n",
+       "  flex-direction: column;\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 div.sk-parallel-item:first-child::after {\n",
+       "  align-self: flex-end;\n",
+       "  width: 50%;\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 div.sk-parallel-item:last-child::after {\n",
+       "  align-self: flex-start;\n",
+       "  width: 50%;\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 div.sk-parallel-item:only-child::after {\n",
+       "  width: 0;\n",
+       "}\n",
+       "\n",
+       "/* Serial-specific style estimator block */\n",
+       "\n",
+       "#sk-container-id-1 div.sk-serial {\n",
+       "  display: flex;\n",
+       "  flex-direction: column;\n",
+       "  align-items: center;\n",
+       "  background-color: var(--sklearn-color-background);\n",
+       "  padding-right: 1em;\n",
+       "  padding-left: 1em;\n",
+       "}\n",
+       "\n",
+       "\n",
+       "/* Toggleable style: style used for estimator/Pipeline/ColumnTransformer box that is\n",
+       "clickable and can be expanded/collapsed.\n",
+       "- Pipeline and ColumnTransformer use this feature and define the default style\n",
+       "- Estimators will overwrite some part of the style using the `sk-estimator` class\n",
+       "*/\n",
+       "\n",
+       "/* Pipeline and ColumnTransformer style (default) */\n",
+       "\n",
+       "#sk-container-id-1 div.sk-toggleable {\n",
+       "  /* Default theme specific background. It is overwritten whether we have a\n",
+       "  specific estimator or a Pipeline/ColumnTransformer */\n",
+       "  background-color: var(--sklearn-color-background);\n",
+       "}\n",
+       "\n",
+       "/* Toggleable label */\n",
+       "#sk-container-id-1 label.sk-toggleable__label {\n",
+       "  cursor: pointer;\n",
+       "  display: block;\n",
+       "  width: 100%;\n",
+       "  margin-bottom: 0;\n",
+       "  padding: 0.5em;\n",
+       "  box-sizing: border-box;\n",
+       "  text-align: center;\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 label.sk-toggleable__label-arrow:before {\n",
+       "  /* Arrow on the left of the label */\n",
+       "  content: \"▸\";\n",
+       "  float: left;\n",
+       "  margin-right: 0.25em;\n",
+       "  color: var(--sklearn-color-icon);\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 label.sk-toggleable__label-arrow:hover:before {\n",
+       "  color: var(--sklearn-color-text);\n",
+       "}\n",
+       "\n",
+       "/* Toggleable content - dropdown */\n",
+       "\n",
+       "#sk-container-id-1 div.sk-toggleable__content {\n",
+       "  max-height: 0;\n",
+       "  max-width: 0;\n",
+       "  overflow: hidden;\n",
+       "  text-align: left;\n",
+       "  /* unfitted */\n",
+       "  background-color: var(--sklearn-color-unfitted-level-0);\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 div.sk-toggleable__content.fitted {\n",
+       "  /* fitted */\n",
+       "  background-color: var(--sklearn-color-fitted-level-0);\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 div.sk-toggleable__content pre {\n",
+       "  margin: 0.2em;\n",
+       "  border-radius: 0.25em;\n",
+       "  color: var(--sklearn-color-text);\n",
+       "  /* unfitted */\n",
+       "  background-color: var(--sklearn-color-unfitted-level-0);\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 div.sk-toggleable__content.fitted pre {\n",
+       "  /* unfitted */\n",
+       "  background-color: var(--sklearn-color-fitted-level-0);\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 input.sk-toggleable__control:checked~div.sk-toggleable__content {\n",
+       "  /* Expand drop-down */\n",
+       "  max-height: 200px;\n",
+       "  max-width: 100%;\n",
+       "  overflow: auto;\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {\n",
+       "  content: \"▾\";\n",
+       "}\n",
+       "\n",
+       "/* Pipeline/ColumnTransformer-specific style */\n",
+       "\n",
+       "#sk-container-id-1 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
+       "  color: var(--sklearn-color-text);\n",
+       "  background-color: var(--sklearn-color-unfitted-level-2);\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 div.sk-label.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
+       "  background-color: var(--sklearn-color-fitted-level-2);\n",
+       "}\n",
+       "\n",
+       "/* Estimator-specific style */\n",
+       "\n",
+       "/* Colorize estimator box */\n",
+       "#sk-container-id-1 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
+       "  /* unfitted */\n",
+       "  background-color: var(--sklearn-color-unfitted-level-2);\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 div.sk-estimator.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
+       "  /* fitted */\n",
+       "  background-color: var(--sklearn-color-fitted-level-2);\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 div.sk-label label.sk-toggleable__label,\n",
+       "#sk-container-id-1 div.sk-label label {\n",
+       "  /* The background is the default theme color */\n",
+       "  color: var(--sklearn-color-text-on-default-background);\n",
+       "}\n",
+       "\n",
+       "/* On hover, darken the color of the background */\n",
+       "#sk-container-id-1 div.sk-label:hover label.sk-toggleable__label {\n",
+       "  color: var(--sklearn-color-text);\n",
+       "  background-color: var(--sklearn-color-unfitted-level-2);\n",
+       "}\n",
+       "\n",
+       "/* Label box, darken color on hover, fitted */\n",
+       "#sk-container-id-1 div.sk-label.fitted:hover label.sk-toggleable__label.fitted {\n",
+       "  color: var(--sklearn-color-text);\n",
+       "  background-color: var(--sklearn-color-fitted-level-2);\n",
+       "}\n",
+       "\n",
+       "/* Estimator label */\n",
+       "\n",
+       "#sk-container-id-1 div.sk-label label {\n",
+       "  font-family: monospace;\n",
+       "  font-weight: bold;\n",
+       "  display: inline-block;\n",
+       "  line-height: 1.2em;\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 div.sk-label-container {\n",
+       "  text-align: center;\n",
+       "}\n",
+       "\n",
+       "/* Estimator-specific */\n",
+       "#sk-container-id-1 div.sk-estimator {\n",
+       "  font-family: monospace;\n",
+       "  border: 1px dotted var(--sklearn-color-border-box);\n",
+       "  border-radius: 0.25em;\n",
+       "  box-sizing: border-box;\n",
+       "  margin-bottom: 0.5em;\n",
+       "  /* unfitted */\n",
+       "  background-color: var(--sklearn-color-unfitted-level-0);\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 div.sk-estimator.fitted {\n",
+       "  /* fitted */\n",
+       "  background-color: var(--sklearn-color-fitted-level-0);\n",
+       "}\n",
+       "\n",
+       "/* on hover */\n",
+       "#sk-container-id-1 div.sk-estimator:hover {\n",
+       "  /* unfitted */\n",
+       "  background-color: var(--sklearn-color-unfitted-level-2);\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 div.sk-estimator.fitted:hover {\n",
+       "  /* fitted */\n",
+       "  background-color: var(--sklearn-color-fitted-level-2);\n",
+       "}\n",
+       "\n",
+       "/* Specification for estimator info (e.g. \"i\" and \"?\") */\n",
+       "\n",
+       "/* Common style for \"i\" and \"?\" */\n",
+       "\n",
+       ".sk-estimator-doc-link,\n",
+       "a:link.sk-estimator-doc-link,\n",
+       "a:visited.sk-estimator-doc-link {\n",
+       "  float: right;\n",
+       "  font-size: smaller;\n",
+       "  line-height: 1em;\n",
+       "  font-family: monospace;\n",
+       "  background-color: var(--sklearn-color-background);\n",
+       "  border-radius: 1em;\n",
+       "  height: 1em;\n",
+       "  width: 1em;\n",
+       "  text-decoration: none !important;\n",
+       "  margin-left: 1ex;\n",
+       "  /* unfitted */\n",
+       "  border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
+       "  color: var(--sklearn-color-unfitted-level-1);\n",
+       "}\n",
+       "\n",
+       ".sk-estimator-doc-link.fitted,\n",
+       "a:link.sk-estimator-doc-link.fitted,\n",
+       "a:visited.sk-estimator-doc-link.fitted {\n",
+       "  /* fitted */\n",
+       "  border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
+       "  color: var(--sklearn-color-fitted-level-1);\n",
+       "}\n",
+       "\n",
+       "/* On hover */\n",
+       "div.sk-estimator:hover .sk-estimator-doc-link:hover,\n",
+       ".sk-estimator-doc-link:hover,\n",
+       "div.sk-label-container:hover .sk-estimator-doc-link:hover,\n",
+       ".sk-estimator-doc-link:hover {\n",
+       "  /* unfitted */\n",
+       "  background-color: var(--sklearn-color-unfitted-level-3);\n",
+       "  color: var(--sklearn-color-background);\n",
+       "  text-decoration: none;\n",
+       "}\n",
+       "\n",
+       "div.sk-estimator.fitted:hover .sk-estimator-doc-link.fitted:hover,\n",
+       ".sk-estimator-doc-link.fitted:hover,\n",
+       "div.sk-label-container:hover .sk-estimator-doc-link.fitted:hover,\n",
+       ".sk-estimator-doc-link.fitted:hover {\n",
+       "  /* fitted */\n",
+       "  background-color: var(--sklearn-color-fitted-level-3);\n",
+       "  color: var(--sklearn-color-background);\n",
+       "  text-decoration: none;\n",
+       "}\n",
+       "\n",
+       "/* Span, style for the box shown on hovering the info icon */\n",
+       ".sk-estimator-doc-link span {\n",
+       "  display: none;\n",
+       "  z-index: 9999;\n",
+       "  position: relative;\n",
+       "  font-weight: normal;\n",
+       "  right: .2ex;\n",
+       "  padding: .5ex;\n",
+       "  margin: .5ex;\n",
+       "  width: min-content;\n",
+       "  min-width: 20ex;\n",
+       "  max-width: 50ex;\n",
+       "  color: var(--sklearn-color-text);\n",
+       "  box-shadow: 2pt 2pt 4pt #999;\n",
+       "  /* unfitted */\n",
+       "  background: var(--sklearn-color-unfitted-level-0);\n",
+       "  border: .5pt solid var(--sklearn-color-unfitted-level-3);\n",
+       "}\n",
+       "\n",
+       ".sk-estimator-doc-link.fitted span {\n",
+       "  /* fitted */\n",
+       "  background: var(--sklearn-color-fitted-level-0);\n",
+       "  border: var(--sklearn-color-fitted-level-3);\n",
+       "}\n",
+       "\n",
+       ".sk-estimator-doc-link:hover span {\n",
+       "  display: block;\n",
+       "}\n",
+       "\n",
+       "/* \"?\"-specific style due to the `<a>` HTML tag */\n",
+       "\n",
+       "#sk-container-id-1 a.estimator_doc_link {\n",
+       "  float: right;\n",
+       "  font-size: 1rem;\n",
+       "  line-height: 1em;\n",
+       "  font-family: monospace;\n",
+       "  background-color: var(--sklearn-color-background);\n",
+       "  border-radius: 1rem;\n",
+       "  height: 1rem;\n",
+       "  width: 1rem;\n",
+       "  text-decoration: none;\n",
+       "  /* unfitted */\n",
+       "  color: var(--sklearn-color-unfitted-level-1);\n",
+       "  border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 a.estimator_doc_link.fitted {\n",
+       "  /* fitted */\n",
+       "  border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
+       "  color: var(--sklearn-color-fitted-level-1);\n",
+       "}\n",
+       "\n",
+       "/* On hover */\n",
+       "#sk-container-id-1 a.estimator_doc_link:hover {\n",
+       "  /* unfitted */\n",
+       "  background-color: var(--sklearn-color-unfitted-level-3);\n",
+       "  color: var(--sklearn-color-background);\n",
+       "  text-decoration: none;\n",
+       "}\n",
+       "\n",
+       "#sk-container-id-1 a.estimator_doc_link.fitted:hover {\n",
+       "  /* fitted */\n",
+       "  background-color: var(--sklearn-color-fitted-level-3);\n",
+       "}\n",
+       "</style><div id=\"sk-container-id-1\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>Pipeline(steps=[(&#x27;preprocessor&#x27;, TextPreprocessor()),\n",
+       "                (&#x27;vectorizer&#x27;,\n",
+       "                 TfidfVectorizer(max_df=0.9, min_df=500,\n",
+       "                                 stop_words=[&#x27;и&#x27;, &#x27;в&#x27;, &#x27;во&#x27;, &#x27;не&#x27;, &#x27;что&#x27;, &#x27;он&#x27;,\n",
+       "                                             &#x27;на&#x27;, &#x27;я&#x27;, &#x27;с&#x27;, &#x27;со&#x27;, &#x27;как&#x27;, &#x27;а&#x27;,\n",
+       "                                             &#x27;то&#x27;, &#x27;все&#x27;, &#x27;она&#x27;, &#x27;так&#x27;, &#x27;его&#x27;,\n",
+       "                                             &#x27;но&#x27;, &#x27;да&#x27;, &#x27;ты&#x27;, &#x27;к&#x27;, &#x27;у&#x27;, &#x27;же&#x27;,\n",
+       "                                             &#x27;вы&#x27;, &#x27;за&#x27;, &#x27;бы&#x27;, &#x27;по&#x27;, &#x27;только&#x27;,\n",
+       "                                             &#x27;ее&#x27;, &#x27;мне&#x27;, ...])),\n",
+       "                (&#x27;lsa&#x27;,\n",
+       "                 Pipeline(steps=[(&#x27;truncatedsvd&#x27;,\n",
+       "                                  TruncatedSVD(n_components=500)),\n",
+       "                                 (&#x27;normalizer&#x27;, Normalizer(copy=False))])),\n",
+       "                (&#x27;classifier&#x27;, LogisticRegression())])</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item sk-dashed-wrapped\"><div class=\"sk-label-container\"><div class=\"sk-label fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-1\" type=\"checkbox\" ><label for=\"sk-estimator-id-1\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">&nbsp;&nbsp;Pipeline<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.pipeline.Pipeline.html\">?<span>Documentation for Pipeline</span></a><span class=\"sk-estimator-doc-link fitted\">i<span>Fitted</span></span></label><div class=\"sk-toggleable__content fitted\"><pre>Pipeline(steps=[(&#x27;preprocessor&#x27;, TextPreprocessor()),\n",
+       "                (&#x27;vectorizer&#x27;,\n",
+       "                 TfidfVectorizer(max_df=0.9, min_df=500,\n",
+       "                                 stop_words=[&#x27;и&#x27;, &#x27;в&#x27;, &#x27;во&#x27;, &#x27;не&#x27;, &#x27;что&#x27;, &#x27;он&#x27;,\n",
+       "                                             &#x27;на&#x27;, &#x27;я&#x27;, &#x27;с&#x27;, &#x27;со&#x27;, &#x27;как&#x27;, &#x27;а&#x27;,\n",
+       "                                             &#x27;то&#x27;, &#x27;все&#x27;, &#x27;она&#x27;, &#x27;так&#x27;, &#x27;его&#x27;,\n",
+       "                                             &#x27;но&#x27;, &#x27;да&#x27;, &#x27;ты&#x27;, &#x27;к&#x27;, &#x27;у&#x27;, &#x27;же&#x27;,\n",
+       "                                             &#x27;вы&#x27;, &#x27;за&#x27;, &#x27;бы&#x27;, &#x27;по&#x27;, &#x27;только&#x27;,\n",
+       "                                             &#x27;ее&#x27;, &#x27;мне&#x27;, ...])),\n",
+       "                (&#x27;lsa&#x27;,\n",
+       "                 Pipeline(steps=[(&#x27;truncatedsvd&#x27;,\n",
+       "                                  TruncatedSVD(n_components=500)),\n",
+       "                                 (&#x27;normalizer&#x27;, Normalizer(copy=False))])),\n",
+       "                (&#x27;classifier&#x27;, LogisticRegression())])</pre></div> </div></div><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-2\" type=\"checkbox\" ><label for=\"sk-estimator-id-2\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">TextPreprocessor</label><div class=\"sk-toggleable__content fitted\"><pre>TextPreprocessor()</pre></div> </div></div><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-3\" type=\"checkbox\" ><label for=\"sk-estimator-id-3\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">&nbsp;TfidfVectorizer<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html\">?<span>Documentation for TfidfVectorizer</span></a></label><div class=\"sk-toggleable__content fitted\"><pre>TfidfVectorizer(max_df=0.9, min_df=500,\n",
+       "                stop_words=[&#x27;и&#x27;, &#x27;в&#x27;, &#x27;во&#x27;, &#x27;не&#x27;, &#x27;что&#x27;, &#x27;он&#x27;, &#x27;на&#x27;, &#x27;я&#x27;, &#x27;с&#x27;,\n",
+       "                            &#x27;со&#x27;, &#x27;как&#x27;, &#x27;а&#x27;, &#x27;то&#x27;, &#x27;все&#x27;, &#x27;она&#x27;, &#x27;так&#x27;, &#x27;его&#x27;,\n",
+       "                            &#x27;но&#x27;, &#x27;да&#x27;, &#x27;ты&#x27;, &#x27;к&#x27;, &#x27;у&#x27;, &#x27;же&#x27;, &#x27;вы&#x27;, &#x27;за&#x27;, &#x27;бы&#x27;,\n",
+       "                            &#x27;по&#x27;, &#x27;только&#x27;, &#x27;ее&#x27;, &#x27;мне&#x27;, ...])</pre></div> </div></div><div class=\"sk-item\"><div class=\"sk-label-container\"><div class=\"sk-label fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-4\" type=\"checkbox\" ><label for=\"sk-estimator-id-4\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">&nbsp;lsa: Pipeline<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.pipeline.Pipeline.html\">?<span>Documentation for lsa: Pipeline</span></a></label><div class=\"sk-toggleable__content fitted\"><pre>Pipeline(steps=[(&#x27;truncatedsvd&#x27;, TruncatedSVD(n_components=500)),\n",
+       "                (&#x27;normalizer&#x27;, Normalizer(copy=False))])</pre></div> </div></div><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-5\" type=\"checkbox\" ><label for=\"sk-estimator-id-5\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">&nbsp;TruncatedSVD<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.decomposition.TruncatedSVD.html\">?<span>Documentation for TruncatedSVD</span></a></label><div class=\"sk-toggleable__content fitted\"><pre>TruncatedSVD(n_components=500)</pre></div> </div></div><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-6\" type=\"checkbox\" ><label for=\"sk-estimator-id-6\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">&nbsp;Normalizer<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.preprocessing.Normalizer.html\">?<span>Documentation for Normalizer</span></a></label><div class=\"sk-toggleable__content fitted\"><pre>Normalizer(copy=False)</pre></div> </div></div></div></div><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-7\" type=\"checkbox\" ><label for=\"sk-estimator-id-7\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">&nbsp;LogisticRegression<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.linear_model.LogisticRegression.html\">?<span>Documentation for LogisticRegression</span></a></label><div class=\"sk-toggleable__content 
fitted\"><pre>LogisticRegression()</pre></div> </div></div></div></div></div></div>"
+      ],
+      "text/plain": [
+       "Pipeline(steps=[('preprocessor', TextPreprocessor()),\n",
+       "                ('vectorizer',\n",
+       "                 TfidfVectorizer(max_df=0.9, min_df=500,\n",
+       "                                 stop_words=['и', 'в', 'во', 'не', 'что', 'он',\n",
+       "                                             'на', 'я', 'с', 'со', 'как', 'а',\n",
+       "                                             'то', 'все', 'она', 'так', 'его',\n",
+       "                                             'но', 'да', 'ты', 'к', 'у', 'же',\n",
+       "                                             'вы', 'за', 'бы', 'по', 'только',\n",
+       "                                             'ее', 'мне', ...])),\n",
+       "                ('lsa',\n",
+       "                 Pipeline(steps=[('truncatedsvd',\n",
+       "                                  TruncatedSVD(n_components=500)),\n",
+       "                                 ('normalizer', Normalizer(copy=False))])),\n",
+       "                ('classifier', LogisticRegression())])"
+      ]
+     },
+     "execution_count": 54,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import re\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "from sklearn.base import BaseEstimator, TransformerMixin\n",
+    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+    "from sklearn.decomposition import TruncatedSVD\n",
+    "from sklearn.pipeline import Pipeline, FeatureUnion\n",
+    "from sklearn.linear_model import LogisticRegression\n",
+    "from sklearn.preprocessing import Normalizer\n",
+    "import joblib\n",
+    "import nltk\n",
+    "from nltk.corpus import stopwords\n",
+    "from pymorphy2 import MorphAnalyzer\n",
+    "\n",
+    "nltk.download('stopwords')\n",
+    "nltk.download('punkt')\n",
+    "\n",
+    "class TextPreprocessor(BaseEstimator, TransformerMixin):\n",
+    "    def __init__(self):\n",
+    "        self.stop_words = set(stopwords.words('russian'))\n",
+    "        self.morph = MorphAnalyzer()\n",
+    "\n",
+    "    def preprocess_text(self, text):\n",
+    "        # Remove every character except Latin/Cyrillic letters, digits, whitespace and the punctuation marks .,!?;:\n",
+    "        clean_pattern = re.compile(r'[^a-zA-Zа-яА-ЯёЁ0-9.,!?;:\\s]')\n",
+    "        text = clean_pattern.sub('', text)\n",
+    "        url_pattern = re.compile(r'http\\S+|www\\S+|https\\S+')\n",
+    "        text = url_pattern.sub(r'', text)\n",
+    "        text = text.translate(str.maketrans('', '', string.punctuation))\n",
+    "        text = text.lower()\n",
+    "        tokens = text.split()\n",
+    "        lemmatized_text = ' '.join([self.morph.parse(word)[0].normal_form for word in tokens if word not in self.stop_words])\n",
+    "        return lemmatized_text\n",
+    "\n",
+    "    def fit(self, X, y=None):\n",
+    "        return self\n",
+    "\n",
+    "    def transform(self, X, y=None):\n",
+    "        return X.apply(self.preprocess_text)\n",
+    "\n",
+    "\n",
+    "# Load and preprocess the dataset\n",
+    "df = pd.read_json('data/healthcare_facilities_reviews.jsonl', lines=True)\n",
+    "df = df[['sentiment', 'content']]\n",
+    "df['cleaned_text'] = df['content'].apply(TextPreprocessor().preprocess_text)\n",
+    "df['sentiment'] = df['sentiment'].apply(lambda x: 1 if x == 'negative' else 0)\n",
+    "\n",
+    "# Split the dataset (this is only for training purposes)\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)\n",
+    "\n",
+    "# Create the pipeline\n",
+    "vectorizer = TfidfVectorizer(\n",
+    "    max_df=0.9,\n",
+    "    min_df=500,\n",
+    "    stop_words=stopwords.words('russian')\n",
+    ")\n",
+    "\n",
+    "lsa = TruncatedSVD(n_components=500)\n",
+    "\n",
+    "pipeline = Pipeline([\n",
+    "    ('preprocessor', TextPreprocessor()),\n",
+    "    ('vectorizer', vectorizer),\n",
+    "    ('lsa', make_pipeline(lsa, Normalizer(copy=False))),\n",
+    "    ('classifier', LogisticRegression())\n",
+    "])\n",
+    "\n",
+    "# Train the model\n",
+    "X_train = train_df['cleaned_text']\n",
+    "y_train = train_df['sentiment']\n",
+    "pipeline.fit(X_train, y_train)\n",
+    "\n",
+    "# Save the model\n",
+    "# joblib.dump(pipeline, 'logistic_regression_pipeline.pkl')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 55,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['logistic_regression_pipeline.pkl']"
+      ]
+     },
+     "execution_count": 55,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Save the model for future use\n",
+    "joblib.dump(pipeline, 'logistic_regression_pipeline.pkl')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 56,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load the saved pipeline back from disk\n",
+    "pipeline_test= joblib.load('logistic_regression_pipeline.pkl')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 61,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Predicted class: 1\n",
+      "Predicted proba: 0.898\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Sample text for prediction\n",
+    "sample_text = \"Ужасная клиника, обслуживание из рук вон плохое, хотеловь бы выразить свое разочарование данным заведением. Советую обходить его мимо.\"\n",
+    "\n",
+    "# Use the pipeline to predict the class\n",
+    "predicted_class = pipeline_test.predict(pd.Series([sample_text]))\n",
+    "predicted_prob = pipeline_test.predict_proba(pd.Series([sample_text]))\n",
+    "print(f\"Predicted class: {predicted_class[0]}\")\n",
+    "print(f\"Predicted proba: {round(predicted_prob[0][1], 3)}\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "base",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.14"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

pages/policlinic.py ADDED Viewed

	@@ -0,0 +1,15 @@

"""Streamlit page that classifies a Russian-language clinic review.

The page loads a pickled scikit-learn pipeline, reads a review from a text
area and, when the button is pressed, shows the predicted class.
"""
from pathlib import Path

import joblib
import pandas as pd
import streamlit as st

# File name of the serialized pipeline.
MODEL_FILENAME = 'logistic_regression_pipeline.pkl'


def _resolve_model_path():
    """Return the location of the pickled pipeline.

    The copy under ``<repo root>/models/model1/`` is preferred and is located
    relative to this file, so the result does not depend on the directory
    Streamlit was started from. If that copy is missing, the bare file name
    (resolved against the current working directory) is returned, which is
    the lookup this page originally performed.
    """
    repo_copy = (
        Path(__file__).resolve().parents[1] / 'models' / 'model1' / MODEL_FILENAME
    )
    if repo_copy.is_file():
        return repo_copy
    return Path(MODEL_FILENAME)


@st.cache_resource
def _load_pipeline():
    """Load the trained pipeline once and reuse it across Streamlit reruns.

    Streamlit re-executes the whole script on every widget interaction;
    without caching the pickle would be read from disk each time.
    """
    # joblib.load unpickles arbitrary Python objects: only ever point this at
    # a model file that ships with the project, never at user-supplied data.
    # NOTE(review): the training notebook defines a custom TextPreprocessor
    # step inside the notebook itself; unpickling presumably requires that
    # class to be importable from this script as well - confirm.
    return joblib.load(_resolve_model_path())


# Load the trained pipeline
pipeline = _load_pipeline()

# Streamlit application
st.title('Классификация отзывов на русском языке')
input_text = st.text_area('Введите текст отзыва')
if st.button('Предсказать'):
    if not input_text.strip():
        # Nothing to classify: ask for input instead of predicting on an
        # empty string.
        st.warning('Введите текст отзыва, чтобы получить предсказание.')
    else:
        # The pipeline's first step calls ``.apply`` on its input in the
        # training notebook, so the text is wrapped in a pandas Series.
        prediction = pipeline.predict(pd.Series([input_text]))
        # The training notebook encodes 'negative' as 1 and anything else as 0.
        st.write(f'Предсказанный класс с помощью логрег: {prediction[0]}')