{
"cells": [
{
"cell_type": "code",
"execution_count": 32,
"id": "1ce0c43d",
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"import numpy as np \n",
"import pandas as pd \n",
"from transformers import BertTokenizer, BertModel, BertForMaskedLM, AutoTokenizer, AutoModelForMaskedLM,AutoModel\n",
"from scipy.spatial.distance import cosine \n",
"import tokenizers \n",
"import pandas as pd \n",
"from sklearn.model_selection import train_test_split,GridSearchCV\n",
"from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score\n",
"from nltk.corpus import stopwords\n",
"import snowballstemmer \n",
"from sklearn.svm import SVC\n",
"from sklearn.naive_bayes import GaussianNB\n",
"from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier\n",
"from sklearn.decomposition import PCA\n",
"from sklearn.preprocessing import StandardScaler\n",
"import snowballstemmer\n",
"import numpy\n",
"import os \n",
"import re\n",
"import json\n",
"import pickle "
]
},
{
"cell_type": "code",
"execution_count": 72,
"id": "1b519b36",
"metadata": {},
"outputs": [],
"source": [
"model = AutoModelForMaskedLM.from_pretrained(\"Shushant/nepaliBERT\", output_hidden_states = True, return_dict = True, output_attentions = True)"
]
},
{
"cell_type": "code",
"execution_count": 73,
"id": "7dc414c6",
"metadata": {},
"outputs": [],
"source": [
"tokenizers = AutoTokenizer.from_pretrained(\"Shushant/nepaliBERT\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1871cd20",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"tokenizers.tokenize(\"के मौजुदा लोकतान्त्रिक व्यवस्था राज्य पुनःसंरचनासँग जोडिएका हिजोका सवालहरूलाई यथास्थितिमा छोडेर सबल होला?\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6fc8c04e",
"metadata": {},
"outputs": [],
"source": [
"nepali_stemmer = snowballstemmer.NepaliStemmer()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "06bc3947",
"metadata": {},
"outputs": [],
"source": [
"texts = ['तर','दुधमा तर बसेन|','तिम्रो घर आउन मन लाग्छ तर अल्छि लाग्छ']"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5cd86297",
"metadata": {},
"outputs": [],
"source": [
"def bert_text_preparation(text, tokenizer ):\n",
" \"\"\"Preparing input for BERT\"\"\"\n",
" \n",
" marked_text = \" [CLS] \" + text + \" [SEP] \"\n",
" tokenized_text = tokenizer.tokenize(marked_text)\n",
" indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)\n",
" segments_ids = [1] * len(indexed_tokens) \n",
" \n",
" # Convert inputs to Pytorch tensors\n",
" tokens_tensors = torch.tensor([indexed_tokens])\n",
" segments_tensors = torch.tensor([segments_ids])\n",
" \n",
" return tokenized_text, tokens_tensors, segments_tensors"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a70ff12b",
"metadata": {},
"outputs": [],
"source": [
"def get_bert_embeddings(tokens_tensor, segments_tensors, model):\n",
" # Gradient claculation id disabled \n",
" # Model is in inference mode\n",
" \n",
" with torch.no_grad():\n",
" outputs = model(tokens_tensor, segments_tensors)\n",
" # removing the first hidden state\n",
" # the first state is the input state \n",
" hidden_states = outputs.hidden_states\n",
" \n",
" # Getting embeddings from final Bert Layer\n",
" tokens_embeddings = hidden_states[-1]\n",
" # Collasping the tensor into 1-dimension \n",
" tokens_embeddings = torch.squeeze(tokens_embeddings, dim = 0)\n",
" # Converting torchtensors to lists \n",
" list_token_embeddings = [token_embed.tolist() for token_embed in tokens_embeddings]\n",
" \n",
" return list_token_embeddings "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9d4db1a5",
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"target_word_embeddings = []\n",
"\n",
"for text in texts:\n",
" tokenized_text, tokens_tensors, segments_tensors = bert_text_preparation(text, tokenizers)\n",
" list_token_embeddings = get_bert_embeddings(tokens_tensors, segments_tensors, model)\n",
"# print(len(list_token_embeddings))\n",
" ## list_token_embeddings has embeddings of the given words\n",
" word_index = tokenized_text.index('तर')\n",
" word_embedding = list_token_embeddings[word_index]\n",
"# print(word_embedding)\n",
" target_word_embeddings.append(word_embedding)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9c79f53c",
"metadata": {},
"outputs": [],
"source": [
"target_word_embeddings[0] == target_word_embeddings[1]"
]
},
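   {
   "cell_type": "code",
   "execution_count": null,
   "id": "c3a9e1f0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Illustrative check (not in the original run): measure how close the\n",
    "# contextual embeddings of 'तर' are across sentences. scipy's cosine()\n",
    "# returns a distance, so 1 - distance is the similarity.\n",
    "for i in range(len(texts)):\n",
    "    for j in range(i + 1, len(texts)):\n",
    "        sim = 1 - cosine(target_word_embeddings[i], target_word_embeddings[j])\n",
    "        print(f'similarity(texts[{i}], texts[{j}]) = {sim:.4f}')"
   ]
  },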
{
"cell_type": "code",
"execution_count": null,
"id": "0578cc53",
"metadata": {},
"outputs": [],
"source": [
"len(target_word_embeddings)"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "e5144fea",
"metadata": {},
"outputs": [],
"source": [
"# list_of_distances = []\n",
"# for text1, embed1 in zip(texts, target_word_embeddings):\n",
"# for text2, embed2 in zip(texts, target_word_embeddings):\n",
"# cos_dist = 1 - cosine(embed1,embed2)\n",
"# list_of_distances.append([text1, text2, cos_dist])\n",
"\n",
"\n",
"# distances_df = pd.DataFrame(list_of_distances, columns = ['text1','text2','distance'])\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "07d31ba6",
"metadata": {},
"outputs": [],
"source": [
"# df = pd.read_csv(\"finalData.csv\")\n",
"df = pd.read_csv('collected_labeled_data.csv')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b92d7bd7",
"metadata": {},
"outputs": [],
"source": [
"df['label'].unique()"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "048ef9d1",
"metadata": {},
"outputs": [],
"source": [
"# df.to_csv('collected_labeled_data.csv',index = False)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "ff995c8c",
"metadata": {},
"outputs": [],
"source": [
"# train_X, test_X = train_test_split(df, test_size = 0.2)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "012c49a5",
"metadata": {},
"outputs": [],
"source": [
"# train_X.to_csv('train.csv',index = False)\n",
"# test_X.to_csv('test.csv',index = False)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "6103b035",
"metadata": {},
"outputs": [],
"source": [
"# def check_len(text):\n",
"# txt = text.split(' ')[:20]\n",
"# return ' '.join(txt)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "b0aeeb8c",
"metadata": {},
"outputs": [],
"source": [
"# df['text'] = df['text'].apply(check_len)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "57168ec5",
"metadata": {},
"outputs": [],
"source": [
"# def get_word_embeddings(text):\n",
"# tokenizer = tokenizers\n",
"# tokenized_text, tokens_tensors, segments_tensors = bert_text_preparation(text, tokenizer)\n",
"# list_token_embeddings = get_bert_embeddings(tokens_tensors, segments_tensors, model)\n",
"# ## list_token_embeddings has embeddings of the given words\n",
"# return list_token_embeddings"
]
},
{
"cell_type": "code",
"execution_count": 40,
"id": "36144615",
"metadata": {},
"outputs": [],
"source": [
"stopwords= stopwords.words(\"nepali\")"
]
},
{
"cell_type": "code",
"execution_count": 41,
"id": "2163997b",
"metadata": {},
"outputs": [],
"source": [
"words = ['अक्सर','आदि','कसरी','अन्तर्गत','अर्थात','अर्थात्','अलग','आयो','उदाहरण','एकदम','राम्रो','बिरुद्ध','बिशेष','नराम्रो']"
]
},
{
"cell_type": "code",
"execution_count": 42,
"id": "67268e98",
"metadata": {},
"outputs": [],
"source": [
"stopwords = list(set(stopwords).difference(set(words)))"
]
},
{
"cell_type": "code",
"execution_count": 43,
"id": "e9ea0fe0",
"metadata": {},
"outputs": [],
"source": [
"def remove_emojis(text):\n",
" emoji_pattern = re.compile(\"[\"\n",
" u\"\\U0001F600-\\U0001F64F\" # emoticons\n",
" u\"\\U0001F300-\\U0001F5FF\" # symbols & pictographs\n",
" u\"\\U0001F680-\\U0001F6FF\" # transport & map symbols\n",
" u\"\\U0001F1E0-\\U0001F1FF\" # flags (iOS)\n",
" u\"\\U00002500-\\U00002BEF\" # chinese char\n",
" u\"\\U00002702-\\U000027B0\"\n",
" u\"\\U00002702-\\U000027B0\"\n",
" u\"\\U000024C2-\\U0001F251\"\n",
" u\"\\U0001f926-\\U0001f937\"\n",
" u\"\\U00010000-\\U0010ffff\"\n",
" u\"\\u2640-\\u2642\" \n",
" u\"\\u2600-\\u2B55\"\n",
" u\"\\u200d\"\n",
" u\"\\u23cf\"\n",
" u\"\\u23e9\"\n",
" u\"\\u231a\"\n",
" u\"\\ufe0f\" # dingbats\n",
" u\"\\u3030\"\n",
" \"]+\", re.UNICODE)\n",
" text = emoji_pattern.sub(r'', text)\n",
" return text"
]
},
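  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d41e8b2c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Quick demo (illustrative): the emoji characters are stripped,\n",
    "# the Devanagari text is left untouched.\n",
    "remove_emojis(\"राम्रो 😀👍\")"
   ]
  },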
{
"cell_type": "code",
"execution_count": 63,
"id": "af7b34a1",
"metadata": {},
"outputs": [],
"source": [
"def clean_text(text):\n",
" text = remove_emojis(text)\n",
" text = text.split(' ')\n",
" clean_text_list = []\n",
" for word in text:\n",
" if word not in stopwords:\n",
" clean_text_list.append(word)\n",
" clean_text = ' '.join(clean_text_list)\n",
" stem_words = nepali_stemmer.stemWords(clean_text.split())\n",
"# stem_text = ' '.join(stem_words)\n",
"# txt = re.sub(r\"[|a-zA-z.'#0-9@,:?'\\u200b\\u200c\\u200d!/&~-]\",'',stem_text)\n",
" return ' '.join([i for i in stem_words])"
]
},
{
"cell_type": "code",
"execution_count": 64,
"id": "05ec9fd9",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'घाम जति लग् हामी तेती राम्रो apple'"
]
},
"execution_count": 64,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"clean_text(\"घाम जति लग्यो हामीलाई तेती राम्रो हुन्छ apple \")"
]
},
{
"cell_type": "code",
"execution_count": 65,
"id": "05c48277",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['घाम', 'जति', 'लग्', 'हामी', 'तेती', 'राम्रो', '', 'apple']"
]
},
"execution_count": 65,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nepali_stemmer.stemWords(\"घाम जति लग्यो हामीलाई तेती राम्रो हुन्छ apple \".split())"
]
},
{
"cell_type": "code",
"execution_count": 66,
"id": "61c1b1dd",
"metadata": {},
"outputs": [],
"source": [
"df['text'] = df['text'].apply(clean_text)\n"
]
},
{
"cell_type": "code",
"execution_count": 67,
"id": "d3b2275f",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" text | \n",
" label | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" बजार जसरी ट्रेन्ड चेन्ज गर् हेर् प्रोफिट बूकिङ... | \n",
" 2 | \n",
"
\n",
" \n",
" 1 | \n",
" 1000 अंक घट नेप्से 200 अंक बढ् ठूलो कुरो होइन ... | \n",
" 1 | \n",
"
\n",
" \n",
" 2 | \n",
" होइन सानि बैंक bonus घोसणा २ महिना (book clos... | \n",
" 2 | \n",
"
\n",
" \n",
" 3 | \n",
" खैँ MBJC कित्ता रू,10/- बढेर आज रू,1100/- 10क... | \n",
" 2 | \n",
"
\n",
" \n",
" 4 | \n",
" राम्रो | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" text label\n",
"0 बजार जसरी ट्रेन्ड चेन्ज गर् हेर् प्रोफिट बूकिङ... 2\n",
"1 1000 अंक घट नेप्से 200 अंक बढ् ठूलो कुरो होइन ... 1\n",
"2 होइन सानि बैंक bonus घोसणा २ महिना (book clos... 2\n",
"3 खैँ MBJC कित्ता रू,10/- बढेर आज रू,1100/- 10क... 2\n",
"4 राम्रो 1"
]
},
"execution_count": 67,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 74,
"id": "b76b3d7e",
"metadata": {},
"outputs": [],
"source": [
"def get_bert_embedding_sentence(input_sentence):\n",
" md = model\n",
" tokenizer = tokenizers\n",
" marked_text = \" [CLS] \" + input_sentence + \" [SEP] \"\n",
" tokenized_text = tokenizer.tokenize(marked_text)\n",
"\n",
" indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)\n",
" segments_ids = [1] * len(indexed_tokens) \n",
" \n",
" # Convert inputs to Pytorch tensors\n",
" tokens_tensors = torch.tensor([indexed_tokens])\n",
" segments_tensors = torch.tensor([segments_ids])\n",
" \n",
" with torch.no_grad():\n",
" outputs = md(tokens_tensors, segments_tensors)\n",
" # removing the first hidden state\n",
" # the first state is the input state \n",
"\n",
" hidden_states = outputs.hidden_states\n",
"# print(hidden_states[-2])\n",
" # second_hidden_states = outputs[2]\n",
" # hidden_states has shape [13 x 1 x 22 x 768]\n",
"\n",
" # token_vecs is a tensor with shape [22 x 768]\n",
"# token_vecs = hidden_states[-2][0]\n",
" # get last four layers\n",
"# last_four_layers = [hidden_states[i] for i in (-1,-2, -3,-4)]\n",
"\n",
"\n",
" # cast layers to a tuple and concatenate over the last dimension\n",
"# cat_hidden_states = torch.cat(tuple(last_four_layers), dim=-1)\n",
"# print(cat_hidden_states.shape)\n",
" token_vecs = hidden_states[-2][0]\n",
"\n",
" # take the mean of the concatenated vector over the token dimension\n",
"# sentence_embedding = torch.mean(cat_hidden_states, dim=0).squeeze()\n",
"\n",
" # Calculate the average of all 22 token vectors.\n",
" sentence_embedding = torch.mean(token_vecs, dim=0)\n",
"# sentence_embedding = torch.mean(token_vecs, dim=1)\n",
" return sentence_embedding.numpy()"
]
},
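  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e7f2a9d1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sanity check (illustrative): each sentence maps to a single\n",
    "# 768-dimensional vector, BERT-base's hidden size.\n",
    "get_bert_embedding_sentence(\"राम्रो\").shape"
   ]
  },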
{
"cell_type": "code",
"execution_count": 58,
"id": "1da99701",
"metadata": {},
"outputs": [],
"source": [
"# get_bert_embedding_sentence(\"नेपाल को ससकृती ध्वस्त पार्ने योजना\")"
]
},
{
"cell_type": "code",
"execution_count": 69,
"id": "d08f787c",
"metadata": {},
"outputs": [],
"source": [
"df=df.drop(df[df['label']==2].index)"
]
},
{
"cell_type": "code",
"execution_count": 70,
"id": "9c8990f7",
"metadata": {},
"outputs": [],
"source": [
"df.dropna(inplace = True)"
]
},
{
"cell_type": "code",
"execution_count": 75,
"id": "ba7e75a3",
"metadata": {},
"outputs": [],
"source": [
"df['word_embeddings'] = df['text'].apply(get_bert_embedding_sentence)"
]
},
{
"cell_type": "code",
"execution_count": 76,
"id": "edad3099",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(6056, 3)"
]
},
"execution_count": 76,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.shape"
]
},
{
"cell_type": "code",
"execution_count": 77,
"id": "4760c1d1",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" text | \n",
" label | \n",
" word_embeddings | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" 1000 अंक घट नेप्से 200 अंक बढ् ठूलो कुरो होइन ... | \n",
" 1 | \n",
" [-0.2517209, 0.80447733, -0.30090085, 0.363934... | \n",
"
\n",
" \n",
" 4 | \n",
" राम्रो | \n",
" 1 | \n",
" [-0.4275645, 0.90052205, -0.6469192, 0.3758416... | \n",
"
\n",
" \n",
" 6 | \n",
" जानकारी धन्यवाद रामहरी ब्रदर | \n",
" 1 | \n",
" [0.24045938, 0.72639877, -0.11193645, 0.146293... | \n",
"
\n",
" \n",
" 18 | \n",
" भारत-मधेस नेपाल-चीन सम्बन्ध विग्रन्छ, मधेसी ने... | \n",
" 0 | \n",
" [0.15390012, 0.67477095, -0.1543702, -0.212426... | \n",
"
\n",
" \n",
" 25 | \n",
" लेखनाथ न्यौपा खुलासा,महाधिबेशन एमसीसी गर् जुत्... | \n",
" 0 | \n",
" [-0.07738958, 1.039313, -0.1071973, -0.0086015... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" text label \\\n",
"1 1000 अंक घट नेप्से 200 अंक बढ् ठूलो कुरो होइन ... 1 \n",
"4 राम्रो 1 \n",
"6 जानकारी धन्यवाद रामहरी ब्रदर 1 \n",
"18 भारत-मधेस नेपाल-चीन सम्बन्ध विग्रन्छ, मधेसी ने... 0 \n",
"25 लेखनाथ न्यौपा खुलासा,महाधिबेशन एमसीसी गर् जुत्... 0 \n",
"\n",
" word_embeddings \n",
"1 [-0.2517209, 0.80447733, -0.30090085, 0.363934... \n",
"4 [-0.4275645, 0.90052205, -0.6469192, 0.3758416... \n",
"6 [0.24045938, 0.72639877, -0.11193645, 0.146293... \n",
"18 [0.15390012, 0.67477095, -0.1543702, -0.212426... \n",
"25 [-0.07738958, 1.039313, -0.1071973, -0.0086015... "
]
},
"execution_count": 77,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 78,
"id": "bc3840ee",
"metadata": {},
"outputs": [],
"source": [
"# df.to_csv('embedding_data.csv',index = False)"
]
},
{
"cell_type": "code",
"execution_count": 79,
"id": "2da7b924",
"metadata": {},
"outputs": [],
"source": [
"X,y = df['word_embeddings'], df['label']"
]
},
{
"cell_type": "code",
"execution_count": 80,
"id": "6bc72bb6",
"metadata": {},
"outputs": [],
"source": [
"# scaler = StandardScaler()\n",
"# pca = PCA(n_components = 768)"
]
},
{
"cell_type": "code",
"execution_count": 81,
"id": "99ad87ec",
"metadata": {},
"outputs": [],
"source": [
"# scaled_X = scaler.fit_transform(X.tolist())\n",
"# pca_X = pca.fit_transform(scaled_X)"
]
},
{
"cell_type": "code",
"execution_count": 82,
"id": "9689b1a4",
"metadata": {},
"outputs": [],
"source": [
"train_X, test_X, train_y, test_y = train_test_split(X,y, test_size = 0.2, random_state = 420)"
]
},
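  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f0b3c7a2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional alternative (a sketch, not used above): stratify on the label\n",
    "# so train and test keep the same positive/negative ratio.\n",
    "# train_X, test_X, train_y, test_y = train_test_split(\n",
    "#     X, y, test_size=0.2, random_state=420, stratify=y)"
   ]
  },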
{
"cell_type": "code",
"execution_count": 83,
"id": "828e1a7a",
"metadata": {},
"outputs": [],
"source": [
"svc = SVC()"
]
},
{
"cell_type": "code",
"execution_count": 84,
"id": "d6524c9d",
"metadata": {},
"outputs": [],
"source": [
"# train_X = [i[0] for i in train_X]\n",
"# test_X = [i[0] for i in test_X]"
]
},
{
"cell_type": "code",
"execution_count": 85,
"id": "f8311883",
"metadata": {},
"outputs": [],
"source": [
"# train_X[0][0].shape"
]
},
{
"cell_type": "code",
"execution_count": 86,
"id": "2af91c5f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"SVC()"
]
},
"execution_count": 86,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"svc.fit(train_X.tolist(), train_y)\n",
"#svc.fit(train_X, train_y)"
]
},
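  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a8c4d2e9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tuning sketch (illustrative; GridSearchCV is imported but unused above).\n",
    "# A small grid over C and kernel, scored on F1 with 3-fold CV:\n",
    "# param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}\n",
    "# grid = GridSearchCV(SVC(), param_grid, cv=3, scoring='f1')\n",
    "# grid.fit(train_X.tolist(), train_y)\n",
    "# grid.best_params_"
   ]
  },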
{
"cell_type": "code",
"execution_count": 87,
"id": "16d5e606",
"metadata": {},
"outputs": [],
"source": [
"svc_pred = svc.predict(test_X.tolist())\n",
"# svc_pred = svc.predict(test_X)"
]
},
{
"cell_type": "code",
"execution_count": 88,
"id": "fdd814fe",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[424 91]\n",
" [ 79 618]]\n"
]
}
],
"source": [
"print(confusion_matrix(test_y, svc_pred))"
]
},
{
"cell_type": "code",
"execution_count": 89,
"id": "c87a1d85",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" precision recall f1-score support\n",
"\n",
" 0 0.84 0.82 0.83 515\n",
" 1 0.87 0.89 0.88 697\n",
"\n",
" accuracy 0.86 1212\n",
" macro avg 0.86 0.85 0.86 1212\n",
"weighted avg 0.86 0.86 0.86 1212\n",
"\n"
]
}
],
"source": [
"print(classification_report(test_y, svc_pred))"
]
},
{
"cell_type": "code",
"execution_count": 90,
"id": "78fe89bc",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.8597359735973598"
]
},
"execution_count": 90,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"accuracy_score(test_y, svc_pred)"
]
},
{
"cell_type": "code",
"execution_count": 91,
"id": "87c34455",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.8790896159317211"
]
},
"execution_count": 91,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"f1_score(test_y, svc_pred)"
]
},
{
"cell_type": "code",
"execution_count": 92,
"id": "fa889bcb",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"नराम्रो is negative sentiment\n"
]
}
],
"source": [
"sent = \"नराम्रो\"\n",
"predicted_label = svc.predict(np.array(get_bert_embedding_sentence(sent).tolist()).reshape(1,-1))[0]\n",
"if predicted_label == 0:\n",
" print(f'{sent} is negative sentiment')\n",
"else:\n",
" print(f'{sent} is positive sentiment')"
]
},
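  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b5d6e3f8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Helper sketch (not part of the original run): apply the same clean_text\n",
    "# preprocessing used on the training data before embedding, then predict.\n",
    "def predict_sentiment(sentence):\n",
    "    embedding = get_bert_embedding_sentence(clean_text(sentence))\n",
    "    label = svc.predict(embedding.reshape(1, -1))[0]\n",
    "    return 'negative' if label == 0 else 'positive'\n",
    "\n",
    "predict_sentiment(\"नराम्रो\")"
   ]
  },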
{
"cell_type": "code",
"execution_count": 24,
"id": "2c5d51e6",
"metadata": {},
"outputs": [],
"source": [
"pickle.dump(svc, open('scv_sentiment','wb'))"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "cdc460bd",
"metadata": {},
"outputs": [],
"source": [
"svc_sentiment = pickle.load(open('svc_sentiment','rb'))"
]
},
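  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c9e0f4a7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sanity check (illustrative): the reloaded classifier should agree\n",
    "# with the in-memory one on a sample sentence.\n",
    "svc_sentiment.predict(get_bert_embedding_sentence(\"नराम्रो\").reshape(1, -1))[0]"
   ]
  },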
{
"cell_type": "code",
"execution_count": 113,
"id": "0791ecca",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 113,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"svc.predict(np.array(get_bert_embedding_sentence(\"देश बिग्रियो\").tolist()).reshape(1,-1))[0]"
]
  }
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}