# bitirme-proje / sentimentAnalysis.py
# (uploaded by Bedirhan, commit a1e75ef, 3.87 kB)
import nltk
import pandas as pd
from nltk.corpus import stopwords
import re
import numpy as np
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from scipy.special import softmax
class sentimentAnalysis():
    """Line-by-line sentiment analysis of a text file.

    Supports English (cardiffnlp/twitter-roberta-base-sentiment: raw logits
    passed through softmax over negative/neutral/positive) and Turkish
    (savasy/bert-base-turkish-sentiment-cased via the `sentiment-analysis`
    pipeline). Results are printed, not returned.
    """

    def __init__(self, lang, text2analysePath):
        # lang: language name in English or Turkish spelling (e.g. "English",
        #       "ingilizce", "Turkish", "türkçe").
        # text2analysePath: path to a UTF-8 text file, one document per line.
        self.lang = lang
        self.text2analysePath = text2analysePath
        self.engLabels = ["negative", "neutral", "positive"]
        nltk.download("stopwords")  # required by the prepare-text steps

    def downloadModels(self):
        """Load the model matching ``self.lang`` and analyse the input file.

        Prints an error message when the language is not recognized.
        """
        # `with` guarantees the input file is closed (the original leaked
        # the file handle).
        with open(self.text2analysePath, 'r', encoding="utf-8") as txtt:
            # Exact-string membership on purpose: lowercasing "İngilizce"
            # yields a combining-dot form that would not match "ingilizce".
            if self.lang in ("English", "İngilizce", "ingilizce", "english"):
                MODEL = "sentimentModels/cardiffnlp/twitter-roberta-base-sentiment"
                self.tokenizer = AutoTokenizer.from_pretrained(MODEL)
                self.model = AutoModelForSequenceClassification.from_pretrained(MODEL)
                # Cache weights/tokenizer locally for subsequent runs.
                self.model.save_pretrained(MODEL)
                self.tokenizer.save_pretrained(MODEL)
                self.engPrepareText(txtt)
            elif self.lang in ("Turkish", "Türkçe", "türkçe", "turkish"):
                self.model = AutoModelForSequenceClassification.from_pretrained(
                    "savasy/bert-base-turkish-sentiment-cased")
                self.tokenizer = AutoTokenizer.from_pretrained(
                    "savasy/bert-base-turkish-sentiment-cased")
                self.sa = pipeline("sentiment-analysis",
                                   tokenizer=self.tokenizer, model=self.model)
                self.trPrepareText(txtt)
            else:
                print("Dil bulunamadı!------The language has not been found!")

    @staticmethod
    def _clean_lines(lines, stop_words):
        """Lowercase each line, replace non-alphanumerics with spaces and
        drop stopwords.

        lines: iterable of raw text lines.
        stop_words: any collection of stopwords (membership-tested).
        Returns a list of cleaned strings, one per input line. Empty tokens
        produced by consecutive separators are preserved, matching the
        original behavior.
        """
        stop_set = set(stop_words)  # build once — O(1) membership per word
        cleaned = []
        for line in lines:
            line = re.sub("[^a-zA-Z0-9ğüşöçıİĞÜŞÖÇ]", ' ', line.lower())
            kept = [word for word in line.split(' ') if word not in stop_set]
            cleaned.append(' '.join(kept))
        return cleaned

    def engPrepareText(self, txtt):
        """Clean the English input lines and run the analysis."""
        # The original rebuilt set(stopwords.words(...)) per line; the
        # helper hoists it to a single construction.
        texts = self._clean_lines(txtt.readlines(), stopwords.words("english"))
        self.engAnalyse(pd.DataFrame(texts, columns=["texts"]))

    def trPrepareText(self, txtt):
        """Clean the Turkish input lines and run the analysis."""
        texts = self._clean_lines(txtt.readlines(), stopwords.words("turkish"))
        self.trAnalyse(pd.DataFrame(texts, columns=["metinler"]))

    def engAnalyse(self, dFen):
        """Print a ranked negative/neutral/positive score table per text."""
        for row in range(len(dFen)):
            text = dFen["texts"][row]
            encoded_input = self.tokenizer(text, return_tensors='pt')
            output = self.model(**encoded_input)
            scores = softmax(output[0][0].detach().numpy())
            ranking = np.argsort(scores)[::-1]  # highest score first
            print(f"text: {text}")
            # Distinct loop variables: the original shadowed the outer `i`.
            for rank, label_idx in enumerate(ranking, start=1):
                l = self.engLabels[label_idx]
                s = scores[label_idx]
                print(f"{rank}) {l + ':'} {np.round(float(s), 4)}")

    def trAnalyse(self, dFtr):
        """Print positive/negative scores per text (binary Turkish model)."""
        for row in range(len(dFtr)):
            text = dFtr["metinler"][row]
            p = self.sa(text)[0]
            # The pipeline returns only the winning label; derive the
            # positive probability, the complement is the negative one.
            pos = p["score"] if p["label"] == "positive" else 1 - p["score"]
            print(f"text: {text}")
            print(f"1-) positive: {np.round(float(pos), 4)}")
            print(f"2-) negative: {np.round(float(1 - pos), 4)}")
if __name__ == "__main__":
    # Demo entry point. Guarded so importing this module does not trigger
    # NLTK/model downloads and a full analysis run as a side effect.
    lang = "ingilizce"
    path = "texts/denemeler/text.txt"
    analyser = sentimentAnalysis(lang, path)
    # downloadModels() returns None — the original bound its result,
    # discarding the analyser instance. Keep the instance instead.
    analyser.downloadModels()