# NOTE(review): the lines above this point in the scraped page ("Spaces:",
# "Runtime error", file size, commit hash, line-number gutter) were Hugging Face
# page chrome, not source code; removed so the module parses.
import nltk
import pandas as pd
from nltk.corpus import stopwords
import re
import numpy as np
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from scipy.special import softmax
class sentimentAnalysis():
    """Sentiment analysis over a text file, for English or Turkish input.

    English uses the cardiffnlp/twitter-roberta-base-sentiment checkpoint
    (3-class softmax over negative/neutral/positive); Turkish uses
    savasy/bert-base-turkish-sentiment-cased through a HF ``pipeline``.
    Results are printed line by line; nothing is returned.
    """

    # Accepted spellings for each language selector (replaces long `or` chains).
    _ENGLISH = {"English", "english", "İngilizce", "ingilizce"}
    _TURKISH = {"Turkish", "turkish", "Türkçe", "türkçe"}

    def __init__(self, lang, text2analysePath):
        # lang: language selector string; text2analysePath: UTF-8 text file,
        # one document per line.
        self.lang = lang
        self.text2analysePath = text2analysePath
        # Order matches the cardiffnlp model's output indices — TODO confirm.
        self.engLabels = ["negative", "neutral", "positive"]
        nltk.download("stopwords")

    def downloadModels(self):
        """Load the model for ``self.lang`` and analyse the input file."""
        # BUG FIX: the original opened the file and never closed it; `with`
        # guarantees the handle is released even if analysis raises.
        with open(self.text2analysePath, 'r', encoding="utf-8") as txtt:
            if self.lang in self._ENGLISH:
                MODEL = "sentimentModels/cardiffnlp/twitter-roberta-base-sentiment"
                self.tokenizer = AutoTokenizer.from_pretrained(MODEL)
                self.model = AutoModelForSequenceClassification.from_pretrained(MODEL)
                # Cache model + tokenizer locally for subsequent runs.
                self.model.save_pretrained(MODEL)
                self.tokenizer.save_pretrained(MODEL)
                self.engPrepareText(txtt)
            elif self.lang in self._TURKISH:
                self.model = AutoModelForSequenceClassification.from_pretrained("savasy/bert-base-turkish-sentiment-cased")
                self.tokenizer = AutoTokenizer.from_pretrained("savasy/bert-base-turkish-sentiment-cased")
                self.sa = pipeline("sentiment-analysis", tokenizer=self.tokenizer, model=self.model)
                self.trPrepareText(txtt)
            else:
                print("Dil bulunamadı!------The language has not been found!")

    def _cleanLines(self, txtt, stopLang):
        """Lowercase each line, strip non-alphanumerics, drop stopwords.

        Returns a list of cleaned strings, one per input line.
        """
        # PERF FIX: the original rebuilt set(stopwords.words(...)) for every
        # word of every line; build it once here.
        stops = set(stopwords.words(stopLang))
        cleaned = []
        for line in txtt:
            line = re.sub("[^a-zA-Z0-9ğüşöçıİĞÜŞÖÇ]", ' ', line.lower())
            kept = [word for word in line.split(' ') if word not in stops]
            cleaned.append(' '.join(kept))
        return cleaned

    def engPrepareText(self, txtt):
        # Duplicate Turkish/English preprocessing now shares _cleanLines.
        dFen = pd.DataFrame(self._cleanLines(txtt, "english"), columns=["texts"])
        self.engAnalyse(dFen)

    def trPrepareText(self, txtt):
        dFtr = pd.DataFrame(self._cleanLines(txtt, "turkish"), columns=["metinler"])
        self.trAnalyse(dFtr)

    def engAnalyse(self, dFen):
        """Print ranked label probabilities for each English text row."""
        for row in range(len(dFen)):
            text = dFen["texts"][row]
            encoded_input = self.tokenizer(text, return_tensors='pt')
            output = self.model(**encoded_input)
            scores = softmax(output[0][0].detach().numpy())
            # Indices of labels sorted by descending probability.
            ranking = np.argsort(scores)[::-1]
            print(f"text: {text}")
            # FIX: the original reused `i` for both loops, shadowing the
            # outer index; distinct names keep the trace readable.
            for rank in range(scores.shape[0]):
                l = self.engLabels[ranking[rank]]
                s = scores[ranking[rank]]
                print(f"{rank + 1}) {l + ':'} {np.round(float(s), 4)}")

    def trAnalyse(self, dFtr):
        """Print positive/negative probabilities for each Turkish text row."""
        for i in range(len(dFtr)):
            text = dFtr["metinler"][i]
            # Pipeline returns the winning label with its confidence; derive
            # the complementary probability for the other class.
            p = self.sa(text)[0]
            if p["label"] == "positive":
                pos, neg = p['score'], 1 - p['score']
            else:
                pos, neg = 1 - p['score'], p['score']
            print(f"text: {text}")
            print(f"1-) positive: {np.round(float(pos), 4)}")
            print(f"2-) negative: {np.round(float(neg), 4)}")
if __name__ == "__main__":
    # Demo entry point: analyse an English text file. Guarded so importing
    # this module no longer triggers model downloads as a side effect.
    lang = "ingilizce"
    path = "texts/denemeler/text.txt"
    # FIX: the original assigned downloadModels()'s None return to sA;
    # keep the instance, then run the analysis.
    sA = sentimentAnalysis(lang, path)
    sA.downloadModels()
# (trailing page-scrape residue removed)