Spaces:
Runtime error
Runtime error
File size: 4,047 Bytes
deb200f a34ad6e deb200f 00c6db8 deb200f 485f576 73b620f c19c9f8 deb200f 8ad74f7 deb200f 8ad74f7 deb200f 8ad74f7 deb200f a27a834 deb200f a27a834 deb200f 898101c deb200f 898101c deb200f 898101c deb200f 898101c deb200f 898101c deb200f a27a834 deb200f a27a834 deb200f 7fe3481 deb200f 7fe3481 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 |
import re
from gensim.models.keyedvectors import KeyedVectors
from transformers import pipeline
import pickle
import numpy as np
import pandas as pd
# Word vectors loaded from disk. The vocabulary is kept as a set so the
# out-of-vocabulary filters in the prediction functions get O(1) lookups.
w2v = KeyedVectors.load('models/word2vec')
w2v_vocab = set(w2v.index_to_key)

# NOTE(security): unpickling executes arbitrary code, so this path must only
# ever point at a trusted model file shipped with the app.
# The context manager closes the handle as soon as the model is loaded.
with open('models/w2v_ovr_svc.sav', 'rb') as _model_file:
    model = pickle.load(_model_file)

# Zero-shot classifier; the prediction functions use it for sentiment.
classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    framework='pt',
)

# Topic names. The prediction functions pair these positionally with the
# model's output columns, so the order matters.
labels = [
    'communication', 'waiting time',
    'information', 'user interface',
    'facilities', 'location', 'price',
]

# sample.csv re-serialised (without the index) as UTF-8 bytes.
# Presumably served as a downloadable example -- confirm against the caller.
sample_file = pd.read_csv('sample.csv').to_csv(index=False).encode('utf-8')

print('utils imported!')
def get_sentiment_label_facebook(list_of_sent_dicts):
    """Collapse a sentiment classification result into a single label.

    Args:
        list_of_sent_dicts: Despite the name, a single mapping with a
            ``'labels'`` sequence; only its first entry is inspected.
            Presumably the zero-shot pipeline output with labels ordered
            best-first -- confirm against the transformers documentation.

    Returns:
        ``'negative'`` when the first label is exactly ``'negative'``,
        otherwise ``'positive'``.

    Raises:
        KeyError: If the mapping has no ``'labels'`` key.
        IndexError: If ``'labels'`` is empty.
    """
    top_label = list_of_sent_dicts['labels'][0]
    return 'negative' if top_label == 'negative' else 'positive'
def get_single_prediction(text):
    """Predict topic probabilities and sentiment for one piece of text.

    The text is lower-cased, stripped of everything except ASCII letters,
    digits and whitespace, and reduced to words present in the word2vec
    vocabulary. The remaining words are averaged into one sentence vector
    for the topic model; the same cleaned text is passed to the zero-shot
    classifier for sentiment.

    Args:
        text: The raw input string.

    Returns:
        A ``(pred_prob, sentiment_prob)`` tuple of DataFrames:
        ``pred_prob`` has columns ``topic`` and ``probability`` (rounded to
        2 decimals, sorted ascending by probability); ``sentiment_prob`` has
        columns ``sentiment`` and ``probability`` taken from the classifier's
        ``labels`` and ``scores``.

    Raises:
        ValueError: If no word of ``text`` is in the word2vec vocabulary.
            Previously this surfaced as an opaque reshape error after
            averaging an empty list.
    """
    # Raw string: a plain literal containing \s is an invalid escape sequence.
    cleaned = re.sub(r'[^0-9a-zA-Z\s]', '', text.lower())

    # Remove OOV words
    tokens = [tok for tok in cleaned.split() if tok in w2v_vocab]
    if not tokens:
        raise ValueError(
            'text contains no words from the word2vec vocabulary'
        )
    cleaned = ' '.join(tokens)

    # Sentence vector = average of word vectors
    sentence_vector = np.mean([w2v[tok] for tok in tokens], axis=0)

    # reshape(1, -1) makes a single-row batch without hard-coding the
    # embedding dimension.
    probabilities = (
        model.predict_proba(sentence_vector.reshape(1, -1)).squeeze().round(2)
    )
    pred_prob = pd.DataFrame(
        {'topic': labels, 'probability': probabilities}
    ).sort_values('probability', ascending=True)

    # Get sentiment
    sentiment_results = classifier(
        cleaned,
        candidate_labels=['positive', 'negative'],
        hypothesis_template='The sentiment of this is {}',
    )
    sentiment_prob = pd.DataFrame({
        'sentiment': sentiment_results['labels'],
        'probability': sentiment_results['scores'],
    })
    return (pred_prob, sentiment_prob)
def get_multiple_predictions(csv):
    """Predict topics and sentiment for every row of a one-column CSV.

    Each row is cleaned the same way as in single prediction: lower-cased,
    stripped of non-alphanumeric characters, and reduced to words in the
    word2vec vocabulary. Rows left with no usable words are not scored;
    they are appended unchanged at the end of the output with the
    prediction columns left empty.

    Args:
        csv: Anything ``pandas.read_csv`` accepts (path or file-like object).
            The file must contain exactly one column.

    Returns:
        UTF-8 encoded CSV bytes with columns ``sequence``, one column per
        entry in ``labels`` (the model's predictions), ``others`` (1 when
        every label column is 0, else 0) and ``sentiment``.

    Raises:
        ValueError: If the CSV does not have exactly one column (raised by
            the column rename).
    """
    df = pd.read_csv(csv)
    df.columns = ['sequence']

    # Missing cells become '' and non-string cells are stringified, so the
    # .str pipeline and the OOV filter below never see NaN or numbers.
    as_text = df['sequence'].map(lambda v: '' if pd.isna(v) else str(v))
    cleaned = (
        as_text
        .str.lower()
        .str.strip()
        # regex=True is required: pandas >= 2.0 treats the pattern as a
        # literal string by default, which would leave punctuation in place.
        .str.replace(r'[^0-9a-zA-Z\s]', '', regex=True)
    )
    # Remove OOV words
    df['sequence_clean'] = cleaned.apply(
        lambda seq: ' '.join(tok for tok in seq.split() if tok in w2v_vocab)
    )

    # Split off rows with nothing left to score. Selecting a fresh frame
    # avoids in-place edits on a slice (SettingWithCopyWarning).
    is_blank = df['sequence_clean'] == ''
    invalid = df.loc[is_blank, ['sequence']]
    df = df.loc[~is_blank].reset_index(drop=True)

    if df.empty:
        # Nothing to score: keep the usual output schema, but skip the model
        # calls, which would fail on an empty batch.
        output_columns = ['sequence', *labels, 'others', 'sentiment']
        return (
            invalid.reindex(columns=output_columns)
            .reset_index(drop=True)
            .to_csv(index=False)
            .encode('utf-8')
        )

    # Sentence vector = average of word vectors
    sentence_vectors = pd.DataFrame([
        np.mean([w2v[tok] for tok in seq.split()], axis=0)
        for seq in df['sequence_clean']
    ])

    # Get predictions. The join aligns on the positional index, which is
    # why df was reset above.
    pred_results = pd.DataFrame(model.predict(sentence_vectors), columns=labels)
    final_results = df.join(pred_results)

    # 'others' flags rows where no topic was predicted
    final_results['others'] = (
        final_results[labels].max(axis=1) == 0
    ).astype(int)

    # Get sentiment labels
    final_results['sentiment'] = [
        get_sentiment_label_facebook(
            classifier(
                seq,
                candidate_labels=['positive', 'negative'],
                hypothesis_template='The sentiment of this is {}',
            )
        )
        for seq in final_results['sequence_clean']
    ]
    final_results = final_results.drop(columns=['sequence_clean'])

    # Append invalid rows
    if invalid.empty:
        return final_results.to_csv(index=False).encode('utf-8')
    return (
        pd.concat([final_results, invalid])
        .reset_index(drop=True)
        .to_csv(index=False)
        .encode('utf-8')
    )