import re
import time
import requests
import pickle
import json
import torch
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from langdetect import detect
from nepali_unicode_converter.convert import Converter
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
# dataset = pd.read_csv("/media/gpu/157/Nepali_sentiment_Analysis/collected_labeled_data.csv")

# Daraz review API endpoint; `_id_` is replaced with the numeric item id.
review_url = "https://my.daraz.com.np/pdp/review/getReviewList?itemId=_id_&pageSize=5&filter=0&sort=0&pageNo=1"

# Pickled artifacts: a BERT model, its tokenizer, and an SVC sentiment
# classifier trained on the BERT sentence embeddings.
model = pickle.load(open('bert_model/model', 'rb'))
tokenizers = pickle.load(open('tokenizers.pkl', 'rb'))
svc_sentiment = pickle.load(open('scv_sentiment', 'rb'))

chrome_options = Options()
chrome_options.add_argument("--headless")
def remove_emojis(text):
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # box drawing & misc symbols
        u"\U00002702-\U000027B0"  # dingbats
        u"\U000024C2-\U0001F251"  # enclosed characters
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642"
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # variation selector
        u"\u3030"
        "]+", re.UNICODE)
    return emoji_pattern.sub(r'', text)
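# Example (illustrative): remove_emojis("राम्रो छ 👍😊") -> "राम्रो छ "
# (both code points fall in the pictograph/emoticon ranges above).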
def get_bert_embedding_sentence(input_sentence):
    # Wrap the sentence in BERT's special tokens and convert to ids.
    marked_text = " [CLS] " + input_sentence + " [SEP] "
    tokenized_text = tokenizers.tokenize(marked_text)
    indexed_tokens = tokenizers.convert_tokens_to_ids(tokenized_text)
    segments_ids = [1] * len(indexed_tokens)  # single-sentence input
    tokens_tensors = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])
    with torch.no_grad():
        # Pass the segment ids as token_type_ids (the second positional
        # argument of a Hugging Face BERT forward is attention_mask, not
        # token_type_ids) and request the hidden states explicitly.
        outputs = model(tokens_tensors,
                        token_type_ids=segments_tensors,
                        output_hidden_states=True)
        hidden_states = outputs.hidden_states
    # Mean-pool the token vectors of the second-to-last layer.
    token_vecs = hidden_states[-2][0]
    sentence_embedding = torch.mean(token_vecs, dim=0)
    return sentence_embedding.numpy()
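# Minimal usage sketch (assumes the pickled tokenizer/model pair is the same
# one the SVC was trained against; labels follow the dataset convention
# 1 = positive, 0 = negative):
#   emb = get_bert_embedding_sentence("यो उत्पादन राम्रो छ")
#   label = svc_sentiment.predict(emb.reshape(1, -1))[0]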
def scrap_data(dataset):
    # Split a labeled dataframe (e.g. the commented-out CSV above) into
    # positive and negative rows.
    positive_sentiment = dataset.loc[dataset['label'] == 1]
    negative_sentiment = dataset.loc[dataset['label'] == 0]
    return positive_sentiment, negative_sentiment
def comment_sentiment(text):
    # Comments that langdetect does not tag as a Devanagari-script language
    # (Nepali/Hindi/Marathi) are assumed to be romanized Nepali and are
    # transliterated to Devanagari before embedding.
    lang_list = ["hi", "ne", "mr"]
    converter = Converter()
    if detect(text) not in lang_list:
        text = converter.convert(text)
    embedding = get_bert_embedding_sentence(text)
    svc_pred = svc_sentiment.predict(embedding.reshape(1, -1))[0]
    return svc_pred
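# Example (the actual label depends on the trained SVC):
#   comment_sentiment("yo product ramro cha")   # transliterated, then classified
#   comment_sentiment("यो उत्पादन राम्रो छ")       # classified directly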
def scrape_comment(url):
    # Extract the numeric item id from a Daraz product URL and query the
    # review API for its comments.
    item_id = url.split("-")[-2].replace("i", "")
    api_url = review_url.replace("_id_", item_id)
    response = json.loads(requests.get(api_url).text)
    reviews = response["model"]["items"]
    predicted_label = []
    comment_text = []
    for review in reviews:
        text = review["reviewContent"]
        try:
            predicted_label.append(comment_sentiment(text))
            comment_text.append(text)
        except Exception as e:
            print(e)
    df = pd.DataFrame({"text": comment_text, "label": predicted_label})
    positive_sentiment = df.loc[df['label'] == 1]
    negative_sentiment = df.loc[df['label'] == 0]
    return positive_sentiment, negative_sentiment
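# Usage sketch (placeholder URL; Daraz product links end in
# "...-i<itemId>-s<skuId>.html", which is what the id extraction above expects):
#   positive, negative = scrape_comment(
#       "https://www.daraz.com.np/products/some-product-i104356639-s1026899369.html")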
def scrape_twitter(url):
    '''
    Scrape a tweet's replies from the given tweet URL
    (https://twitter.com/<username>/status/<tweet_id>).
    '''
    driver = webdriver.Chrome("driver/chromedriver", options=chrome_options)
    driver.get(url)
    time.sleep(5)  # adjust to your machine and connection speed
    tweets = []
    done = False
    old_height = driver.execute_script("return document.body.scrollHeight")
    # Seed all_tweets so the loop has something to iterate over.
    all_tweets = driver.find_elements(By.XPATH, '//div[@data-testid]//article[@data-testid="tweet"]')
    while not done:
        for item in all_tweets[1:]:  # skip the tweet itself, keep only replies
            try:
                text = item.find_element(By.XPATH, './/div[@data-testid="tweetText"]').text
            except Exception:
                text = '[empty]'
            tweets.append(text)
        # Scroll down to load more replies.
        driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
        time.sleep(2)
        try:
            # Click the "show more replies" button; Twitter renders two variants.
            try:
                button = driver.find_element(By.CSS_SELECTOR, "div.css-901oao.r-1cvl2hr.r-37j5jr.r-a023e6.r-16dba41.r-rjixqe.r-bcqeeo.r-q4m81j.r-qvutc0")
            except Exception:
                button = driver.find_element(By.CSS_SELECTOR, "div.css-1dbjc4n.r-1ndi9ce")
            ActionChains(driver).move_to_element(button).click(button).perform()
            time.sleep(2)
            driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
            time.sleep(2)
        except Exception:
            pass
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == old_height:
            done = True  # nothing new loaded, stop scrolling
        old_height = new_height
        # Refresh all_tweets so newly loaded replies are picked up.
        all_tweets = driver.find_elements(By.XPATH, '//div[@data-testid]//article[@data-testid="tweet"]')
    driver.close()
    text = []
    predicted_label = []
    for comment in tweets:
        try:
            label = comment_sentiment(comment)
            text.append(remove_emojis(comment))
            predicted_label.append(label)
        except Exception:
            pass
    df = pd.DataFrame({"text": text, "label": predicted_label})
    positive_sentiment = df.loc[df['label'] == 1]
    negative_sentiment = df.loc[df['label'] == 0]
    return positive_sentiment, negative_sentiment
def scrape_youtube(url):
    driver = webdriver.Chrome("driver/chromedriver", options=chrome_options)
    data = []
    wait = WebDriverWait(driver, 15)
    driver.get(url)
    predicted_label = []
    # Press END a few times so YouTube lazy-loads more comments.
    for item in range(5):
        wait.until(EC.visibility_of_element_located((By.TAG_NAME, "body"))).send_keys(Keys.END)
        time.sleep(5)
    for comment in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#content"))):
        data.append(comment.text)
    text = []
    for comments in data:
        try:
            result = comment_sentiment(comments)
            comments = remove_emojis(comments)
            text.append(comments)
            predicted_label.append(result)
        except Exception:
            pass
    driver.close()
    df = pd.DataFrame({"text": text, "label": predicted_label})
    positive_sentiment = df.loc[df['label'] == 1]
    negative_sentiment = df.loc[df['label'] == 0]
    return positive_sentiment, negative_sentiment
if __name__ == "__main__":
    url = "https://www.youtube.com/watch?v=uD58-EHwaeI"
    positive_sentiment, negative_sentiment = scrape_youtube(url=url)
    print(positive_sentiment, negative_sentiment)
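    # The other scrapers follow the same pattern (placeholder URLs):
    # positive_sentiment, negative_sentiment = scrape_twitter("https://twitter.com/<username>/status/<tweet_id>")
    # positive_sentiment, negative_sentiment = scrape_comment("https://www.daraz.com.np/products/<name>-i<itemId>-s<skuId>.html")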