|
|
|
"""G project.ipynb |
|
|
|
Automatically generated by Colab. |
|
|
|
Original file is located at |
|
https://colab.research.google.com/drive/13NvZhwwfiJloW8ZsdQ6HLf-jfSRc-tfv |
|
""" |
|
|
|
!wget "https://alt.qcri.org/resources/OSACT2022/OSACT2022-sharedTask-train.txt" |
|
!wget "https://alt.qcri.org/resources/OSACT2022/OSACT2022-sharedTask-dev.txt" |
|
!wget "https://alt.qcri.org/resources/OSACT2022/OSACT2022-sharedTask-test-tweets.txt" |
|
!wget "https://alt.qcri.org/resources1/OSACT2022/OSACT2022-sharedTask-test-taskA-gold-labels.txt" |
|
|
|
import pandas as pd |
|
import csv |
|
# The shared-task files have no header row; train/dev lines are tab-separated as
# id, tweet, OFF/NOT_OFF, HS/NOT_HS, VLG/NOT_VLG, VIO/NOT_VIO, and the test file has only id and tweet.
# Reading with header=None keeps the first tweet of each split from being consumed as a header.
columns = ["id", "Text", "label", "hs", "vlg", "vio"]
train_data = pd.read_csv("OSACT2022-sharedTask-train.txt", sep="\t", quoting=csv.QUOTE_NONE, header=None, names=columns)
dev_data = pd.read_csv("OSACT2022-sharedTask-dev.txt", sep="\t", quoting=csv.QUOTE_NONE, header=None, names=columns)
test_data = pd.read_csv("OSACT2022-sharedTask-test-tweets.txt", sep="\t", quoting=csv.QUOTE_NONE, header=None, names=["id", "Text"])
|
train_data |
|
|
|
# Keep only the tweet text and the Task A (offensive) label.
train_data = train_data.drop(columns=["id", "hs", "vlg", "vio"])

train_data
|
|
|
dev_data |
|
|
|
dev_data = dev_data.drop(columns=["id", "hs", "vlg", "vio"])

dev_data
|
|
|
test_data |
|
|
|
test_data = test_data.drop(columns=["id"])

test_data
|
|
|
# Task A gold labels: one OFF/NOT_OFF label per line, in the same order as the test tweets.
test_labels = pd.read_csv("OSACT2022-sharedTask-test-taskA-gold-labels.txt", sep="\t", quoting=csv.QUOTE_NONE, header=None, names=["label"])
test_data = test_data.join(test_labels)
|
test_data |
|
|
|
"""# **DOWNLOADING A LIST OF ARABIC STOPWORDS**""" |
|
|
|
|
|
|
|
|
|
!wget https://raw.githubusercontent.com/alaa-a-a/multi-dialect-arabic-stop-words/main/Stop-words/stop_list_1177.txt |
|
arabic_stop_words = []
with open('./stop_list_1177.txt', encoding='utf-8') as f:
    for word in f.readlines():
        arabic_stop_words.append(word.strip())
|
|
|
import nltk |
|
from nltk.corpus import stopwords |
|
from nltk.tokenize import WordPunctTokenizer |
|
from nltk.stem.isri import ISRIStemmer |
|
import string |
|
import re |
|
from bs4 import BeautifulSoup |
|
nltk.download('stopwords') |
|
|
|
|
|
tok = WordPunctTokenizer() |
|
|
|
def normalize_arabic(text):
    text = re.sub("[إأآا]", "ا", text)   # unify alef variants
    text = re.sub("ى", "ي", text)        # alef maqsura -> ya
    text = re.sub("ؤ", "ء", text)        # hamza on waw -> bare hamza
    text = re.sub("ئ", "ء", text)        # hamza on ya -> bare hamza
    text = re.sub("ة", "ه", text)        # ta marbuta -> ha
    text = re.sub("گ", "ك", text)        # gaf -> kaf
    return text
|
|
|
|
|
def remove_diacritics(text):
    arabic_diacritics = re.compile("""
                             ّ    | # Tashdid
                             َ    | # Fatha
                             ً    | # Tanwin Fath
                             ُ    | # Damma
                             ٌ    | # Tanwin Damm
                             ِ    | # Kasra
                             ٍ    | # Tanwin Kasr
                             ْ    | # Sukun
                             ـ     # Tatwil/Kashida
                         """, re.VERBOSE)
    return re.sub(arabic_diacritics, '', text)
|
|
|
|
|
def remove_punctuations(text):
    arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
    english_punctuations = string.punctuation
    punctuations_list = arabic_punctuations + english_punctuations
    translator = str.maketrans('', '', punctuations_list)
    return text.translate(translator)
|
|
|
|
|
def remove_repeating_char(text):
    # Collapse runs of three or more identical characters down to two (removes elongation).
    return re.sub(r'(.)\1+', r'\1\1', text)
|
|
|
def remove_stop_words(text):
    word_list = nltk.tokenize.wordpunct_tokenize(text.lower())
    word_list = [w for w in word_list if w not in arabic_stop_words]
    return (" ".join(word_list)).strip()
|
|
|
|
|
|
|
def remove_non_arabic_letters(text): |
|
text = re.sub(r'([@A-Za-z0-9_]+)|#|http\S+', ' ', text) |
|
text = re.sub(r'ููููููููููููู', '', text) |
|
return text |
|
|
|
|
|
|
|
|
|
def clean_str(text): |
|
text = remove_non_arabic_letters(text) |
|
text = remove_punctuations(text) |
|
text = remove_diacritics(text) |
|
text = remove_repeating_char(text) |
|
|
|
|
|
|
|
soup = BeautifulSoup(text, 'lxml') |
|
souped = soup.get_text() |
|
pat1 = r'@[A-Za-z0-9]+' |
|
pat2 = r'https?://[A-Za-z0-9./]+' |
|
combined_pat = r'|'.join((pat1, pat2)) |
|
stripped = re.sub(combined_pat, '', souped) |
|
    try:
        clean = stripped.decode("utf-8-sig").replace(u"\ufffd", "?")
    except AttributeError:
        # In Python 3, `stripped` is already a str, so there is nothing to decode.
        clean = stripped
|
|
|
words = tok.tokenize(clean) |
|
return (" ".join(words)).strip() |
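# Quick sanity check of the cleaning pipeline (illustrative only; the tweet below is made up,
# not taken from the dataset). Mentions, links, punctuation and diacritics should be stripped,
# and character elongation collapsed.
sample_tweet = "@USER هذاااا مثااال رااائع!! http://example.com"
print(clean_str(sample_tweet))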
|
|
|
"""## **applying preprocessing on our dataset**""" |
|
|
|
print("Cleaning and parsing the training dataset...\n") |
|
|
|
train_data["Text"] = train_data["Text"].apply(lambda x: clean_str(x)) |
|
|
|
train_data.head() |
|
|
|
print("Cleaning and parsing the development dataset...\n") |
|
|
|
dev_data["Text"] = dev_data["Text"].apply(lambda x: clean_str(x)) |
|
|
|
dev_data.head() |
|
|
|
print("Cleaning and parsing the test dataset...\n") |
|
|
|
test_data["Text"] = test_data["Text"].apply(lambda x: clean_str(x)) |
|
|
|
test_data.head() |
|
|
|
label2id = {"NOT_OFF": 0,"OFF": 1} |
|
id2label = {0: "NOT_OFF", 1: "OFF"} |
|
|
|
train_data['label'] = train_data['label'].apply(lambda x: label2id[x]) |
|
train_data=train_data[["Text", "label"]] |
|
train_data.head() |
|
|
|
dev_data['label'] = dev_data['label'].apply(lambda x: label2id[x]) |
|
dev_data=dev_data[["Text", "label"]] |
|
dev_data.head() |
|
|
|
test_data['label'] = test_data['label'].apply(lambda x: label2id[x]) |
|
test_data=test_data[["Text", "label"]] |
|
test_data |
|
|
|
import pandas as pd |
|
from imblearn.over_sampling import RandomOverSampler |
|
from collections import Counter |
|
|
|
X = train_data[['Text']] |
|
y = train_data['label'] |
|
|
|
print('Original class distribution:', Counter(y)) |
|
|
|
ros = RandomOverSampler(random_state=42) |
|
|
|
X_resampled, y_resampled = ros.fit_resample(X, y) |
|
|
|
train_data_resampled = pd.DataFrame(X_resampled, columns=['Text']) |
|
train_data_resampled['label'] = y_resampled |
|
|
|
print('Resampled class distribution:', Counter(y_resampled)) |
|
|
|
y_resampled.value_counts() |
|
|
|
train_data_resampled.head() |
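# Note: RandomOverSampler duplicates randomly chosen minority-class training tweets until both
# classes have the same count; dev and test keep their natural class distribution, so evaluation
# is not distorted by the resampling.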
|
|
|
from sklearn.model_selection import train_test_split |
|
|
|
X_train = train_data_resampled['Text'].values |
|
y_train = train_data_resampled['label'].values |
|
|
|
X_val = dev_data['Text'].values |
|
y_val = dev_data['label'].values |
|
|
|
|
|
|
|
print("Training data shape:", X_train.shape, y_train.shape) |
|
print("Validation data shape:", X_val.shape, y_val.shape) |
|
|
|
train_text_lengths = [len(text.split()) for text in X_train] |
|
max_length = max(train_text_lengths) |
|
|
|
print("Maximum length of text:", max_length) |
|
|
|
"""### APPLYING QARIB MODEL""" |
|
|
|
! pip install transformers[torch] |
|
|
|
import numpy as np |
|
|
|
|
|
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix, precision_score , recall_score |
|
|
|
from transformers import AutoConfig, BertForSequenceClassification, AutoTokenizer |
|
from transformers.data.processors import SingleSentenceClassificationProcessor, InputFeatures |
|
from transformers import Trainer , TrainingArguments |
|
|
|
train_df = pd.DataFrame({ |
|
'label':y_train, |
|
'text': X_train |
|
}) |
|
|
|
dev_df = pd.DataFrame({ |
|
'label':y_val, |
|
'text': X_val |
|
}) |
|
|
|
test_df = pd.DataFrame({ |
|
'label':test_data['label'], |
|
'text': test_data['Text'] |
|
}) |
|
|
|
PREFIX_LIST = [
    "ال",
    "و",
    "ف",
    "ب",
    "ك",
    "ل",
    "لل",
    "\u0627\u0644",
    "\u0648",
    "\u0641",
    "\u0628",
    "\u0643",
    "\u0644",
    "\u0644\u0644",
    "س",
]
|
SUFFIX_LIST = [
    "ه",
    "ها",
    "ك",
    "ي",
    "هما",
    "كما",
    "نا",
    "كم",
    "هم",
    "هن",
    "كن",
    "ا",
    "ان",
    "ين",
    "ون",
    "وا",
    "ات",
    "ت",
    "ن",
    "ة",
    "\u0647",
    "\u0647\u0627",
    "\u0643",
    "\u064a",
    "\u0647\u0645\u0627",
    "\u0643\u0645\u0627",
    "\u0646\u0627",
    "\u0643\u0645",
    "\u0647\u0645",
    "\u0647\u0646",
    "\u0643\u0646",
    "\u0627",
    "\u0627\u0646",
    "\u064a\u0646",
    "\u0648\u0646",
    "\u0648\u0627",
    "\u0627\u062a",
    "\u062a",
    "\u0646",
    "\u0629",
]
|
|
|
|
|
|
|
_PREFIX_SYMBOLS = [x + "+" for x in PREFIX_LIST] |
|
_SUFFIX_SYMBOLS = ["+" + x for x in SUFFIX_LIST] |
|
NEVER_SPLIT_TOKENS = list(set(_PREFIX_SYMBOLS + _SUFFIX_SYMBOLS)) |
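# The "+"-marked clitic forms are handed to the tokenizer via never_split below, so that
# pre-segmented prefixes/suffixes (e.g. "ال+", "+ها") are kept as single tokens if they
# appear in the input text.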
|
|
|
model_name = "qarib/bert-base-qarib" |
|
num_labels = 2 |
|
config = AutoConfig.from_pretrained(model_name,num_labels=num_labels, output_attentions=True) |
|
tokenizer = AutoTokenizer.from_pretrained(model_name, |
|
do_lower_case=False, |
|
do_basic_tokenize=True, |
|
never_split=NEVER_SPLIT_TOKENS) |
|
tokenizer.model_max_length = 64  # formerly tokenizer.max_len; renamed in newer transformers
|
model = BertForSequenceClassification.from_pretrained(model_name, config=config) |
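# With output_attentions=True the model also returns attention weights, which is why
# compute_metrics below reads the logits from p.predictions[0].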
|
|
|
train_dataset = SingleSentenceClassificationProcessor(mode='classification') |
|
dev_dataset = SingleSentenceClassificationProcessor(mode='classification') |
|
|
|
train_dataset.add_examples(texts_or_text_and_labels=train_df['text'],labels=train_df['label'],overwrite_examples = True) |
|
dev_dataset.add_examples(texts_or_text_and_labels=dev_df['text'],labels=dev_df['label'],overwrite_examples = True) |
|
print(train_dataset.examples[0]) |
|
|
|
train_features = train_dataset.get_features(tokenizer = tokenizer, max_length =64) |
|
dev_features = dev_dataset.get_features(tokenizer = tokenizer, max_length =64) |
|
|
|
|
|
print(len(train_features)) |
|
print(len(dev_features)) |
|
|
|
def compute_metrics(p): |
|
print(np.shape(p.predictions[0])) |
|
print(np.shape(p.predictions[1])) |
|
print(len(p.label_ids)) |
|
preds = np.argmax(p.predictions[0], axis=1) |
|
assert len(preds) == len(p.label_ids) |
|
print(classification_report(p.label_ids,preds)) |
|
print(confusion_matrix(p.label_ids,preds)) |
|
|
|
macro_f1 = f1_score(p.label_ids,preds,average='macro') |
|
macro_precision = precision_score(p.label_ids,preds,average='macro') |
|
macro_recall = recall_score(p.label_ids,preds,average='macro') |
|
acc = accuracy_score(p.label_ids,preds) |
|
return { |
|
'macro_f1' : macro_f1, |
|
'macro_precision': macro_precision, |
|
'macro_recall': macro_recall, |
|
'accuracy': acc |
|
} |
|
|
|
! mkdir train |
|
training_args = TrainingArguments("./train") |
|
training_args.do_train = True |
|
training_args.evaluate_during_training = True  # no effect on recent transformers (superseded by evaluation_strategy); kept for older versions
|
training_args.adam_epsilon = 1e-8 |
|
training_args.learning_rate = 2e-5 |
|
training_args.warmup_steps = 0 |
|
training_args.per_device_train_batch_size = 64 |
|
training_args.per_device_eval_batch_size = 64 |
|
training_args.num_train_epochs = 2 |
|
training_args.logging_steps = 300 |
|
training_args.save_steps = 2000 |
|
training_args.seed = 42 |
|
print(training_args.logging_steps) |
|
|
|
|
|
trainer = Trainer(model=model, |
|
args = training_args, |
|
train_dataset = train_features, |
|
eval_dataset = dev_features, |
|
compute_metrics = compute_metrics) |
|
|
|
trainer.train() |
|
|
|
trainer.evaluate() |
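# Optional sketch (not part of the original run): evaluate the fine-tuned model on the held-out
# test split as well, reusing the same feature-extraction path used for train/dev above.
test_eval_dataset = SingleSentenceClassificationProcessor(mode='classification')
test_eval_dataset.add_examples(texts_or_text_and_labels=test_df['text'], labels=test_df['label'], overwrite_examples=True)
test_eval_features = test_eval_dataset.get_features(tokenizer=tokenizer, max_length=64)
print(trainer.evaluate(eval_dataset=test_eval_features))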
|
|
|
!pip install fasttext |
|
import fasttext |
|
import fasttext.util |
|
from huggingface_hub import hf_hub_download |
|
|
|
model_path = hf_hub_download(repo_id="facebook/fasttext-ar-vectors", filename="model.bin") |
|
|
|
model_fasttext = fasttext.load_model(model_path) |
|
|
|
|
|
print(len(model_fasttext.words)) |
|
model_fasttext['bread'].shape |
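# These fastText Arabic vectors are used later (in select_best_replacement) to measure, via
# cosine similarity, how close each MARBERT candidate replacement is to the word it replaces.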
|
|
|
import nltk |
|
from nltk.corpus import stopwords |
|
from nltk.tokenize import WordPunctTokenizer |
|
from nltk.stem.isri import ISRIStemmer |
|
import string |
|
import re |
|
from bs4 import BeautifulSoup |
|
nltk.download('stopwords') |
|
|
|
|
|
tok = WordPunctTokenizer() |
|
|
|
def normalize_arabic(text):
    text = re.sub("[إأآا]", "ا", text)   # unify alef variants
    text = re.sub("ى", "ي", text)        # alef maqsura -> ya
    text = re.sub("ؤ", "ء", text)        # hamza on waw -> bare hamza
    text = re.sub("ئ", "ء", text)        # hamza on ya -> bare hamza
    text = re.sub("ة", "ه", text)        # ta marbuta -> ha
    text = re.sub("گ", "ك", text)        # gaf -> kaf
    return text
|
|
|
|
|
def remove_diacritics(text):
    arabic_diacritics = re.compile("""
                             ّ    | # Tashdid
                             َ    | # Fatha
                             ً    | # Tanwin Fath
                             ُ    | # Damma
                             ٌ    | # Tanwin Damm
                             ِ    | # Kasra
                             ٍ    | # Tanwin Kasr
                             ْ    | # Sukun
                             ـ     # Tatwil/Kashida
                         """, re.VERBOSE)
    return re.sub(arabic_diacritics, '', text)
|
|
|
|
|
def remove_punctuations(text):
    arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
    english_punctuations = string.punctuation
    punctuations_list = arabic_punctuations + english_punctuations
    translator = str.maketrans('', '', punctuations_list)
    return text.translate(translator)
|
|
|
|
|
def remove_repeating_char(text): |
|
|
|
return re.sub(r'(.)\1+', r'\1\1', text) |
|
|
|
def remove_stop_words(text):
    englishStopWords = stopwords.words('english')
    all_stopwords = set(englishStopWords + arabic_stop_words)

    word_list = nltk.tokenize.wordpunct_tokenize(text.lower())
    word_list = [w for w in word_list if w not in all_stopwords]
    return (" ".join(word_list)).strip()
|
|
|
def get_root(text): |
|
word_list = nltk.tokenize.wordpunct_tokenize(text.lower()) |
|
result = [] |
|
arstemmer = ISRIStemmer() |
|
for word in word_list: result.append(arstemmer.stem(word)) |
|
return (' '.join(result)).strip() |
|
|
|
def clean_tweet(text): |
|
text = re.sub(r'([@A-Za-z0-9_]+)|#|http\S+', ' ', text) |
|
text = re.sub(r'ููููููููููููู', '', text) |
|
return text |
|
|
|
|
|
|
|
|
|
def clean_str(text): |
|
text = clean_tweet(text) |
|
|
|
text = remove_punctuations(text) |
|
text = remove_diacritics(text) |
|
text = remove_repeating_char(text) |
|
|
|
|
|
|
|
    # collapse doubled long vowels left over after de-elongation
    text = text.replace('وو', 'و')
    text = text.replace('يي', 'ي')
    text = text.replace('اا', 'ا')
|
|
|
|
|
|
|
soup = BeautifulSoup(text, 'lxml') |
|
souped = soup.get_text() |
|
pat1 = r'@[A-Za-z0-9]+' |
|
pat2 = r'https?://[A-Za-z0-9./]+' |
|
combined_pat = r'|'.join((pat1, pat2)) |
|
stripped = re.sub(combined_pat, '', souped) |
|
    try:
        clean = stripped.decode("utf-8-sig").replace(u"\ufffd", "?")
    except AttributeError:
        # In Python 3, `stripped` is already a str, so there is nothing to decode.
        clean = stripped
|
|
|
words = tok.tokenize(clean) |
|
return (" ".join(words)).strip() |
|
|
|
!gdown "165kzfZDsRTZAAfZKedeZiUlKzMcHNgPd" |
|
!gdown "1WdgbvqDYIa-g5ijjsz5zb-3lVvUXUtmS&confirm=t" |
|
!gdown "1foNTGFjhWAxS-_SfF7rga80UmFT7BDJ0&confirm=t" |
|
|
|
!pip install pyarabic |
|
!pip install farasapy |
|
!pip install transformers[torch] |
|
!pip install Keras-Preprocessing |
|
|
|
! git clone https://github.com/facebookresearch/fastText.git |
|
! cd fastText && sudo pip install . |
|
|
|
from transformers import pipeline |
|
unmasker_MARBERT = pipeline('fill-mask', model='UBC-NLP/MARBERT', top_k=50) |
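# Illustrative only (the masked sentence is made up): each candidate returned by the fill-mask
# pipeline is a dict with 'token_str', 'sequence' and 'score'; select_best_replacement below
# relies on 'token_str' and 'sequence'.
for candidate in unmasker_MARBERT("الطقس [MASK] اليوم")[:3]:
    print(candidate['token_str'], round(candidate['score'], 3))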
|
|
|
def light_preprocess(text): |
|
text = clean_tweet(text) |
|
|
|
text = remove_punctuations(text) |
|
text = remove_diacritics(text) |
|
text = remove_repeating_char(text) |
|
    # collapse doubled long vowels left over after de-elongation
    text = text.replace('وو', 'و')
    text = text.replace('يي', 'ي')
    text = text.replace('اا', 'ا')
|
return text |
|
|
|
nltk.download('stopwords') |
|
englishStopWords = stopwords.words('english') |
|
arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
|
english_punctuations = string.punctuation |
|
punctuations_list = arabic_punctuations + english_punctuations |
|
|
|
all_stopwords = set(englishStopWords + arabic_stop_words) |
|
|
|
!pip install torch |
|
|
|
import torch |
|
|
|
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") |
|
def classsify_tweets(tweet): |
|
df = pd.DataFrame({"tweet": tweet}) |
|
df['clean_tweet'] = df['tweet'].apply(lambda x: clean_str(x)) |
|
|
|
dev_df = pd.DataFrame({ |
|
'id':range(len(df)), |
|
'text': df["clean_tweet"] |
|
}) |
|
|
|
test_example = SingleSentenceClassificationProcessor(mode='classification') |
|
test_example.add_examples(texts_or_text_and_labels=dev_df['text'], overwrite_examples = True) |
|
|
|
test_features = test_example.get_features(tokenizer = tokenizer, max_length =64) |
|
|
|
input_ids = [i.input_ids for i in test_features] |
|
attention_masks = [i.attention_mask for i in test_features] |
|
|
|
inputs = torch.tensor(input_ids) |
|
masks = torch.tensor(attention_masks) |
|
|
|
|
|
model.eval() |
|
|
|
|
|
model.to(device) |
|
|
|
torch.cuda.empty_cache() |
|
|
|
inputs = inputs.to(device) |
|
masks = masks.to(device) |
|
|
|
|
|
output = model(inputs, attention_mask=masks)["logits"] |
|
|
|
output = output.cpu().detach().numpy() |
|
|
|
return output |
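# classsify_tweets returns the raw logits with shape (len(tweet), 2); argmax over axis 1 yields
# the predicted class id (0 = NOT_OFF, 1 = OFF).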
|
|
|
size = len(test_data) |
|
print("size of test set:", size) |
|
correct_class_tweets = [] |
|
correct_class = [] |
|
for i in range(0, size):
    txt = test_data['Text'].astype('U')[i]
    cls = test_data['label'][i]
    label = id2label[np.argmax(classsify_tweets([txt]), axis=1)[0]]
    # Keep tweets whose gold label is OFF (1) and which the model also predicts as OFF.
    if cls == 1 and label == "OFF":
        correct_class_tweets.append(txt)
        correct_class.append(cls)
|
|
|
from scipy.spatial import distance |
|
from farasa.stemmer import FarasaStemmer |
|
frasa_stemmer = FarasaStemmer(interactive=True) |
|
|
|
!pip install emoji |
|
|
|
import emoji |
|
|
|
def select_best_replacement(pos, x_cur, verbose=False): |
|
""" Select the most effective replacement to word at pos (pos) in (x_cur)""" |
|
|
|
if bool(emoji.emoji_count(x_cur.split()[pos])): |
|
return None |
|
|
|
embedding_masked_word = model_fasttext[x_cur.split()[pos]] |
|
|
|
x_masked = (" ".join(x_cur.split()[:pos]) + " [MASK] " + " ".join(x_cur.split()[pos + 1:])).strip() |
|
unmasked_seq = unmasker_MARBERT(x_masked)[:20] |
|
|
|
max_sim = -1 |
|
best_perturb_dict = {} |
|
for seq in unmasked_seq: |
|
if frasa_stemmer.stem(seq['token_str']) in frasa_stemmer.stem(x_cur.split()[pos]): |
|
continue |
|
if seq['token_str'] in punctuations_list or pos >= len(seq["sequence"].split()): |
|
continue |
|
embedding_masked_word_new = model_fasttext[seq['token_str']] |
|
if np.sum(embedding_masked_word) == 0 or np.sum(embedding_masked_word_new) == 0: |
|
continue |
|
if verbose: print("New word: ", seq['token_str']) |
|
sim = 1 - distance.cosine(embedding_masked_word, embedding_masked_word_new) |
|
if sim > max_sim: |
|
max_sim = sim |
|
best_perturb_dict["sim"] = sim |
|
best_perturb_dict["Masked word"] = x_cur.split()[pos] |
|
best_perturb_dict["New word"] = seq['token_str'] |
|
best_perturb_dict["New seq"] = x_cur.replace(x_cur.split()[pos], seq['token_str']) |
|
|
|
return best_perturb_dict.get("New seq", None) |
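# Word-level attack (loop below): for each correctly classified offensive tweet, every position
# is masked in turn, MARBERT proposes replacements, and the fastText-most-similar candidate is
# kept. A perturbation counts as successful if the substituted tweet flips the classifier's label.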
|
|
|
|
|
perturb_counter = 0 |
|
for tweet_ix, tweet in enumerate(correct_class_tweets): |
|
print("Tweet index: ", tweet_ix) |
|
|
|
x_adv = light_preprocess(tweet) |
|
x_len = len(x_adv.split()) |
|
orig_class = np.argmax(classsify_tweets([x_adv]), axis=1)[0] |
|
orig_label = id2label[orig_class] |
|
print(f"Original tweet: {x_adv} : Original label: {orig_label}.") |
|
splits = len(x_adv.split()) |
|
perturbed_flag = False |
|
for split_ix in range(splits): |
|
perturbed = select_best_replacement(split_ix, x_adv) |
|
if perturbed: |
|
new_class = np.argmax(classsify_tweets([perturbed]), axis=1)[0] |
|
if orig_class != new_class: |
|
print(f"Perturbed tweet: {perturbed} : New label: {id2label[new_class]}.") |
|
print(10 * "==") |
|
if not perturbed_flag: |
|
perturb_counter += 1 |
|
perturbed_flag = True |
|
if not perturbed_flag: |
|
print(10 * "==") |
|
print(f"Successful perturbation {perturb_counter} out of {len(correct_class_tweets)}.") |
|
|
|
off_tweets_count = sum(test_data['label'] == 1 ) |
|
print(f"Number of offensive tweets in the dataset: {off_tweets_count}") |
|
|
|
size = len(test_data) |
|
print("size of test set:", size) |
|
correct_class_tweets = [] |
|
correct_class = [] |
|
for i in range(0, size):
    txt = test_data['Text'].astype('U')[i]
    cls = test_data['label'][i]
    label = id2label[np.argmax(classsify_tweets([txt]), axis=1)[0]]
    print(f"Tweet: {txt} | Actual: {id2label[cls]} | Predicted: {label}")
    if cls == 1 and label == "OFF":
        correct_class_tweets.append(txt)
        correct_class.append(cls)
        print(f"Correctly classified as OFF: {txt}")
|
|
|
!pip install gradio |
|
|
|
import gradio as gr |
|
import torch |
|
from transformers import AutoModelForSequenceClassification, AutoTokenizer |
|
|
|
|
|
# Note: this reloads the base QARIB checkpoint with a freshly initialized classification head.
# To serve the classifier fine-tuned above, point from_pretrained at the Trainer's saved checkpoint instead.
model_name = "qarib/bert-base-qarib"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
|
|
|
|
|
def light_preprocess(text): |
|
text = text.replace("@USER", "").replace("RT", "").strip() |
|
return text |
|
|
|
|
|
def predict_offensive(text): |
|
preprocessed_text = light_preprocess(text) |
|
inputs = tokenizer(preprocessed_text, return_tensors="pt", truncation=True, padding=True) |
|
with torch.no_grad(): |
|
outputs = model(**inputs) |
|
logits = outputs.logits |
|
predicted_class = torch.argmax(logits, dim=1).item() |
|
return "Offensive" if predicted_class == 1 else "Not Offensive" |
|
|
|
|
|
iface = gr.Interface( |
|
fn=predict_offensive, |
|
inputs=gr.Textbox(lines=2, placeholder="Enter text here..."), |
|
outputs="text", |
|
title="Offensive Language Detection", |
|
description="Enter a text to check if it's offensive or not.", |
|
) |
|
|
|
|
|
iface.launch() |
|
|
|
import gradio as gr |
|
import torch |
|
from transformers import AutoModelForSequenceClassification, AutoTokenizer |
|
|
|
|
|
model_name_1 = "qarib/bert-base-qarib" |
|
model_name_2 = "bert-base-multilingual-cased" |
|
tokenizer_1 = AutoTokenizer.from_pretrained(model_name_1) |
|
model_1 = AutoModelForSequenceClassification.from_pretrained(model_name_1, num_labels=2) |
|
|
|
tokenizer_2 = AutoTokenizer.from_pretrained(model_name_2) |
|
model_2 = AutoModelForSequenceClassification.from_pretrained(model_name_2, num_labels=2) |
|
|
|
|
|
def light_preprocess(text): |
|
text = text.replace("@USER", "").replace("RT", "").strip() |
|
return text |
|
|
|
|
|
def predict_offensive(text, model_choice): |
|
if model_choice == "Model 1": |
|
tokenizer = tokenizer_1 |
|
model = model_1 |
|
else: |
|
tokenizer = tokenizer_2 |
|
model = model_2 |
|
|
|
preprocessed_text = light_preprocess(text) |
|
inputs = tokenizer(preprocessed_text, return_tensors="pt", truncation=True, padding=True) |
|
with torch.no_grad(): |
|
outputs = model(**inputs) |
|
logits = outputs.logits |
|
predicted_class = torch.argmax(logits, dim=1).item() |
|
return "Offensive" if predicted_class == 1 else "Not Offensive" |
|
|
|
|
|
iface = gr.Interface( |
|
fn=predict_offensive, |
|
inputs=[ |
|
gr.Textbox(lines=2, placeholder="Enter text here...", label="Input Text"), |
|
gr.Dropdown(choices=["Model 1", "Model 2"], label="Select Model") |
|
], |
|
outputs=gr.Textbox(label="Prediction"), |
|
title="Offensive Language Detection", |
|
description="Enter a text to check if it's offensive or not using the selected model.", |
|
theme="default", |
|
css=".gradio-container { background-color: #f0f0f0; } .output-textbox { font-size: 20px; color: #007BFF; }" |
|
) |
|
|
|
|
|
iface.launch() |
|
|
|