Spaces (status: Runtime error)

Commit: milestone-3

Files changed:
- app.py +22 -21
- milestone_3.py → train.py +5 -6
app.py
CHANGED
@@ -1,30 +1,31 @@
 import streamlit as st
-from transformers import AutoTokenizer,
+from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
 import numpy as np
 import torch
+import pandas as pd
+import torch.nn.functional as F

-(several removed lines here were lost in the page extraction)
-if analyze_button:
-    if selected_model=="Model 1":
-        tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-emotion")
-        model = RobertaForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-emotion")
-    else:
-        tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
-        model = RobertaForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
-    inputs = tokenizer(text, return_tensors="pt")
-    with torch.no_grad():
-        logits = model(**inputs).logits
-    prediction_id = logits.argmax().item()
-    results = model.config.id2label[prediction_id]
-    st.write(results)
+model_name = "unitary/toxic-bert"
+
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForSequenceClassification.from_pretrained(model_name)
+
+df = pd.DataFrame(columns=("Tweet", "Toxicity", "Probability"))
+
+sample_tweets = ["Ask Sityush to clean up his behavior than issue me nonsensical warnings...", "be a man and lets discuss it-maybe over the phone?", "Don't look, come or think of comming back! Tosser."]
+
+classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
+results = classifier(sample_tweets)
+
+batch = tokenizer(sample_tweets, padding=True, truncation=True, max_length=512, return_tensors="pt")
+
+# assignment 3
+st.title("CS482 Project Sentiment Analysis")
+
+st.markdown("**:red[unitary/toxic-bert]**")
+
+for i in range(len(sample_tweets)):
+    df.loc[len(df.index)] = [sample_tweets[i], results[i]["label"], results[i]["score"]]
+
+st.table(df)
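A note on the new app.py: `batch` is tokenized but never used, `torch.nn.functional` is imported as `F` without being called, and the pipeline reports only the single top label per tweet. Below is a minimal sketch, not part of the commit, of how the unused `batch` could yield per-label scores; it assumes unitary/toxic-bert is the usual multi-label Jigsaw toxicity model, where each label gets an independent sigmoid probability rather than a softmax.

# Sketch only; reuses model, batch, and sample_tweets from the new app.py.
with torch.no_grad():
    logits = model(**batch).logits        # shape: (num_tweets, num_labels)
probs = torch.sigmoid(logits)             # independent probability per label
for tweet, row in zip(sample_tweets, probs):
    top = row.argmax().item()             # index of the most probable label
    st.write(tweet, model.config.id2label[top], round(row[top].item(), 4))

The pipeline route in the commit works too; this variant just exposes the full label distribution that the single "Probability" column otherwise collapses to one number.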
milestone_3.py → train.py
RENAMED
@@ -1,4 +1,4 @@
-from transformers import
+from transformers import BertTokenizerFast, BertModel, Trainer, TrainingArguments
 import torch
 from torch.utils.data import Dataset
 # from torch.optim import AdamW

@@ -7,7 +7,7 @@ from sklearn.model_selection import train_test_split
 
 
 # assignment 3
-model_name = "
+model_name = "bert-base-uncased"
 
 class ToxicDataset(Dataset):
 

@@ -18,7 +18,6 @@ class ToxicDataset(Dataset):
     def __getitem__(self, idx):
         item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
         item["labels"] = torch.tensor(self.labels[idx])
-        print(item)
         return item
 
     def __len__(self):

@@ -35,7 +34,7 @@ train_texts, val_texts, train_labels, val_labels = train_test_split(toxic_data.t
 
 
 print("Data split. Tokenizing data...")
-tokenizer =
+tokenizer = BertTokenizerFast.from_pretrained(model_name)
 
 train_encodings = tokenizer.batch_encode_plus(train_texts, truncation=True, padding=True, return_tensors='pt')
 val_encodings = tokenizer.batch_encode_plus(val_texts, truncation=True, padding=True, return_tensors='pt')

@@ -59,7 +58,7 @@ training_args = TrainingArguments(
 
 # device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
 
-model =
+model = BertModel.from_pretrained(model_name, num_labels=6)
 
 trainer = Trainer(
     model=model,

@@ -101,7 +100,7 @@ trainer.train()
 
 print("Training complete. Saving model...")
 
-save_directory = "
+save_directory = "./results/model"
 model.save_pretrained(save_directory)
 
 print("Model saved.")
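A caveat on the renamed train.py: `BertModel` is the bare encoder. It has no classification head, the `num_labels` argument only sets a config field the encoder never uses, and its forward() does not accept the `labels` batches that Trainer feeds it, so training would likely fail at the first step. A sketch of the conventional fix follows; the multi-label setup and the six Jigsaw toxicity labels are assumptions on my part, not something the commit states.

from transformers import BertForSequenceClassification

# Assumed fix, not in the commit: attach a classification head.
# problem_type selects BCEWithLogitsLoss, the standard loss for
# multi-label toxicity targets (labels must then be float tensors).
model = BertForSequenceClassification.from_pretrained(
    model_name,                                 # "bert-base-uncased"
    num_labels=6,                               # six toxicity labels (assumed)
    problem_type="multi_label_classification",
)

With that head in place, ToxicDataset.__getitem__ would also need dtype=torch.float on the labels tensor, since BCEWithLogitsLoss expects float targets.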