Victoria Reis committed on
Commit 0b0b452
1 Parent(s): db3c5f3

feature:app v1

Files changed (3)
  1. README.md +4 -4
  2. app.py +168 -0
  3. requirements.txt +5 -0
README.md CHANGED
@@ -1,12 +1,12 @@
  ---
  title: Portuguese Hate Speech Identifier
- emoji: 🦀
- colorFrom: red
- colorTo: purple
+ emoji: 🤗🤬
+ colorFrom: yellow
+ colorTo: blue
  sdk: gradio
  sdk_version: 4.7.1
  app_file: app.py
- pinned: false
+ pinned: true
  license: mit
  ---

app.py ADDED
@@ -0,0 +1,168 @@
+ import gradio as gr
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
+ import torch
+ from collections import Counter
+ from scipy.special import softmax
+ import plotly.express as px
+
+ # Article string shown below the interface
+ article_string = "Author: <a href=\"https://huggingface.co/FpOliveira\">Felipe Ramos de Oliveira</a>. Read more about our <a href=\"https://github.com/Silly-Machine/TuPi-Portuguese-Hate-Speech-Dataset\">The Portuguese hate speech dataset (TuPI)</a>."
+
+ # App title
+ app_title = "Portuguese hate speech identifier (Multiclass) - Identificador de discurso de ódio em português (Multiclasse)"
+
+ # App description
+ app_description = """
+ EN: This application employs multiple natural language models to identify different types of hate speech in Portuguese. You have the option to enter your own phrases by filling in the "Text" field or choosing one of the examples provided below.
+ \nPT: Este aplicativo emprega múltiplos modelos de linguagem natural para identificar diferentes tipos de discurso de ódio em português. Você tem a opção de inserir suas próprias frases preenchendo o campo "Text" ou escolhendo um dos exemplos abaixo.
+ """
+
+
+ # App examples
+ app_examples = [
+     ["bom dia flor do dia!!!"],
+     ["o ódio é muito grande no coração da ex-deputada federal joise hasselmann contra a família bolsonaro"],
+     ["mano deus me livre q nojo da porra!🤮🤮🤮🤮🤮"],
+     ["obrigada princesa, porra, tô muito feliz snrsss 🤩🤩🤩❤️"],
+     ["mds mas o viado vir responder meus status falando q a taylor foi racista foi o auge 😂😂"],
+     ["Pra ser minha inimiga no mínimo tem que ter um rostinho bonito e delicado, não se considere minha rival com essa sua cara de cavalo não, feia, cara de traveco, cabeçuda, queixo quadrado 🤣🤣"]
+ ]
+
+ # Output textbox component description
+ output_textbox_component_description = """
+ EN: This box will display hate speech results based on the average score of multiple models.
+ PT: Esta caixa exibirá resultados da classificação de discurso de ódio com base na pontuação média de vários modelos.
+ """
+
+ # Output JSON component description
+ output_json_component_description = {
+     "breakdown": """
+     This box presents a detailed breakdown of the evaluation for each model.
+     """,
+     "detalhamento": """
+     (Esta caixa apresenta um detalhamento da avaliação para cada modelo.)
+     """
+ }
+
+ # Hate speech categories
+ hate_speech_categories = {
+     0: "ageism",
+     1: "aporophobia",
+     2: "body shame",
+     3: "capacitism",
+     4: "lgbtphobia",
+     5: "political",
+     6: "racism",
+     7: "religious intolerance",
+     8: "misogyny",
+     9: "xenophobia",
+     10: "other",
+     11: "not hate"
+ }
+
+ # Model list
+ model_list = [
+     "FpOliveira/tupi-bert-large-portuguese-cased-multiclass-multilabel",
+     "FpOliveira/tupi-bert-base-portuguese-cased-multiclass-multilabel",
+     "FpOliveira/tupi-gpt2-small-multiclass-multilabel",
+ ]
+
+ # User-friendly names for models
+ user_friendly_name = {
+     "FpOliveira/tupi-bert-large-portuguese-cased-multiclass-multilabel": "BERTimbau large (TuPi)",
+     "FpOliveira/tupi-bert-base-portuguese-cased-multiclass-multilabel": "BERTimbau base (TuPi)",
+     "FpOliveira/tupi-gpt2-small-multiclass-multilabel": "GPT2 small (TuPi)"
+ }
+
+ # Reverse mapping from user-friendly name back to the model ID
+ reverse_user_friendly_name = {v: k for k, v in user_friendly_name.items()}
+
+ # List of user-friendly model names
+ user_friendly_name_list = list(user_friendly_name.values())
+
+ # Loaded tokenizers and models
+ model_array = []
+
+ # Populate model array
+ for model_name in model_list:
+     row = {}
+     row["name"] = model_name
+     row["tokenizer"] = AutoTokenizer.from_pretrained(model_name)
+     row["model"] = AutoModelForSequenceClassification.from_pretrained(model_name)
+     model_array.append(row)
+
+
+ # Helper to find the most frequent element in an array (currently unused)
+ def most_frequent(array):
+     occurrence_count = Counter(array)
+     return occurrence_count.most_common(1)[0][0]
+
+
+ # Prediction function
+ def predict(s1, chosen_model):
+     # Clear previous figure instance
+     fig = None
+
+     if not chosen_model:
+         chosen_model = user_friendly_name_list[0]
+     scores = {}
+     full_chosen_model_name = reverse_user_friendly_name[chosen_model]
+     for row in model_array:
+         name = row["name"]
+         if name != full_chosen_model_name:
+             continue
+         else:
+             tokenizer = row["tokenizer"]
+             model = row["model"]
+             model_input = tokenizer([s1], padding=True, return_tensors="pt")
+             with torch.no_grad():
+                 output = model(**model_input)
+                 logits = output[0][0].detach().numpy()
+                 logits = softmax(logits).tolist()
+                 break
+
+     # Get the indices of all probabilities
+     all_indices = range(len(logits))
+
+     # Get all indices sorted by descending probability
+     top_indices = sorted(range(len(logits)), key=lambda i: logits[i], reverse=True)
+
+     # Filter out indices without a matching category
+     valid_indices = [index for index in top_indices if index < len(hate_speech_categories)]
+
+     # Get the categories and probabilities for all classes
+     all_categories = [hate_speech_categories[index] for index in valid_indices]
+     all_probabilities = [logits[index] for index in valid_indices]
+
+     # Create a bar plot using Plotly (built here but not wired into the Gradio outputs)
+     fig = px.bar(x=all_categories, y=all_probabilities, labels={'x': 'Categories', 'y': 'Probabilities'},
+                  title="Classes Predicted Probabilities", text=all_probabilities)
+     fig.update_traces(texttemplate='%{text:.4f}', textposition='outside')
+     fig.show()
+
+     # # Save the figure as HTML
+     # html_figure = fig.to_html()
+
+     # Get the categories and probabilities for the top two
+     top_category = [hate_speech_categories[index] for index in top_indices]
+     top_probability = [logits[index] for index in top_indices]
+     result = top_category[0], 1 - top_probability[1]  # top label plus the complement of the runner-up score
+     return result
+
+ # Input components
+ inputs = [
+     gr.Textbox(label="Text", value=app_examples[0][0]),
+     gr.Dropdown(label="Model", choices=user_friendly_name_list, value=user_friendly_name_list[0])
+ ]
+
+ # Output components
+ outputs = [
+     gr.Label(label="Predominant category"),
+     gr.Label(label="Probability"),
+ ]
+
+ # Gradio interface
+ gr.Interface(fn=predict, inputs=inputs, outputs=outputs, title=app_title,
+              description=app_description,
+              examples=app_examples,
+              article=article_string).launch()
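For reference, here is a minimal sketch (not part of this commit) of how one of the TuPi checkpoints loaded in app.py can be queried directly with transformers, mirroring the softmax scoring applied in predict. The model ID, category names, and example sentence are taken from app.py above; the surrounding code is an assumed typical usage pattern, not the app's own API.

import torch
from scipy.special import softmax
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# One of the checkpoints loaded in app.py (assumed reachable on the Hub)
model_id = "FpOliveira/tupi-bert-base-portuguese-cased-multiclass-multilabel"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSequenceClassification.from_pretrained(model_id)

text = "bom dia flor do dia!!!"  # first entry of app_examples
inputs = tokenizer([text], padding=True, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits[0].numpy()

# Same normalization as predict(): softmax over the 12 category logits
probs = softmax(logits)
categories = ["ageism", "aporophobia", "body shame", "capacitism", "lgbtphobia",
              "political", "racism", "religious intolerance", "misogyny",
              "xenophobia", "other", "not hate"]
best = int(probs.argmax())
print(categories[best], round(float(probs[best]), 4))

In the Gradio app itself, predict returns only the top label plus a confidence value derived from the runner-up score, which is what the two Label outputs display.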
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ torch
+ gradio
+ transformers
+ scipy
+ plotly
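Assuming the standard Gradio Space layout declared in the README metadata (sdk: gradio, app_file: app.py), the app should also run locally with "pip install -r requirements.txt" followed by "python app.py". Note that requirements.txt leaves gradio unpinned, while the Space metadata declares sdk_version 4.7.1.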