BK-AI committed
Commit 4829b64 • 1 Parent(s): 3a52501

refactor app, prepare for second prediction

.gitignore ADDED
@@ -0,0 +1,5 @@
+ venv/
+ llama*
+ .DS_Store
+ .vscode/
+ old/
app.py CHANGED
@@ -3,84 +3,147 @@
 
  import gradio as gr
  import numpy as np
- import requests
- from transformers import AutoModelForSequenceClassification, AutoTokenizer, TextClassificationPipeline, pipeline
  from langdetect import detect
  from matplotlib import pyplot as plt
  import imageio
 
- # Load the model
- model = AutoModelForSequenceClassification.from_pretrained("saved_model")
- tokenizer = AutoTokenizer.from_pretrained("saved_model")
- pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer)
 
- # Function called by the UI
- def attribution(text):
-
-     # Clean the plot
-     plt.clf()
-
-     # Detect the language
-     language = detect(text)
-
-     # Translate the input in german if necessary
-     if language == 'fr':
-         translator = pipeline("translation", model="Helsinki-NLP/opus-mt-fr-de")
-         translatedText = translator(text[0:1000])
-         text = translatedText[0]["translation_text"]
-     elif language != 'de':
-         return "The language is not recognized, it must be either in German or in French.", None
-
-     # Set the bars of the bar chart
-     bars = ""
-     if language == 'fr':
-         bars = ("DDPS", "DFI", "AS-MPC", "DFJP", "DEFR", "DETEC", "DFAE", "Parl", "ChF", "DFF", "AF", "TF")
-     else:
-         bars = ("VBS", "EDI", "AB-BA", "EJPD", "WBF", "UVEK", "EDA", "Parl", "BK", "EFD", "BV", "BGer")
-
-     # Make the prediction with the 1000 first characters
-     results = pipe(text[0:1000], return_all_scores=True)
-     rates = [row["score"] for row in results[0]]
-
-     # Bar chart
-     y_pos = np.arange(len(bars))
      plt.barh(y_pos, rates)
-     plt.yticks(y_pos, bars)
-
-     # Set the output text
-     name = ""
      maxRate = np.max(rates)
      maxIndex = np.argmax(rates)
 
      # ML model not sure if highest probability < 60%
-     if maxRate < 0.6:
-         # de / fr
-         if language == 'de':
-             name = "Das ML-Modell ist nicht sicher. Das Departement könnte sein : \n\n"
-         else:
-             name = "Le modèle ML n'est pas sûr. Le département pourrait être : \n\n"
-         i = 0
          # Show each department that has a probability > 10%
          while i == 0:
              if rates[maxIndex] >= 0.1:
-                 name = name + "\t" + str(rates[maxIndex])[2:4] + "%" + "\t\t\t\t\t" + bars[maxIndex] + "\n"
                  rates[maxIndex] = 0
                  maxIndex = np.argmax(rates)
              else:
                  i = 1
      # ML model pretty sure, show only one department
      else:
-         name = str(maxRate)[2:4] + "%" + "\t\t\t\t\t\t" + bars[maxIndex]
-
-     # Save the bar chart as png and load it (enables better display)
-     plt.savefig('rates.png')
-     im = imageio.imread('rates.png')
 
-     return name, im
-
 
  # display the UI
- interface = gr.Interface(fn=attribution,
-     inputs=[gr.inputs.Textbox(lines=20, placeholder="Geben Sie bitte den Titel und den Sumbmitted Text des Vorstoss ein.\nVeuillez entrer le titre et le Submitted Text de la requête.")],
-     outputs=['text', 'image'])
- interface.launch()
 
  import gradio as gr
  import numpy as np
+ from transformers import (
+     AutoModelForSequenceClassification,
+     AutoTokenizer,
+     TextClassificationPipeline,
+     pipeline,
+ )
  from langdetect import detect
  from matplotlib import pyplot as plt
  import imageio
 
+ # move constants into extra file
+ ML_MODEL_SURE = 0.6
+ UNKNOWN_LANG_TEXT = (
+     "The language is not recognized, it must be either in German or in French."
+ )
+ PLACEHOLDER_TEXT = "Geben Sie bitte den Titel und den Sumbmitted Text des Vorstoss ein.\nVeuillez entrer le titre et le Submitted Text de la requête."
 
+ UNSURE_DE_TEXT = "Das ML-Modell ist nicht sicher. Das Departement könnte sein : \n\n"
+ UNSURE_FR_TEXT = "Le modèle ML n'est pas sûr. Le département pourrait être : \n\n"
+ BARS_DEP_FR = (
+     "DDPS",
+     "DFI",
+     "AS-MPC",
+     "DFJP",
+     "DEFR",
+     "DETEC",
+     "DFAE",
+     "Parl",
+     "ChF",
+     "DFF",
+     "AF",
+     "TF",
+ )
+ BARS_DEP_DE = (
+     "VBS",
+     "EDI",
+     "AB-BA",
+     "EJPD",
+     "WBF",
+     "UVEK",
+     "EDA",
+     "Parl",
+     "BK",
+     "EFD",
+     "BV",
+     "BGer",
+ )
+
+
+ def load_model(modelFolder):
+     """Loads model from model_folder & creates a text classification pipeline."""
+     model = AutoModelForSequenceClassification.from_pretrained(modelFolder)
+     tokenizer = AutoTokenizer.from_pretrained(modelFolder)
+     pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer)
+     return pipe
+
+
+ def translate_to_de(inputText):
+     """Translates french user input to German for the model to reach better classification."""
+     translator = pipeline("translation", model="Helsinki-NLP/opus-mt-fr-de")
+     translatedText = translator(inputText[0:1000])
+     text = translatedText[0]["translation_text"]
+     return text
+
+
+ def create_bar_plot(rates, language):
+     barnames = BARS_DEP_FR if language == "fr" else BARS_DEP_DE
+
+     y_pos = np.arange(len(barnames))
      plt.barh(y_pos, rates)
+     plt.yticks(y_pos, barnames)
+
+     # Save the bar chart as png and load it (enables better display)
+     plt.savefig("rates.png")
+     im = imageio.v2.imread("rates.png")
+
+     return im, barnames
+
+
+ def show_chosen_category(barnames, rates, language):
+     """Creates the output text
+     - adds disclaimer if ML model is not sure
+     - when unsure, adds all categories with prob. > 10% to output"""
      maxRate = np.max(rates)
      maxIndex = np.argmax(rates)
 
+     distance = "\t\t\t\t\t"
+
      # ML model not sure if highest probability < 60%
+     if maxRate < ML_MODEL_SURE:
+         name = UNSURE_FR_TEXT if language == "fr" else UNSURE_DE_TEXT
+
          # Show each department that has a probability > 10%
+         i = 0
          while i == 0:
              if rates[maxIndex] >= 0.1:
+                 chosenScore = str(rates[maxIndex])[2:4]
+                 chosenCat = barnames[maxIndex]
+                 name = name + "\t" + chosenScore + "%" + distance + chosenCat + "\n"
                  rates[maxIndex] = 0
                  maxIndex = np.argmax(rates)
              else:
                  i = 1
+
      # ML model pretty sure, show only one department
      else:
+         name = str(maxRate)[2:4] + "%" + distance + barnames[maxIndex]
+
+     return name
+
+
+ pipeDep = load_model("saved_model_dep")
+ # pipeOffice = load_model("saved_model_office")
+
+
+ # Function called by the UI
+ def attribution(inputText):
+     plt.clf()
+     language = detect(inputText)
+
+     # Translate the input to german if necessary
+     if language == "fr":
+         inputText = translate_to_de(inputText)
+     elif language != "de":
+         return UNKNOWN_LANG_TEXT, None
+
+     # Make the prediction with the 1000 first characters
+     prediction = pipeDep(inputText[0:1000], return_all_scores=True)
+     rates = [row["score"] for row in prediction[0]]
+
+     # Create barplot & output text
+     im, barnames = create_bar_plot(rates, language)
+     chosenCategoryText = show_chosen_category(barnames, rates, language)
+
+     return chosenCategoryText, im
 
 
  # display the UI
+ interface = gr.Interface(
+     fn=attribution,
+     inputs=[gr.components.Textbox(lines=20, placeholder=PLACEHOLDER_TEXT)],
+     outputs=["text", "image"],
+ )
+ interface.launch()
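
Note on the "second prediction" the commit message prepares for: the new app.py still loads only the department model and keeps `pipeOffice = load_model("saved_model_office")` commented out. The sketch below is purely illustrative and not part of this commit; the "saved_model_office" folder, the function name `attribution_two_level`, and the way the second result is appended to the output are all assumptions. It would live at the bottom of the refactored app.py above and reuse its helpers.

    # Hypothetical extension of the refactored app.py above; not part of this commit.
    # Assumes an office-level classifier saved under "saved_model_office" with the
    # same folder layout as "saved_model_dep".
    pipeOffice = load_model("saved_model_office")


    def attribution_two_level(inputText):
        """Runs the department prediction as above, plus a second office-level prediction."""
        plt.clf()
        language = detect(inputText)
        if language == "fr":
            inputText = translate_to_de(inputText)
        elif language != "de":
            return UNKNOWN_LANG_TEXT, None

        # Same 1000-character window for both pipelines
        depScores = [row["score"] for row in pipeDep(inputText[0:1000], return_all_scores=True)[0]]
        officeScores = [row["score"] for row in pipeOffice(inputText[0:1000], return_all_scores=True)[0]]

        im, barnames = create_bar_plot(depScores, language)
        text = show_chosen_category(barnames, depScores, language)

        # How the office-level result should be labelled and displayed is still open;
        # here the raw scores are simply appended to the output text.
        text = text + "\n\n" + str(np.round(officeScores, 2))
        return text, im

Wiring this in would then only mean pointing the Gradio interface's fn at the new function once the second model exists.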
requirements.txt CHANGED
@@ -4,4 +4,8 @@ langdetect
  matplotlib
  imageio
  torch
- sentencepiece

  matplotlib
  imageio
  torch
+ sentencepiece
+
+ gradio
+ langdetect
+ imageio
saved_model/config.json DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:f1dd5122dedc8fdf6eb1ec32b25f3246f8c3c64432abfd4d9bad4b626f378fc4
- size 1255
saved_model/special_tokens_map.json DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:303df45a03609e4ead04bc3dc1536d0ab19b5358db685b6f3da123d05ec200e3
- size 112
saved_model/tokenizer.json DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:5d6f6affc6b91020cabef56fe9289907e34a89e7f3463a93250c0d94cc61000d
- size 726371
saved_model/tokenizer_config.json DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:5ed472c8edcb18869d09d7bc852465911b105dd301fda14b4283b01577a5ebd7
- size 327
saved_model/vocab.txt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:982f8396ec746db0ed414dcc4789398ab6b365663cada50f776afb905dacbb61
- size 254729
saved_model_dep/config.json ADDED
@@ -0,0 +1,54 @@
+ {
+   "_name_or_path": "bert-base-german-cased",
+   "architectures": [
+     "BertForSequenceClassification"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "classifier_dropout": null,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "id2label": {
+     "0": "LABEL_0",
+     "1": "LABEL_1",
+     "2": "LABEL_2",
+     "3": "LABEL_3",
+     "4": "LABEL_4",
+     "5": "LABEL_5",
+     "6": "LABEL_6",
+     "7": "LABEL_7",
+     "8": "LABEL_8",
+     "9": "LABEL_9",
+     "10": "LABEL_10",
+     "11": "LABEL_11"
+   },
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "label2id": {
+     "LABEL_0": 0,
+     "LABEL_1": 1,
+     "LABEL_10": 10,
+     "LABEL_11": 11,
+     "LABEL_2": 2,
+     "LABEL_3": 3,
+     "LABEL_4": 4,
+     "LABEL_5": 5,
+     "LABEL_6": 6,
+     "LABEL_7": 7,
+     "LABEL_8": 8,
+     "LABEL_9": 9
+   },
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 512,
+   "model_type": "bert",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 0,
+   "position_embedding_type": "absolute",
+   "problem_type": "single_label_classification",
+   "torch_dtype": "float32",
+   "transformers_version": "4.19.2",
+   "type_vocab_size": 2,
+   "use_cache": true,
+   "vocab_size": 30000
+ }
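
This config only declares the generic ids LABEL_0 through LABEL_11; app.py above resolves a prediction to a department name purely by position in BARS_DEP_DE / BARS_DEP_FR. A small illustrative snippet of that positional mapping (that index i corresponds to LABEL_i and to the i-th tuple entry is an assumption inferred from the app code, not something stated in this file):

    # Illustrative only: pair the generic ids from this config with the department
    # abbreviations used in app.py, assuming scores come back in label-id order
    # (index 0 -> LABEL_0 -> "VBS", ..., index 11 -> LABEL_11 -> "BGer").
    BARS_DEP_DE = ("VBS", "EDI", "AB-BA", "EJPD", "WBF", "UVEK",
                   "EDA", "Parl", "BK", "EFD", "BV", "BGer")

    id2department = {f"LABEL_{i}": name for i, name in enumerate(BARS_DEP_DE)}
    print(id2department["LABEL_0"])  # VBS, under the ordering assumption above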
{saved_model → saved_model_dep}/pytorch_model.bin RENAMED
File without changes
saved_model_dep/special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
saved_model_dep/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
saved_model_dep/tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"do_lower_case": false, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "bert-base-german-cased", "tokenizer_class": "BertTokenizer"}
{saved_model → saved_model_dep}/training_args.bin RENAMED
File without changes
saved_model_dep/vocab.txt ADDED
The diff for this file is too large to render. See raw diff