Spaces:
Runtime error
Runtime error
HeshamHaroon
committed on
Commit
•
b9f9278
1
Parent(s):
4c91389
Update app.py
Browse files
app.py
CHANGED
@@ -8,6 +8,7 @@ from transformers import AutoTokenizer
|
|
8 |
gpt_13b_tokenizer = AutoTokenizer.from_pretrained("FreedomIntelligence/AceGPT-13B")
|
9 |
gpt_7b_tokenizer = AutoTokenizer.from_pretrained("FreedomIntelligence/AceGPT-7B")
|
10 |
jais_13b_tokenizer = AutoTokenizer.from_pretrained("inception-mbzuai/jais-13b")
|
|
|
11 |
|
12 |
# List of available tokenizers and a dictionary to load them
|
13 |
tokenizer_options = [
|
@@ -15,7 +16,8 @@ tokenizer_options = [
|
|
15 |
"aranizer_sp32k", "aranizer_sp50k", "aranizer_sp64k", "aranizer_sp86k",
|
16 |
"FreedomIntelligence/AceGPT-13B",
|
17 |
"FreedomIntelligence/AceGPT-7B",
|
18 |
-
"inception-mbzuai/jais-13b"
|
|
|
19 |
]
|
20 |
|
21 |
tokenizers = {
|
@@ -28,12 +30,13 @@ tokenizers = {
|
|
28 |
"aranizer_sp86k": aranizer_sp86k.get_tokenizer,
|
29 |
"FreedomIntelligence/AceGPT-13B": lambda: gpt_13b_tokenizer,
|
30 |
"FreedomIntelligence/AceGPT-7B": lambda: gpt_7b_tokenizer,
|
31 |
-
"inception-mbzuai/jais-13b": lambda: jais_13b_tokenizer
|
|
|
32 |
}
|
33 |
|
34 |
def compare_tokenizers(tokenizer_name, text):
|
35 |
# Handle the transformer tokenizers separately due to API differences
|
36 |
-
if tokenizer_name in ["FreedomIntelligence/AceGPT-13B", "FreedomIntelligence/AceGPT-7B", "inception-mbzuai/jais-13b"]:
|
37 |
tokenizer = tokenizers[tokenizer_name]()
|
38 |
tokens = tokenizer.tokenize(text)
|
39 |
tokens_arabic = [token.encode('utf-8').decode('utf-8') for token in tokens]
|
@@ -48,7 +51,7 @@ def compare_tokenizers(tokenizer_name, text):
|
|
48 |
tokens_arabic = [token.encode('utf-8').decode('utf-8') for token in tokens]
|
49 |
|
50 |
# Prepare the results to be displayed
|
51 |
-
results = [(tokenizer_name, tokens_arabic, encoded_output, decoded_text)]
|
52 |
return results
|
53 |
|
54 |
# Define the Gradio interface components with a dropdown for model selection
|
@@ -73,4 +76,4 @@ iface = Interface(
|
|
73 |
)
|
74 |
|
75 |
# Launching the Gradio app
|
76 |
-
iface.launch()
|
|
|
8 |
gpt_13b_tokenizer = AutoTokenizer.from_pretrained("FreedomIntelligence/AceGPT-13B")
|
9 |
gpt_7b_tokenizer = AutoTokenizer.from_pretrained("FreedomIntelligence/AceGPT-7B")
|
10 |
jais_13b_tokenizer = AutoTokenizer.from_pretrained("inception-mbzuai/jais-13b")
|
11 |
+
arabert_tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv2")
|
12 |
|
13 |
# List of available tokenizers and a dictionary to load them
|
14 |
tokenizer_options = [
|
|
|
16 |
"aranizer_sp32k", "aranizer_sp50k", "aranizer_sp64k", "aranizer_sp86k",
|
17 |
"FreedomIntelligence/AceGPT-13B",
|
18 |
"FreedomIntelligence/AceGPT-7B",
|
19 |
+
"inception-mbzuai/jais-13b",
|
20 |
+
"aubmindlab/bert-base-arabertv2"
|
21 |
]
|
22 |
|
23 |
tokenizers = {
|
|
|
30 |
"aranizer_sp86k": aranizer_sp86k.get_tokenizer,
|
31 |
"FreedomIntelligence/AceGPT-13B": lambda: gpt_13b_tokenizer,
|
32 |
"FreedomIntelligence/AceGPT-7B": lambda: gpt_7b_tokenizer,
|
33 |
+
"inception-mbzuai/jais-13b": lambda: jais_13b_tokenizer,
|
34 |
+
"aubmindlab/bert-base-arabertv2": lambda: arabert_tokenizer
|
35 |
}
|
36 |
|
37 |
def compare_tokenizers(tokenizer_name, text):
|
38 |
# Handle the transformer tokenizers separately due to API differences
|
39 |
+
if tokenizer_name in ["FreedomIntelligence/AceGPT-13B", "FreedomIntelligence/AceGPT-7B", "inception-mbzuai/jais-13b", "aubmindlab/bert-base-arabertv2"]:
|
40 |
tokenizer = tokenizers[tokenizer_name]()
|
41 |
tokens = tokenizer.tokenize(text)
|
42 |
tokens_arabic = [token.encode('utf-8').decode('utf-8') for token in tokens]
|
|
|
51 |
tokens_arabic = [token.encode('utf-8').decode('utf-8') for token in tokens]
|
52 |
|
53 |
# Prepare the results to be displayed
|
54 |
+
results = [(tokenizer_name, tokens_arabic, encoded_output.tolist(), decoded_text)]
|
55 |
return results
|
56 |
|
57 |
# Define the Gradio interface components with a dropdown for model selection
|
|
|
76 |
)
|
77 |
|
78 |
# Launching the Gradio app
|
79 |
+
iface.launch()
|