HeshamHaroon committed on
Commit
b9f9278
1 Parent(s): 4c91389

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -5
app.py CHANGED
@@ -8,6 +8,7 @@ from transformers import AutoTokenizer
8
  gpt_13b_tokenizer = AutoTokenizer.from_pretrained("FreedomIntelligence/AceGPT-13B")
9
  gpt_7b_tokenizer = AutoTokenizer.from_pretrained("FreedomIntelligence/AceGPT-7B")
10
  jais_13b_tokenizer = AutoTokenizer.from_pretrained("inception-mbzuai/jais-13b")
 
11
 
12
  # List of available tokenizers and a dictionary to load them
13
  tokenizer_options = [
@@ -15,7 +16,8 @@ tokenizer_options = [
15
  "aranizer_sp32k", "aranizer_sp50k", "aranizer_sp64k", "aranizer_sp86k",
16
  "FreedomIntelligence/AceGPT-13B",
17
  "FreedomIntelligence/AceGPT-7B",
18
- "inception-mbzuai/jais-13b"
 
19
  ]
20
 
21
  tokenizers = {
@@ -28,12 +30,13 @@ tokenizers = {
28
  "aranizer_sp86k": aranizer_sp86k.get_tokenizer,
29
  "FreedomIntelligence/AceGPT-13B": lambda: gpt_13b_tokenizer,
30
  "FreedomIntelligence/AceGPT-7B": lambda: gpt_7b_tokenizer,
31
- "inception-mbzuai/jais-13b": lambda: jais_13b_tokenizer
 
32
  }
33
 
34
  def compare_tokenizers(tokenizer_name, text):
35
  # Handle the transformer tokenizers separately due to API differences
36
- if tokenizer_name in ["FreedomIntelligence/AceGPT-13B", "FreedomIntelligence/AceGPT-7B", "inception-mbzuai/jais-13b"]:
37
  tokenizer = tokenizers[tokenizer_name]()
38
  tokens = tokenizer.tokenize(text)
39
  tokens_arabic = [token.encode('utf-8').decode('utf-8') for token in tokens]
@@ -48,7 +51,7 @@ def compare_tokenizers(tokenizer_name, text):
48
  tokens_arabic = [token.encode('utf-8').decode('utf-8') for token in tokens]
49
 
50
  # Prepare the results to be displayed
51
- results = [(tokenizer_name, tokens_arabic, encoded_output, decoded_text)]
52
  return results
53
 
54
  # Define the Gradio interface components with a dropdown for model selection
@@ -73,4 +76,4 @@ iface = Interface(
73
  )
74
 
75
  # Launching the Gradio app
76
- iface.launch()
 
8
  gpt_13b_tokenizer = AutoTokenizer.from_pretrained("FreedomIntelligence/AceGPT-13B")
9
  gpt_7b_tokenizer = AutoTokenizer.from_pretrained("FreedomIntelligence/AceGPT-7B")
10
  jais_13b_tokenizer = AutoTokenizer.from_pretrained("inception-mbzuai/jais-13b")
11
+ arabert_tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv2")
12
 
13
  # List of available tokenizers and a dictionary to load them
14
  tokenizer_options = [
 
16
  "aranizer_sp32k", "aranizer_sp50k", "aranizer_sp64k", "aranizer_sp86k",
17
  "FreedomIntelligence/AceGPT-13B",
18
  "FreedomIntelligence/AceGPT-7B",
19
+ "inception-mbzuai/jais-13b",
20
+ "aubmindlab/bert-base-arabertv2"
21
  ]
22
 
23
  tokenizers = {
 
30
  "aranizer_sp86k": aranizer_sp86k.get_tokenizer,
31
  "FreedomIntelligence/AceGPT-13B": lambda: gpt_13b_tokenizer,
32
  "FreedomIntelligence/AceGPT-7B": lambda: gpt_7b_tokenizer,
33
+ "inception-mbzuai/jais-13b": lambda: jais_13b_tokenizer,
34
+ "aubmindlab/bert-base-arabertv2": lambda: arabert_tokenizer
35
  }
36
 
37
  def compare_tokenizers(tokenizer_name, text):
38
  # Handle the transformer tokenizers separately due to API differences
39
+ if tokenizer_name in ["FreedomIntelligence/AceGPT-13B", "FreedomIntelligence/AceGPT-7B", "inception-mbzuai/jais-13b", "aubmindlab/bert-base-arabertv2"]:
40
  tokenizer = tokenizers[tokenizer_name]()
41
  tokens = tokenizer.tokenize(text)
42
  tokens_arabic = [token.encode('utf-8').decode('utf-8') for token in tokens]
 
51
  tokens_arabic = [token.encode('utf-8').decode('utf-8') for token in tokens]
52
 
53
  # Prepare the results to be displayed
54
+ results = [(tokenizer_name, tokens_arabic, encoded_output.tolist(), decoded_text)]
55
  return results
56
 
57
  # Define the Gradio interface components with a dropdown for model selection
 
76
  )
77
 
78
  # Launching the Gradio app
79
+ iface.launch()