HugoLaurencon committed on
Commit
2ee1fd2
1 Parent(s): 62f8d3d

first commit

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. .gitignore +1 -0
  3. app.py +77 -0
  4. plots/ar_character_repetition_ratio.png +3 -0
  5. plots/ar_closed_class_word_ratio.png +3 -0
  6. plots/ar_flagged_word_ratio.png +3 -0
  7. plots/ar_number_of_words.png +3 -0
  8. plots/ar_perplexity_score.png +3 -0
  9. plots/ar_special_character_ratio.png +3 -0
  10. plots/ar_word_repetition_ratio.png +3 -0
  11. plots/bn_character_repetition_ratio.png +3 -0
  12. plots/bn_closed_class_word_ratio.png +3 -0
  13. plots/bn_flagged_word_ratio.png +3 -0
  14. plots/bn_number_of_words.png +3 -0
  15. plots/bn_perplexity_score.png +3 -0
  16. plots/bn_special_character_ratio.png +3 -0
  17. plots/bn_word_repetition_ratio.png +3 -0
  18. plots/ca_character_repetition_ratio.png +3 -0
  19. plots/ca_closed_class_word_ratio.png +3 -0
  20. plots/ca_flagged_word_ratio.png +3 -0
  21. plots/ca_number_of_words.png +3 -0
  22. plots/ca_perplexity_score.png +3 -0
  23. plots/ca_special_character_ratio.png +3 -0
  24. plots/ca_word_repetition_ratio.png +3 -0
  25. plots/en_character_repetition_ratio.png +3 -0
  26. plots/en_closed_class_word_ratio.png +3 -0
  27. plots/en_flagged_word_ratio.png +3 -0
  28. plots/en_number_of_words.png +3 -0
  29. plots/en_perplexity_score.png +3 -0
  30. plots/en_special_character_ratio.png +3 -0
  31. plots/en_word_repetition_ratio.png +3 -0
  32. plots/es_character_repetition_ratio.png +3 -0
  33. plots/es_closed_class_word_ratio.png +3 -0
  34. plots/es_flagged_word_ratio.png +3 -0
  35. plots/es_number_of_words.png +3 -0
  36. plots/es_perplexity_score.png +3 -0
  37. plots/es_special_character_ratio.png +3 -0
  38. plots/es_word_repetition_ratio.png +3 -0
  39. plots/eu_character_repetition_ratio.png +3 -0
  40. plots/eu_closed_class_word_ratio.png +3 -0
  41. plots/eu_flagged_word_ratio.png +3 -0
  42. plots/eu_number_of_words.png +3 -0
  43. plots/eu_perplexity_score.png +3 -0
  44. plots/eu_special_character_ratio.png +3 -0
  45. plots/eu_word_repetition_ratio.png +3 -0
  46. plots/fr_character_repetition_ratio.png +3 -0
  47. plots/fr_closed_class_word_ratio.png +3 -0
  48. plots/fr_flagged_word_ratio.png +3 -0
  49. plots/fr_number_of_words.png +3 -0
  50. plots/fr_perplexity_score.png +3 -0
.gitattributes CHANGED
@@ -25,3 +25,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
25
  *.zip filter=lfs diff=lfs merge=lfs -text
26
  *.zstandard filter=lfs diff=lfs merge=lfs -text
27
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
25
  *.zip filter=lfs diff=lfs merge=lfs -text
26
  *.zstandard filter=lfs diff=lfs merge=lfs -text
27
  *tfevents* filter=lfs diff=lfs merge=lfs -text
28
+ *.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ *DS_Store
app.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
+
4
# Directory (relative to the working directory) holding the plot images.
PATH_PLOTS = "./plots"

# Display name shown in the sidebar -> short language code used as the
# prefix of the plot file name.
LANGUAGES = {
    "Arabic": "ar",
    "Basque": "eu",
    "Bengali": "bn",
    "Catalan": "ca",
    "Chinese": "zh",
    "English": "en",
    "French": "fr",
    "Hindi": "hi",
    "Indonesian": "id",
    "Portuguese": "pt",
    "Spanish": "es",
    "Urdu": "ur",
    "Vietnamese": "vi",
}

# Human-readable filter names offered in the sidebar; spaces are turned
# into underscores to form the plot file name.
FILTERS = [
    "number of words",
    "character repetition ratio",
    "word repetition ratio",
    "special character ratio",
    "closed class word ratio",
    "flagged word ratio",
    "perplexity score",
]
31
+
32
+
33
class Visualization:
    """Streamlit page that displays one filter-distribution plot.

    The user picks a language and a filter in the sidebar; the matching
    PNG under ``PATH_PLOTS`` is then shown in the page body.
    """

    # Language pre-selected in the sidebar. It is looked up by name rather
    # than by position so that adding or reordering entries in ``LANGUAGES``
    # cannot silently change the default.
    DEFAULT_LANGUAGE = "English"

    def __init__(self):
        """Create the page object.

        ``chosen_language`` and ``chosen_filter`` are set later by
        ``choose_language`` and ``choose_filter``.
        """

    def set_title(self):
        """Write the page title."""
        st.title("Visualization of the distributions of the filter values for the BigScience Corpus")

    def choose_language(self):
        """Ask for a language in the sidebar and store its code.

        Sets ``self.chosen_language`` to the ``LANGUAGES`` value of the
        selected display name.
        """
        language_names = list(LANGUAGES)
        # Fall back to the first entry if the default name is ever removed.
        if self.DEFAULT_LANGUAGE in language_names:
            default_index = language_names.index(self.DEFAULT_LANGUAGE)
        else:
            default_index = 0
        chosen_language = st.sidebar.selectbox(
            "Language",
            options=language_names,
            index=default_index,
        )
        self.chosen_language = LANGUAGES[chosen_language]

    def choose_filter(self):
        """Ask for a filter in the sidebar and store its file-name form.

        Sets ``self.chosen_filter`` to the selected name with spaces
        replaced by underscores.
        """
        chosen_filter = st.sidebar.selectbox(
            "Filter on the",
            options=FILTERS,
            index=0,
        )
        self.chosen_filter = chosen_filter.replace(" ", "_")

    def display_plot(self):
        """Show the plot for the current language and filter.

        Reads ``self.chosen_language`` and ``self.chosen_filter``, so both
        ``choose_*`` methods must have run first.
        """
        path_image = f"{PATH_PLOTS}/{self.chosen_language}_{self.chosen_filter}.png"

        # Three columns with weights 1/6/1: the image goes in the wide
        # middle one, the outer two are left empty.
        col1, col2, col3 = st.columns([1, 6, 1])
        with col1:
            st.write("")
        with col2:
            st.image(path_image)
        with col3:
            st.write("")

    def visualization(self):
        """Render the whole page: title, both selectors, then the plot."""
        self.set_title()
        self.choose_language()
        self.choose_filter()
        self.display_plot()
72
+
73
+
74
if __name__ == "__main__":
    # Configure the page layout first, then build and render the page.
    st.set_page_config(layout="wide")
    Visualization().visualization()
plots/ar_character_repetition_ratio.png ADDED

Git LFS Details

  • SHA256: baf81a5a0cc829dbe1e2fad3742a576d907945e38612790132e2f84d5daafb34
  • Pointer size: 131 Bytes
  • Size of remote file: 174 kB
plots/ar_closed_class_word_ratio.png ADDED

Git LFS Details

  • SHA256: a354e4c26a61f69f7a547ba2e73882e7f3334c48c3cc38eafdf66d74aa2ce060
  • Pointer size: 131 Bytes
  • Size of remote file: 169 kB
plots/ar_flagged_word_ratio.png ADDED

Git LFS Details

  • SHA256: 904de90ab7b5411433642513eb24455d051b498778fcc1fdd842780b59b0cd74
  • Pointer size: 131 Bytes
  • Size of remote file: 136 kB
plots/ar_number_of_words.png ADDED

Git LFS Details

  • SHA256: 06fe8e274916534a3169a4c014d33fbae6506d4873016629ecf383581f67ac36
  • Pointer size: 131 Bytes
  • Size of remote file: 158 kB
plots/ar_perplexity_score.png ADDED

Git LFS Details

  • SHA256: 6bfaf281edad1587ced695535f957a68b0ada096b4cc89eb45ed0afe03df731e
  • Pointer size: 131 Bytes
  • Size of remote file: 179 kB
plots/ar_special_character_ratio.png ADDED

Git LFS Details

  • SHA256: 1be6fe4bb706e4ef2755384c9b004f573c09407b5ac5f4eeec2126fcfefdeb16
  • Pointer size: 131 Bytes
  • Size of remote file: 156 kB
plots/ar_word_repetition_ratio.png ADDED

Git LFS Details

  • SHA256: 0777d3e8cf7a099031bc4ba62106ca3e9ef7b77860b11c23281739e006ff11f3
  • Pointer size: 131 Bytes
  • Size of remote file: 124 kB
plots/bn_character_repetition_ratio.png ADDED

Git LFS Details

  • SHA256: a2c1ef996fa3dfdd18f72ae6d8a6af5a2f310576b35f066cbaec152e22e4b8ef
  • Pointer size: 131 Bytes
  • Size of remote file: 145 kB
plots/bn_closed_class_word_ratio.png ADDED

Git LFS Details

  • SHA256: 25d29968260309af1bbdb3321a4dabe1cbef8feafd3eebfd999cca6219867b4c
  • Pointer size: 131 Bytes
  • Size of remote file: 124 kB
plots/bn_flagged_word_ratio.png ADDED

Git LFS Details

  • SHA256: 32680c70a68f6894db37abfd93e61f6ef5ec45478197baf8993f81077eaaa1b7
  • Pointer size: 130 Bytes
  • Size of remote file: 93.4 kB
plots/bn_number_of_words.png ADDED

Git LFS Details

  • SHA256: 89dc8ae8a078c06daf5d90c5a3399a93f8d6dc6f0f1e741de89d228ddd42930b
  • Pointer size: 131 Bytes
  • Size of remote file: 121 kB
plots/bn_perplexity_score.png ADDED

Git LFS Details

  • SHA256: da594778413eb7fd1ff8f792aaf624a2269f5b6a993b89cff8b93be79d4dcbcc
  • Pointer size: 131 Bytes
  • Size of remote file: 149 kB
plots/bn_special_character_ratio.png ADDED

Git LFS Details

  • SHA256: 4eb4f5ba2090f0ac28ffc6b44dd16caa4f8c1d038678e53df6c795ff438eb127
  • Pointer size: 131 Bytes
  • Size of remote file: 127 kB
plots/bn_word_repetition_ratio.png ADDED

Git LFS Details

  • SHA256: 039f065d3bfb23288f19f5fdf0adffd2e708e6f02da0a8d29252cc8ff8b227a5
  • Pointer size: 131 Bytes
  • Size of remote file: 108 kB
plots/ca_character_repetition_ratio.png ADDED

Git LFS Details

  • SHA256: 8812dd8abb9a6a5e7f3a43942ef57275cba538cd33284afb29a6472428fa3c63
  • Pointer size: 131 Bytes
  • Size of remote file: 143 kB
plots/ca_closed_class_word_ratio.png ADDED

Git LFS Details

  • SHA256: b2eb8d0bf4cbcfdeb4b8a7339a70958d0a3ef6b9a29713ea8757f339a7f67ce6
  • Pointer size: 131 Bytes
  • Size of remote file: 130 kB
plots/ca_flagged_word_ratio.png ADDED

Git LFS Details

  • SHA256: 3eace8e80e653296e1fc5db380a43424b30a9e2f82d7f26c362b8f7735e0b677
  • Pointer size: 131 Bytes
  • Size of remote file: 116 kB
plots/ca_number_of_words.png ADDED

Git LFS Details

  • SHA256: 6c598b50d5a2a471830f26c780035848ee52fc5401c92e917507bfeeb2535ce2
  • Pointer size: 131 Bytes
  • Size of remote file: 132 kB
plots/ca_perplexity_score.png ADDED

Git LFS Details

  • SHA256: cf0530f78f3c7bf0001b5ca99bf7a38786b807916ab9659a36f638a01d01314d
  • Pointer size: 131 Bytes
  • Size of remote file: 158 kB
plots/ca_special_character_ratio.png ADDED

Git LFS Details

  • SHA256: 649605a63565597f49af5e046c3220ed2f77e1417d4e5019d95bc64956add00c
  • Pointer size: 131 Bytes
  • Size of remote file: 126 kB
plots/ca_word_repetition_ratio.png ADDED

Git LFS Details

  • SHA256: baddc170825ec53dd34d6f6b33dd475b640629311fa4bed7df188552c3549448
  • Pointer size: 131 Bytes
  • Size of remote file: 106 kB
plots/en_character_repetition_ratio.png ADDED

Git LFS Details

  • SHA256: 17cf5a75d99d8cf415e103ddfd499220418e5643fd9181e502ee7e9a494c0dec
  • Pointer size: 131 Bytes
  • Size of remote file: 164 kB
plots/en_closed_class_word_ratio.png ADDED

Git LFS Details

  • SHA256: b5c5f0b06125a0d0078c66a7beb5e49df17244244b646dbc3f5c1f533803492f
  • Pointer size: 131 Bytes
  • Size of remote file: 171 kB
plots/en_flagged_word_ratio.png ADDED

Git LFS Details

  • SHA256: 16d318728e392dd4bbfcce24eb58b6adbc7970f2081cdff23c7243085c96e272
  • Pointer size: 131 Bytes
  • Size of remote file: 124 kB
plots/en_number_of_words.png ADDED

Git LFS Details

  • SHA256: 48f4cb273978e5ca2604be4900b4fd741a6909eaa4361cafcf603feb0688748e
  • Pointer size: 131 Bytes
  • Size of remote file: 138 kB
plots/en_perplexity_score.png ADDED

Git LFS Details

  • SHA256: d560e61df00d1a13bd7841fcf94b8544f9976fbc5d4fd12a1f868d7287ebc402
  • Pointer size: 131 Bytes
  • Size of remote file: 168 kB
plots/en_special_character_ratio.png ADDED

Git LFS Details

  • SHA256: 2fa5ca7109c84a26d7667ac7d22c2bcf08d426c86d53243cf70ed3871df7de73
  • Pointer size: 131 Bytes
  • Size of remote file: 144 kB
plots/en_word_repetition_ratio.png ADDED

Git LFS Details

  • SHA256: 6b9a565010dc67a9655c8d5c78b75fc14537898eea5b41a96f977ec81a7eee18
  • Pointer size: 131 Bytes
  • Size of remote file: 121 kB
plots/es_character_repetition_ratio.png ADDED

Git LFS Details

  • SHA256: a1b13f3d5c9f5ee4fcb451acfe10445e8be9804c25244332e7471c4e56e80639
  • Pointer size: 131 Bytes
  • Size of remote file: 167 kB
plots/es_closed_class_word_ratio.png ADDED

Git LFS Details

  • SHA256: b0f182f4ea9c48b6449c93c80b5efe50c7d12db25edbafb84bae0d8e46bbee5f
  • Pointer size: 131 Bytes
  • Size of remote file: 161 kB
plots/es_flagged_word_ratio.png ADDED

Git LFS Details

  • SHA256: 5729f9d8968cec508756a523e875fea51d86905573552f8c4bdebdcfc9f5de9b
  • Pointer size: 131 Bytes
  • Size of remote file: 113 kB
plots/es_number_of_words.png ADDED

Git LFS Details

  • SHA256: 060c08dd8b6791ebdba01dba035039ecdc5f278f52106aeab7730beb7ae7f818
  • Pointer size: 131 Bytes
  • Size of remote file: 129 kB
plots/es_perplexity_score.png ADDED

Git LFS Details

  • SHA256: f4f79d8e9248b211527c93f82c4e63b18804d8d765e6ac9314a65b10d9bf310c
  • Pointer size: 131 Bytes
  • Size of remote file: 186 kB
plots/es_special_character_ratio.png ADDED

Git LFS Details

  • SHA256: 1b40dd9627369aafbc1749410b73629078f38d8abbdde81abdce134608f2b90b
  • Pointer size: 131 Bytes
  • Size of remote file: 148 kB
plots/es_word_repetition_ratio.png ADDED

Git LFS Details

  • SHA256: 1064d62b8fc32db42271f6cf26cc74242e3772449bc49479082ad23bf1392350
  • Pointer size: 131 Bytes
  • Size of remote file: 114 kB
plots/eu_character_repetition_ratio.png ADDED

Git LFS Details

  • SHA256: c3d825009e273f2f3167f10f78e2f2bc5e69e92965846b113f08386c7bdee17f
  • Pointer size: 131 Bytes
  • Size of remote file: 159 kB
plots/eu_closed_class_word_ratio.png ADDED

Git LFS Details

  • SHA256: 25ceaa4fdc65dc19f5e18f04bf73bb6532559f432b7823182323718e5df2c1f9
  • Pointer size: 131 Bytes
  • Size of remote file: 188 kB
plots/eu_flagged_word_ratio.png ADDED

Git LFS Details

  • SHA256: ce750c5e272c6dfef45852c4facd82fafac81d771de9c8818cd81312880e953b
  • Pointer size: 131 Bytes
  • Size of remote file: 118 kB
plots/eu_number_of_words.png ADDED

Git LFS Details

  • SHA256: d1e44bdae7266b683577a7ffb8f9ce4dbe93eb841f45fd3d0909ee04afffa999
  • Pointer size: 131 Bytes
  • Size of remote file: 133 kB
plots/eu_perplexity_score.png ADDED

Git LFS Details

  • SHA256: 79212bb07f2f3aa88f07f6512c58a2c6628b28ac63d9e6fcb54623d68ac48d27
  • Pointer size: 131 Bytes
  • Size of remote file: 158 kB
plots/eu_special_character_ratio.png ADDED

Git LFS Details

  • SHA256: d994f85ccab727aa0d13bc61a14f817a7cb2f73c96c10463f2c420406247925d
  • Pointer size: 131 Bytes
  • Size of remote file: 156 kB
plots/eu_word_repetition_ratio.png ADDED

Git LFS Details

  • SHA256: 1909e1c7780196f01e86c211cadb55889307a7f30a3def9e2d94c6486a338351
  • Pointer size: 131 Bytes
  • Size of remote file: 109 kB
plots/fr_character_repetition_ratio.png ADDED

Git LFS Details

  • SHA256: fb72a7cc99bf105f89bec95278793984a1c56817cbb6a39b435f50255f038318
  • Pointer size: 131 Bytes
  • Size of remote file: 158 kB
plots/fr_closed_class_word_ratio.png ADDED

Git LFS Details

  • SHA256: 2eab9d238667126eca14243c58ee679c681dafa1db59f69a79f7ddb6135f9952
  • Pointer size: 131 Bytes
  • Size of remote file: 150 kB
plots/fr_flagged_word_ratio.png ADDED

Git LFS Details

  • SHA256: d76137a58b8389671c5a8d44592609095f12a48e27dc821733ad9c0a7a846187
  • Pointer size: 131 Bytes
  • Size of remote file: 125 kB
plots/fr_number_of_words.png ADDED

Git LFS Details

  • SHA256: 0706293ebc11343405826da43a83afc658217562e5acdde4b44a34b8d2244234
  • Pointer size: 131 Bytes
  • Size of remote file: 124 kB
plots/fr_perplexity_score.png ADDED

Git LFS Details

  • SHA256: bdf3406197464ee54623e5c3fb96af20107b073c638fb58c2d97d8e7930b4eb7
  • Pointer size: 131 Bytes
  • Size of remote file: 149 kB