HugoLaurencon committed on
Commit
4809033
1 Parent(s): fa81556

new filter on word repetition ratio

Browse files
app.py CHANGED
@@ -156,17 +156,17 @@ class Visualization_for_lang:
156
 
157
  conds["number_words"] = [cond_1, cond_2]
158
 
159
- if "repetitions_ratio" in columns:
160
- with st.sidebar.expander("Repetitions ratio"):
161
  val_repetitions_lengths = list(
162
- self.docs["repetitions_ratio"].iloc[0].keys()
163
  )
164
  default_index = (
165
  val_repetitions_lengths.index("10")
166
  if "10" in val_repetitions_lengths
167
  else 0
168
  )
169
- label_selectbox = "Length of the repetitions (that will determine the repetitions ratio)."
170
  repetitions_length = st.selectbox(
171
  label=label_selectbox,
172
  options=val_repetitions_lengths,
@@ -175,25 +175,27 @@ class Visualization_for_lang:
175
  st.caption(
176
  "Choosing a higher or lower number does not mean that the filtering "
177
  "is stronger or weaker. Be careful, choosing a low number (below 5 for languages like English) "
178
- "tends to associate a high repetitions ratio to very long documents (like book chapters), but with "
179
  "few or no repetitions, simply because their length gives them more diversity, and we do "
180
- "not want to discard such documents."
 
 
181
  )
182
- self.docs["repetitions_ratio"] = self.docs_checkpoint[
183
- "repetitions_ratio"
184
  ]
185
- for i in range(len(self.docs["repetitions_ratio"])):
186
- self.docs["repetitions_ratio"].iloc[i] = self.docs[
187
- "repetitions_ratio"
188
  ].iloc[i][repetitions_length]
189
 
190
- cutoff_def = "If the repetitions ratio of a document is higher than this number, the document is removed."
191
- cutoff_repetitions_ratio = st.slider(
192
  cutoff_def, 0.0, 1.0, 1.0, step=0.01
193
  )
194
  new_key = (
195
- "repetitions_ratio",
196
- cutoff_repetitions_ratio,
197
  True,
198
  repetitions_length,
199
  )
@@ -201,7 +203,55 @@ class Visualization_for_lang:
201
  Visualization_for_lang.plot_hist(self.docs, new_key)
202
  cond = get_cond(new_key[0], new_key[1], new_key[2])
203
  Visualization_for_lang.print_discarded_by_cond(cond)
204
- conds["repetitions_ratio"] = [cond]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
205
 
206
  if "special_characters_ratio" in columns:
207
  with st.sidebar.expander("Special characters ratio"):
@@ -361,12 +411,25 @@ class Visualization_for_lang:
361
  "docs",
362
  )
363
 
364
- if "repetitions_ratio" in columns:
365
- cond_filter = np.invert(np.all(conds["repetitions_ratio"], axis=0))
 
 
 
 
 
 
 
 
 
 
 
 
 
366
  Visualization_for_lang.display_dataset(
367
  self.docs,
368
  cond_filter,
369
- "Discarded documents for the filter on the repetitions ratio",
370
  "docs",
371
  )
372
 
@@ -606,13 +669,31 @@ class Visualization_for_lang:
606
  if is_doc_discarded(key, len(words)):
607
  is_discarded = True
608
 
609
- elif key[0] == "repetitions_ratio":
610
- repetitions_ratio = Filtering.compute_repetitions_ratio(
611
- personal_doc, int(key[3])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
612
  )
613
- repetitions_ratio = round(repetitions_ratio, 3)
614
- st.markdown(f"Repetitions ratio: {repetitions_ratio}")
615
- if is_doc_discarded(key, repetitions_ratio):
616
  is_discarded = True
617
 
618
  elif key[0] == "special_characters_ratio":
@@ -773,7 +854,7 @@ class Visualization:
773
 
774
  def visualization(self):
775
  self.preamble()
776
- # self.warning_preamble()
777
  self.choose_lang()
778
 
779
 
 
156
 
157
  conds["number_words"] = [cond_1, cond_2]
158
 
159
+ if "character_repetition_ratio" in columns:
160
+ with st.sidebar.expander("Character repetition ratio"):
161
  val_repetitions_lengths = list(
162
+ self.docs["character_repetition_ratio"].iloc[0].keys()
163
  )
164
  default_index = (
165
  val_repetitions_lengths.index("10")
166
  if "10" in val_repetitions_lengths
167
  else 0
168
  )
169
+ label_selectbox = "Length of repetitions in characters (that will influence the character repetition ratio)."
170
  repetitions_length = st.selectbox(
171
  label=label_selectbox,
172
  options=val_repetitions_lengths,
 
175
  st.caption(
176
  "Choosing a higher or lower number does not mean that the filtering "
177
  "is stronger or weaker. Be careful, choosing a low number (below 5 for languages like English) "
178
+ "tends to associate a high character repetition ratio to very long documents (like book chapters), but with "
179
  "few or no repetitions, simply because their length gives them more diversity, and we do "
180
+ "not want to discard such documents. It is generally better to increase this number, so that false "
181
+ "positives are very short documents (which we want to delete anyway) rather than long ones. However, "
182
+ "a low number can be useful for Chinese, where a character can designate a whole word."
183
  )
184
+ self.docs["character_repetition_ratio"] = self.docs_checkpoint[
185
+ "character_repetition_ratio"
186
  ]
187
+ for i in range(len(self.docs["character_repetition_ratio"])):
188
+ self.docs["character_repetition_ratio"].iloc[i] = self.docs[
189
+ "character_repetition_ratio"
190
  ].iloc[i][repetitions_length]
191
 
192
+ cutoff_def = "If the character repetition ratio of a document is higher than this number, the document is removed."
193
+ cutoff_character_repetition_ratio = st.slider(
194
  cutoff_def, 0.0, 1.0, 1.0, step=0.01
195
  )
196
  new_key = (
197
+ "character_repetition_ratio",
198
+ cutoff_character_repetition_ratio,
199
  True,
200
  repetitions_length,
201
  )
 
203
  Visualization_for_lang.plot_hist(self.docs, new_key)
204
  cond = get_cond(new_key[0], new_key[1], new_key[2])
205
  Visualization_for_lang.print_discarded_by_cond(cond)
206
+ conds["character_repetition_ratio"] = [cond]
207
+
208
+ if "word_repetition_ratio" in columns:
209
+ with st.sidebar.expander("Word repetition ratio"):
210
+ val_repetitions_lengths = list(
211
+ self.docs["word_repetition_ratio"].iloc[0].keys()
212
+ )
213
+ default_index = (
214
+ val_repetitions_lengths.index("5")
215
+ if "5" in val_repetitions_lengths
216
+ else 0
217
+ )
218
+ label_selectbox = "Length of repetitions in words (that will influence the word repetition ratio)."
219
+ repetitions_length = st.selectbox(
220
+ label=label_selectbox,
221
+ options=val_repetitions_lengths,
222
+ index=default_index,
223
+ )
224
+ st.caption(
225
+ "Choosing a higher or lower number does not mean that the filtering "
226
+ "is stronger or weaker. Be careful, choosing a low number (like 3) could "
227
+ "tend to associate a high word repetition ratio to very long documents (like book chapters), but with "
228
+ "few or no repetitions, simply because their length gives them more diversity, and we do "
229
+ "not want to discard such documents. It is generally better to increase a bit this number, so that false "
230
+ "positives are very short documents (which we want to delete anyway) rather than long ones."
231
+ )
232
+ self.docs["word_repetition_ratio"] = self.docs_checkpoint[
233
+ "word_repetition_ratio"
234
+ ]
235
+ for i in range(len(self.docs["word_repetition_ratio"])):
236
+ self.docs["word_repetition_ratio"].iloc[i] = self.docs[
237
+ "word_repetition_ratio"
238
+ ].iloc[i][repetitions_length]
239
+
240
+ cutoff_def = "If the word repetition ratio of a document is higher than this number, the document is removed."
241
+ cutoff_word_repetition_ratio = st.slider(
242
+ cutoff_def, 0.0, 1.0, 1.0, step=0.01
243
+ )
244
+ new_key = (
245
+ "word_repetition_ratio",
246
+ cutoff_word_repetition_ratio,
247
+ True,
248
+ repetitions_length,
249
+ )
250
+ keys.append(new_key)
251
+ Visualization_for_lang.plot_hist(self.docs, new_key)
252
+ cond = get_cond(new_key[0], new_key[1], new_key[2])
253
+ Visualization_for_lang.print_discarded_by_cond(cond)
254
+ conds["word_repetition_ratio"] = [cond]
255
 
256
  if "special_characters_ratio" in columns:
257
  with st.sidebar.expander("Special characters ratio"):
 
411
  "docs",
412
  )
413
 
414
+ if "character_repetition_ratio" in columns:
415
+ cond_filter = np.invert(
416
+ np.all(conds["character_repetition_ratio"], axis=0)
417
+ )
418
+ Visualization_for_lang.display_dataset(
419
+ self.docs,
420
+ cond_filter,
421
+ "Discarded documents for the filter on the character repetition ratio",
422
+ "docs",
423
+ )
424
+
425
+ if "word_repetition_ratio" in columns:
426
+ cond_filter = np.invert(
427
+ np.all(conds["word_repetition_ratio"], axis=0)
428
+ )
429
  Visualization_for_lang.display_dataset(
430
  self.docs,
431
  cond_filter,
432
+ "Discarded documents for the filter on the word repetition ratio",
433
  "docs",
434
  )
435
 
 
669
  if is_doc_discarded(key, len(words)):
670
  is_discarded = True
671
 
672
+ elif key[0] == "character_repetition_ratio":
673
+ character_repetition_ratio = (
674
+ Filtering.compute_character_repetition_ratio(
675
+ personal_doc, int(key[3])
676
+ )
677
+ )
678
+ character_repetition_ratio = round(
679
+ character_repetition_ratio, 3
680
+ )
681
+ st.markdown(
682
+ f"Character repetition ratio: {character_repetition_ratio}"
683
+ )
684
+ if is_doc_discarded(key, character_repetition_ratio):
685
+ is_discarded = True
686
+
687
+ elif key[0] == "word_repetition_ratio":
688
+ word_repetition_ratio = Filtering.compute_word_repetition_ratio(
689
+ personal_doc,
690
+ self.sentencepiece_model_tok,
691
+ self.param["strip_characters"],
692
+ int(key[3]),
693
  )
694
+ word_repetition_ratio = round(word_repetition_ratio, 3)
695
+ st.markdown(f"Word repetition ratio: {word_repetition_ratio}")
696
+ if is_doc_discarded(key, word_repetition_ratio):
697
  is_discarded = True
698
 
699
  elif key[0] == "special_characters_ratio":
 
854
 
855
  def visualization(self):
856
  self.preamble()
857
+ self.warning_preamble()
858
  self.choose_lang()
859
 
860
 
en_examples_with_stats.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:29417f05cc029ab24ba89cfc4358dac755411b01f1925c735c2205b68f975fcc
3
- size 240781004
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac12d82e24642fd0b1d4f6c5b8fbe1edb42dc15a38185ccc8ec95ac0fe687bc2
3
+ size 241407829
explanation_filtering_pipeline.pdf CHANGED
Binary files a/explanation_filtering_pipeline.pdf and b/explanation_filtering_pipeline.pdf differ
 
zh_examples_with_stats.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:90ffaf5e5c7b556587c8b2b97ad49c752bea5608d5cc56b7ea03fb0d96a71fd2
3
- size 62914634
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:85f70e561c971b468ba69963841b73e6a6da0a230f19f191234701e926688feb
3
+ size 63554172