HugoLaurencon committed
Commit: bfbcd60 • Parent(s): 649ea6a
button to download parameters

Files changed:
- app.py +114 -88
- explanation_filtering_pipeline.pdf +0 -0
app.py
CHANGED
@@ -162,9 +162,7 @@ class Visualization:
             if "10" in val_repetitions_lengths
             else 0
         )
-        label_selectbox = (
-            "Length of the repetitions (that will determine the repetitions ratio)."
-        )
+        label_selectbox = "Length of the repetitions (that will determine the repetitions ratio)."
         repetitions_length = st.sidebar.selectbox(
             label=label_selectbox,
             options=val_repetitions_lengths,
@@ -261,6 +259,7 @@ class Visualization:
             return keys, conds
 
         self.keys, conds = set_sliders()
+        self.parameters = self.keys * 1
 
         all_conds = [subcond for cond in list(conds.values()) for subcond in cond]
         all_conds = np.all(all_conds, axis=0)
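
Note on the added line: multiplying a list by 1 returns a new list containing the same elements, so `self.parameters` starts out as a shallow copy of `self.keys` that later `append` calls can extend without mutating the slider keys. A standalone illustration (the tuples below are invented, not the app's real defaults):

    # list * 1 builds a new list (a shallow copy), so appending to the
    # copy leaves the original untouched.
    keys = [("number_words", 10, False), ("perplexity_score", 1000, True)]
    parameters = keys * 1  # equivalent to keys[:] or list(keys)
    parameters.append(("len_word", 25, True))
    assert len(keys) == 2 and len(parameters) == 3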
@@ -347,10 +346,14 @@ class Visualization:
         cutoff_def = "If the length of a word is higher than this number, the word is removed."
         max_len_word = min(int(np.max(self.words["len_word"])) + 1, 200)
         cutoff_word = st.sidebar.slider(cutoff_def, 0, max_len_word, max_len_word)
+        self.parameters.append(("len_word", cutoff_word, True))
+        st.sidebar.caption("---------")
 
         incorrect_substrings = st.sidebar.checkbox(
             "Remove words with incorrect substrings."
         )
+        self.parameters.append(("incorrect_substrings", incorrect_substrings))
+        st.sidebar.caption("---------")
 
         cond_words = self.words["len_word"] <= cutoff_word
         if incorrect_substrings:
@@ -381,6 +384,13 @@ class Visualization:
         )
         st.dataframe(retained_words)
 
+    def download_parameters(self):
+        btn = st.sidebar.download_button(
+            label="Download current parameters as json",
+            data=json.dumps(self.parameters),
+            file_name=f"parameters_{self.lang_dataset_id}.json",
+        )
+
     def plot_distributions_filtering_parameters(self):
         st.header("Distributions of the filtering parameters")
 
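Because `json.dumps` serializes each parameter tuple as a JSON array, and the tuples have different arities (for example `("len_word", cutoff_word, True)` versus `("incorrect_substrings", incorrect_substrings)`), a consumer of the downloaded file gets back lists of varying length. A hypothetical round-trip loader, not part of this commit, might look like:

    import json

    # Assumed helper for illustration: restore the exported parameters,
    # converting the JSON arrays back into tuples.
    def load_parameters(path):
        with open(path) as f:
            return [tuple(p) for p in json.load(f)]

    # e.g. parameters = load_parameters("parameters_en.json")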
@@ -437,94 +447,109 @@ class Visualization:
         is_discarded = False
 
         def is_doc_discarded(key, score):
-            if key[2]:
+            if key[2]:  # max cutoff
                 return score > key[1]
             else:
                 return score < key[1]
 
-        [83 removed lines: the previous version of the document analysis; their content is not preserved in this rendering]
+        if personal_doc:
+
+            st.markdown("Statistics of the document:")
+
+            for key in self.keys:
+                if key[0] == "number_words":
+                    words = ModifyingDocuments.get_words_from_document(
+                        personal_doc,
+                        self.sentencepiece_model_tok,
+                        lower_case=False,
+                        strip_characters=self.param["strip_characters"],
+                    )
+                    if key[2]:
+                        st.markdown(f"Number of words: {len(words)}")
+                    if is_doc_discarded(key, len(words)):
+                        is_discarded = True
+
+                elif key[0] == "repetitions_ratio":
+                    repetitions_ratio = Filtering.compute_repetitions_ratio(
+                        personal_doc, int(key[3])
+                    )
+                    repetitions_ratio = round(repetitions_ratio, 3)
+                    st.markdown(f"Repetitions ratio: {repetitions_ratio}")
+                    if is_doc_discarded(key, repetitions_ratio):
+                        is_discarded = True
+
+                elif key[0] == "special_characters_ratio":
+                    special_characters_ratio = (
+                        Filtering.compute_special_characters_ratio(
+                            personal_doc, self.param["special_characters"]
+                        )
+                    )
+                    special_characters_ratio = round(special_characters_ratio, 3)
+                    st.markdown(f"Special characters ratio: {special_characters_ratio}")
+                    if is_doc_discarded(key, special_characters_ratio):
+                        is_discarded = True
+
+                elif key[0] == "stopwords_ratio":
+                    stopwords_ratio = Filtering.compute_stopwords_ratio(
+                        personal_doc,
+                        self.sentencepiece_model_tok,
+                        self.param["strip_characters"],
+                        self.param["cond_words_augmentation"],
+                        self.param["words_augmentation_group_sizes"],
+                        self.param["words_augmentation_join_char"],
+                        self.stopwords,
+                    )
+                    stopwords_ratio = round(stopwords_ratio, 3)
+                    st.markdown(f"Stop words ratio: {stopwords_ratio}")
+                    if is_doc_discarded(key, stopwords_ratio):
+                        is_discarded = True
+
+                elif key[0] == "badwords_ratio":
+                    badwords_ratio = Filtering.compute_badwords_ratio(
+                        personal_doc,
+                        self.sentencepiece_model_tok,
+                        self.param["strip_characters"],
+                        self.param["cond_words_augmentation"],
+                        self.param["words_augmentation_group_sizes"],
+                        self.param["words_augmentation_join_char"],
+                        self.badwords,
+                    )
+                    badwords_ratio = round(badwords_ratio, 3)
+                    st.markdown(f"Flagged words ratio: {badwords_ratio}")
+                    if is_doc_discarded(key, badwords_ratio):
+                        is_discarded = True
+
+                elif key[0] == "lang_id_score":
+                    (
+                        lang_pred_dataset_id,
+                        lang_id_score,
+                    ) = Filtering.compute_lang_id_pred_score(
+                        personal_doc, self.model_lang_id
+                    )
+                    lang_id_score = round(lang_id_score, 3)
+                    st.markdown(
+                        f"Language identification confidence score: {lang_id_score}"
+                    )
+                    if is_doc_discarded(key, lang_id_score) or (
+                        self.lang_dataset_id != lang_pred_dataset_id
+                    ):
+                        is_discarded = True
+
+                elif key[0] == "perplexity_score":
+                    perplexity_score = Filtering.compute_perplexity_score(
+                        personal_doc,
+                        self.sentencepiece_model,
+                        self.kenlm_model,
+                    )
+                    perplexity_score = round(perplexity_score, 3)
+                    st.markdown(f"Perplexity score: {perplexity_score}")
+                    if is_doc_discarded(key, perplexity_score):
+                        is_discarded = True
+
+            is_discarded = "" if is_discarded else "not "
+            st.markdown(
+                f"With the current filtering parameters, this document **is {is_discarded}discarded**."
+            )
 
     def download_data(self):
         st.header("Download data")
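The `# max cutoff` comment added to `is_doc_discarded` spells out the convention this hunk relies on: each key is a tuple `(statistic_name, cutoff_value, is_max_cutoff, ...)`, and a document is discarded when its score falls on the wrong side of the cutoff. A self-contained restatement with invented cutoff values:

    # Sketch of the cutoff convention; the keys are examples, not app state.
    def is_doc_discarded(key, score):
        if key[2]:  # max cutoff: discard when the score is too high
            return score > key[1]
        else:  # min cutoff: discard when the score is too low
            return score < key[1]

    assert is_doc_discarded(("perplexity_score", 1000, True), 1500)
    assert is_doc_discarded(("stopwords_ratio", 0.3, False), 0.1)
    assert not is_doc_discarded(("number_words", 10, False), 50)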
@@ -543,8 +568,9 @@ class Visualization:
         self.set_title()
         self.filtering_of_docs()
         self.filtering_of_words()
+        self.download_parameters()
         self.plot_distributions_filtering_parameters()
-        #self.plot_zipf_law()
+        # self.plot_zipf_law()
         self.analyse_personal_doc()
         self.download_data()
explanation_filtering_pipeline.pdf
CHANGED
Binary files a/explanation_filtering_pipeline.pdf and b/explanation_filtering_pipeline.pdf differ