victormiller committed on
Commit ee43c81
1 Parent(s): d82cc95

Update curated.py

Files changed (1)
  1. curated.py +322 -126
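In brief, this commit removes the generic get_data() dispatcher and the inline preprocessing_steps table from curated.py and replaces them with one small loader per curated dataset (get_wiki_data, get_se_data, get_phil_data, ...), whose rendered output is wrapped in a collapsible Details/Summary block next to that dataset's filtering table. A minimal sketch of the per-dataset pattern, assuming the view_data, gen_random_id, Div, Details, and Summary helpers and the data/curated_samples/*.json sample files already present in curated.py:

import json

# Sketch of the per-dataset loader pattern introduced by this commit.
# view_data, gen_random_id, Div, Details, and Summary are assumed to be the
# helpers already defined or imported in curated.py.
def get_wiki_data(data_source: str = "Wikipedia", doc_id: int = 3, target: str = "foo"):
    # Clamp the requested document index to the ten bundled sample documents.
    doc_id = max(0, min(int(doc_id), 9))

    if data_source == "Wikipedia":
        # Wikipedia ships one sample file that serves as both the raw and
        # extracted view.
        raw_sample_doc = extracted_sample_doc = json.load(
            open("data/curated_samples/wiki.json")
        )
    else:
        # Fall back to empty documents for unknown sources.
        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]

    return view_data(
        raw_sample_doc[doc_id],
        extracted_sample_doc[doc_id],
        doc_id=doc_id,
        data_source="Wikipedia",
        data_sources="Wikipedia",
        target=target,
    )

wiki_examples = Div(
    Div(
        get_wiki_data(target=gen_random_id()),
        style="border: 1px solid #ccc; padding: 20px;",
    ),
)

# Inside filtering_process, each dataset's Section then exposes its examples
# behind a collapsible block next to the filtering table, e.g.:
#   table_div_wikipedia,
#   Details(Summary("Wikipedia Filtering Examples"), wiki_examples),

Splitting the dispatcher this way keeps each dataset's sample loading next to the section that displays it, at the cost of some repeated boilerplate per dataset.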
curated.py CHANGED
@@ -484,6 +484,291 @@ freelaw_examples = Div(
484
  ),
485
  )
486
 
487
  filtering_process = Div(
488
  Section(
489
  H3("This section contains the specific filtering steps taken for all 14 curated datasets."),
@@ -497,6 +782,10 @@ filtering_process = Div(
497
  H4("Filtering"),
498
  P("Manual inspection of the dataset demostrated high quality content. Only one filter was used to remove articles with few words. Based normal sentence constructs, the article was kept if it contained 10 or more words. Any article with fewer than 10 words was removed."),
499
  table_div_wikipedia,
500
  ),
501
  ),
502
  Section(
@@ -514,6 +803,10 @@ filtering_process = Div(
514
  Li("Note: The Frequency Filter was calculated but not applied. The most frequent word in the paper consists of alpha characters only, and it appears in less than 7.5% of the document. Words are obtained by splitting the text on whitespace."),
515
  ),
516
  table_div_arx,
517
  ),
518
  ),
519
  Section(
@@ -552,6 +845,10 @@ filtering_process = Div(
552
  Li("This data was part of paper domain which are combined together and minhash was generated and deduped together with all the datasets after doing local dedup"),
553
  ),
554
  table_div_s2o,
555
  ),
556
  ),
557
  Section(
@@ -584,6 +881,10 @@ filtering_process = Div(
584
  Li("This data was part of paper domain which are combined together and minhash was generated and deduped together with all the datasets after doing local dedup."),
585
  ),
586
  table_div_med,
587
  ),
588
  ),
589
  Section(
@@ -597,6 +898,10 @@ filtering_process = Div(
597
  Li("Many filters were used to clean the phil papers like double whitespaces, new lines etc. All filter details are here: https://github.com/thoppe/The-Pile-PhilPapers/blob/master/pdf_filter.py"),
598
  ),
599
  table_div_phil,
600
  ),
601
  ),
602
  Section(
@@ -608,6 +913,10 @@ filtering_process = Div(
608
  H4("Filtering"),
609
  P("EuroParl was initially filtered during the download process. Documents with fewer than 200 characters were removed. The documents also contained 'TAGS' which were removed."),
610
  table_div_up,
611
  ),
612
  ),
613
  Section(
@@ -697,6 +1006,10 @@ filtering_process = Div(
697
  Li("Minimum Word Count Filter: 10"),
698
  ),
699
  table_div_se,
700
  ),
701
  ),
702
  Section(
@@ -724,7 +1037,7 @@ filtering_process = Div(
724
  Li("Minimum Word Count Filter: 10"),
725
  Li("Unigram Log Probability"),
726
  ),
727
- table_div_uirc,
728
  ),
729
  ),
730
  Section(
@@ -745,6 +1058,10 @@ filtering_process = Div(
745
  Li("None"),
746
  ),
747
  table_div_dmm,
748
  ),
749
  ),
750
  Section(
@@ -762,6 +1079,10 @@ filtering_process = Div(
762
  Li("Unigram Log Probability"),
763
  ),
764
  table_div_pg19,
765
  ),
766
  ),
767
  )
@@ -887,78 +1208,6 @@ table_div_data_pipe = Div(NotStr(table_html_data_pipe), style="margin: 40px;")
887
 
888
 
889
 
890
-
891
- def get_data(data_source: str = "Freelaw", doc_id: int = 3, target: str = "foo"):
892
- doc_id = max(0, min(int(doc_id), 9))
893
-
894
- if data_source == "Freelaw":
895
- raw_sample_doc = json.load(open("data/curated_samples/freelaw_raw.json"))
896
- extracted_sample_doc = json.load(
897
- open("data/curated_samples/freelaw_extract.json")
898
- )
899
- elif data_source == "Wikipedia":
900
- raw_sample_doc = extracted_sample_doc = json.load(
901
- open("data/curated_samples/wiki.json")
902
- )
903
- elif data_source == "StackExchange":
904
- raw_sample_doc = json.load(open("data/curated_samples/stackexchange_raw.json"))
905
- extracted_sample_doc = json.load(
906
- open("data/curated_samples/stackexchange_extract.json")
907
- )
908
- elif data_source == "PhilPapers":
909
- raw_sample_doc = extracted_sample_doc = json.load(
910
- open("data/curated_samples/philpapers_raw.json")
911
- )
912
- elif data_source == "Arxiv":
913
- raw_sample_doc = json.load(open("data/curated_samples/arxiv_raw.json"))
914
- extracted_sample_doc = json.load(
915
- open("data/curated_samples/arxiv_extract.json")
916
- )
917
- elif data_source == "S2ORC":
918
- raw_sample_doc = extracted_sample_doc = json.load(
919
- open("data/curated_samples/s2orc_raw.json")
920
- )
921
- elif data_source == "S2ORC Abstract":
922
- raw_sample_doc = extracted_sample_doc = json.load(
923
- open("data/curated_samples/s2orc_abstract_raw.json")
924
- )
925
- elif data_source == "Pubmed":
926
- raw_sample_doc = json.load(open("data/curated_samples/pubmed_raw.json"))
927
- extracted_sample_doc = json.load(
928
- open("data/curated_samples/pubmed_extract.json")
929
- )
930
- elif data_source == "DM Maths":
931
- raw_sample_doc = json.load(open("data/curated_samples/dm_maths_raw.json"))
932
- extracted_sample_doc = json.load(
933
- open("data/curated_samples/dm_maths_extract.json")
934
- )
935
- elif data_source == "PG19":
936
- raw_sample_doc = extracted_sample_doc = json.load(
937
- open("data/curated_samples/pg19_raw.json")
938
- )
939
- elif data_source == "Europarl":
940
- raw_sample_doc = extracted_sample_doc = json.load(
941
- open("data/curated_samples/europarl_raw.json")
942
- )
943
- else:
944
- raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
945
-
946
- raw_json = raw_sample_doc[doc_id]
947
- extracted_json = extracted_sample_doc[doc_id]
948
- return view_data(
949
- raw_json,
950
- extracted_json,
951
- doc_id=doc_id,
952
- data_source=data_source,
953
- data_sources=data_sources,
954
- target=target,
955
- )
956
-
957
-
958
-
959
-
960
-
961
-
962
  def update(target: str, request):
963
  params = request.query_params
964
  if data_source := params.get(f"data_source_{target}"):
@@ -1082,58 +1331,6 @@ def curated(request):
1082
  )
1083
 
1084
 
1085
- preprocessing_steps = pd.DataFrame(
1086
- {
1087
- "Step": [
1088
- "Language Filter",
1089
- "Min Word Count",
1090
- "Title Abstract",
1091
- "Majority Language",
1092
- "Paragraph Count",
1093
- "Frequency",
1094
- "Unigram Log Probability",
1095
- ],
1096
- "Description": [
1097
- "Filtering data based on language",
1098
- "Setting a minimum word count threshold",
1099
- "Extracting information from the title and abstract",
1100
- "Identifying the majority language in the dataset",
1101
- "Counting the number of paragraphs in each document",
1102
- "Calculating the frequency of each word in the dataset",
1103
- "Calculating the log probability of each unigram",
1104
- ],
1105
- "Need": [
1106
- "To remove documents in unwanted languages",
1107
- "To filter out documents with very few words",
1108
- "To extract relevant information for analysis",
1109
- "To understand the distribution of languages in the dataset",
1110
- "To analyze the structure and length of documents",
1111
- "To identify important words in the dataset",
1112
- "To measure the significance of individual words",
1113
- ],
1114
- "Pros": [
1115
- "Improves data quality by removing irrelevant documents",
1116
- "Filters out low-quality or incomplete documents",
1117
- "Provides additional information for analysis",
1118
- "Enables language-specific analysis and insights",
1119
- "Helps understand the complexity and content of documents",
1120
- "Identifies important terms and topics in the dataset",
1121
- "Quantifies the importance of individual words",
1122
- ],
1123
- "Cons": [
1124
- "May exclude documents in less common languages",
1125
- "May remove documents with valuable information",
1126
- "May introduce bias in the analysis",
1127
- "May not accurately represent the language distribution",
1128
- "May not capture the complexity of document structure",
1129
- "May be sensitive to noise and outliers",
1130
- "May not capture the semantic meaning of words",
1131
- ],
1132
- }
1133
- )
1134
-
1135
- table_html = preprocessing_steps.to_html(index=False, border=0)
1136
- table_div = Div(NotStr(table_html), style="margin: 40px;")
1137
  data_preprocessing_div = Div(
1138
  H2("Data Preprocessing"),
1139
  P("Data preprocessing is a crucial step in the data science pipeline. It involves cleaning and transforming raw data into a format that is suitable for analysis. This process includes handling missing values, normalizing data, encoding categorical variables, and more."),
@@ -1166,7 +1363,6 @@ def curated(request):
1166
  plotly2fasthtml(diff2_stacked_bar),
1167
  P("The figure above provides a global view of the document filtering results. ~8% of documents were removed during these three steps."),
1168
  filtering_process,
1169
- freelaw_examples,
1170
  data_preparation_div,
1171
  #H2("Local Deduplication"), are these numbers even right?
1172
  #local_dedup_text,
 
484
  ),
485
  )
486
 
487
+
488
+ def get_wiki_data(data_source: str = "Wikipedia", doc_id: int = 3, target: str = "foo"):
489
+ doc_id = max(0, min(int(doc_id), 9))
490
+
491
+ if data_source == "Wikipedia":
492
+ raw_sample_doc = extracted_sample_doc = json.load(
493
+ open("data/curated_samples/wiki.json")
494
+ )
495
+ else:
496
+ raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
497
+
498
+ raw_json = raw_sample_doc[doc_id]
499
+ extracted_json = extracted_sample_doc[doc_id]
500
+ return view_data(
501
+ raw_json,
502
+ extracted_json,
503
+ doc_id=doc_id,
504
+ data_source="Wikipedia",
505
+ data_sources="Wikipedia",
506
+ target=target,
507
+ )
508
+
509
+ wiki_examples = Div(
510
+ Div(
511
+ get_wiki_data(target=gen_random_id()),
512
+ style="border: 1px solid #ccc; padding: 20px;",
513
+ ),
514
+ )
515
+
516
+ def get_se_data(data_source: str = "StackExchange", doc_id: int = 3, target: str = "foo"):
517
+ doc_id = max(0, min(int(doc_id), 9))
518
+
519
+ if data_source == "StackExchange":
520
+ raw_sample_doc = json.load(open("data/curated_samples/stackexchange_raw.json"))
521
+ extracted_sample_doc = json.load(
522
+ open("data/curated_samples/stackexchange_extract.json")
523
+ )
524
+ else:
525
+ raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
526
+
527
+ raw_json = raw_sample_doc[doc_id]
528
+ extracted_json = extracted_sample_doc[doc_id]
529
+ return view_data(
530
+ raw_json,
531
+ extracted_json,
532
+ doc_id=doc_id,
533
+ data_source="StackExchange",
534
+ data_sources="StackExchange",
535
+ target=target,
536
+ )
537
+
538
+ se_examples = Div(
539
+ Div(
540
+ get_se_data(target=gen_random_id()),
541
+ style="border: 1px solid #ccc; padding: 20px;",
542
+ ),
543
+ )
544
+
545
+ def get_phil_data(data_source: str = "PhilPapers", doc_id: int = 3, target: str = "foo"):
546
+ doc_id = max(0, min(int(doc_id), 9))
547
+
548
+ if data_source == "PhilPapers":
549
+ raw_sample_doc = extracted_sample_doc = json.load(
550
+ open("data/curated_samples/philpapers_raw.json")
551
+ )
552
+ else:
553
+ raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
554
+
555
+ raw_json = raw_sample_doc[doc_id]
556
+ extracted_json = extracted_sample_doc[doc_id]
557
+ return view_data(
558
+ raw_json,
559
+ extracted_json,
560
+ doc_id=doc_id,
561
+ data_source="PhilPapers",
562
+ data_sources="PhilPapers",
563
+ target=target,
564
+ )
565
+
566
+ phil_examples = Div(
567
+ Div(
568
+ get_phil_data(target=gen_random_id()),
569
+ style="border: 1px solid #ccc; padding: 20px;",
570
+ ),
571
+ )
572
+
573
+ def get_arx_data(data_source: str = "Arxiv", doc_id: int = 3, target: str = "foo"):
574
+ doc_id = max(0, min(int(doc_id), 9))
575
+
576
+ if data_source == "Arxiv":
577
+ raw_sample_doc = json.load(open("data/curated_samples/arxiv_raw.json"))
578
+ extracted_sample_doc = json.load(
579
+ open("data/curated_samples/arxiv_extract.json")
580
+ )
581
+ else:
582
+ raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
583
+
584
+ raw_json = raw_sample_doc[doc_id]
585
+ extracted_json = extracted_sample_doc[doc_id]
586
+ return view_data(
587
+ raw_json,
588
+ extracted_json,
589
+ doc_id=doc_id,
590
+ data_source="Arxiv",
591
+ data_sources="Arxiv",
592
+ target=target,
593
+ )
594
+
595
+ arx_examples = Div(
596
+ Div(
597
+ get_arx_data(target=gen_random_id()),
598
+ style="border: 1px solid #ccc; padding: 20px;",
599
+ ),
600
+ )
601
+
602
+ def get_S2ORC_data(data_source: str = "S2ORC", doc_id: int = 3, target: str = "foo"):
603
+ doc_id = max(0, min(int(doc_id), 9))
604
+
605
+ if data_source == "S2ORC":
606
+ raw_sample_doc = extracted_sample_doc = json.load(
607
+ open("data/curated_samples/s2orc_raw.json")
608
+ )
609
+ else:
610
+ raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
611
+
612
+ raw_json = raw_sample_doc[doc_id]
613
+ extracted_json = extracted_sample_doc[doc_id]
614
+ return view_data(
615
+ raw_json,
616
+ extracted_json,
617
+ doc_id=doc_id,
618
+ data_source="S2ORC",
619
+ data_sources="S2ORC",
620
+ target=target,
621
+ )
622
+
623
+ s2o_examples = Div(
624
+ Div(
625
+ get_S2ORC_data(target=gen_random_id()),
626
+ style="border: 1px solid #ccc; padding: 20px;",
627
+ ),
628
+ )
629
+
630
+ def get_S2ORCA_data(data_source: str = "S2ORC Abstract", doc_id: int = 3, target: str = "foo"):
631
+ doc_id = max(0, min(int(doc_id), 9))
632
+
633
+ if data_source == "S2ORC":
634
+ raw_sample_doc = extracted_sample_doc = json.load(
635
+ open("data/curated_samples/s2orc_abstract_raw.json")
636
+ )
637
+ else:
638
+ raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
639
+
640
+ raw_json = raw_sample_doc[doc_id]
641
+ extracted_json = extracted_sample_doc[doc_id]
642
+ return view_data(
643
+ raw_json,
644
+ extracted_json,
645
+ doc_id=doc_id,
646
+ data_source="S2ORC Abstract",
647
+ data_sources="S2ORC Abstract",
648
+ target=target,
649
+ )
650
+
651
+ s2oa_examples = Div(
652
+ Div(
653
+ get_S2ORCA_data(target=gen_random_id()),
654
+ style="border: 1px solid #ccc; padding: 20px;",
655
+ ),
656
+ )
657
+
658
+ def get_pubmed_data(data_source: str = "Pubmed", doc_id: int = 3, target: str = "foo"):
659
+ doc_id = max(0, min(int(doc_id), 9))
660
+
661
+ if data_source == "Pubmed":
662
+ raw_sample_doc = json.load(open("data/curated_samples/pubmed_raw.json"))
663
+ extracted_sample_doc = json.load(
664
+ open("data/curated_samples/pubmed_extract.json")
665
+ )
666
+ else:
667
+ raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
668
+
669
+ raw_json = raw_sample_doc[doc_id]
670
+ extracted_json = extracted_sample_doc[doc_id]
671
+ return view_data(
672
+ raw_json,
673
+ extracted_json,
674
+ doc_id=doc_id,
675
+ data_source="Pubmed",
676
+ data_sources="Pubmed",
677
+ target=target,
678
+ )
679
+
680
+ pubmed_examples = Div(
681
+ Div(
682
+ get_pubmed_data(target=gen_random_id()),
683
+ style="border: 1px solid #ccc; padding: 20px;",
684
+ ),
685
+ )
686
+
687
+ def get_dmm_data(data_source: str = "DM Math", doc_id: int = 3, target: str = "foo"):
688
+ doc_id = max(0, min(int(doc_id), 9))
689
+
690
+ if data_source == "DM Math":
691
+ raw_sample_doc = json.load(open("data/curated_samples/dm_maths_raw.json"))
692
+ extracted_sample_doc = json.load(
693
+ open("data/curated_samples/dm_maths_extract.json")
694
+ )
695
+ else:
696
+ raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
697
+
698
+ raw_json = raw_sample_doc[doc_id]
699
+ extracted_json = extracted_sample_doc[doc_id]
700
+ return view_data(
701
+ raw_json,
702
+ extracted_json,
703
+ doc_id=doc_id,
704
+ data_source="DM Math",
705
+ data_sources="DM Math",
706
+ target=target,
707
+ )
708
+
709
+ dmm_examples = Div(
710
+ Div(
711
+ get_dmm_data(target=gen_random_id()),
712
+ style="border: 1px solid #ccc; padding: 20px;",
713
+ ),
714
+ )
715
+
716
+ def get_pg19_data(data_source: str = "PG19", doc_id: int = 3, target: str = "foo"):
717
+ doc_id = max(0, min(int(doc_id), 9))
718
+
719
+ if data_source == "PG19":
720
+ raw_sample_doc = extracted_sample_doc = json.load(
721
+ open("data/curated_samples/pg19_raw.json")
722
+ )
723
+ else:
724
+ raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
725
+
726
+ raw_json = raw_sample_doc[doc_id]
727
+ extracted_json = extracted_sample_doc[doc_id]
728
+ return view_data(
729
+ raw_json,
730
+ extracted_json,
731
+ doc_id=doc_id,
732
+ data_source="PG19",
733
+ data_sources="PG19",
734
+ target=target,
735
+ )
736
+
737
+ pg19_examples = Div(
738
+ Div(
739
+ get_pg19_data(target=gen_random_id()),
740
+ style="border: 1px solid #ccc; padding: 20px;",
741
+ ),
742
+ )
743
+
744
+ def get_eu_data(data_source: str = "Europarl", doc_id: int = 3, target: str = "foo"):
745
+ doc_id = max(0, min(int(doc_id), 9))
746
+
747
+ if data_source == "Europarl":
748
+ raw_sample_doc = extracted_sample_doc = json.load(
749
+ open("data/curated_samples/europarl_raw.json")
750
+ )
751
+ else:
752
+ raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
753
+
754
+ raw_json = raw_sample_doc[doc_id]
755
+ extracted_json = extracted_sample_doc[doc_id]
756
+ return view_data(
757
+ raw_json,
758
+ extracted_json,
759
+ doc_id=doc_id,
760
+ data_source="Europarl",
761
+ data_sources="Europarl",
762
+ target=target,
763
+ )
764
+
765
+ eu_examples = Div(
766
+ Div(
767
+ get_eu_data(target=gen_random_id()),
768
+ style="border: 1px solid #ccc; padding: 20px;",
769
+ ),
770
+ )
771
+
772
  filtering_process = Div(
773
  Section(
774
  H3("This section contains the specific filtering steps taken for all 14 curated datasets."),
 
782
  H4("Filtering"),
783
  P("Manual inspection of the dataset demostrated high quality content. Only one filter was used to remove articles with few words. Based normal sentence constructs, the article was kept if it contained 10 or more words. Any article with fewer than 10 words was removed."),
784
  table_div_wikipedia,
785
+ Details(
786
+ Summary("Wikipedia Filtering Examples"),
787
+ wiki_examples,
788
+ ),
789
  ),
790
  ),
791
  Section(
 
803
  Li("Note: The Frequency Filter was calculated but not applied. The most frequent word in the paper consists of alpha characters only, and it appears in less than 7.5% of the document. Words are obtained by splitting the text on whitespace."),
804
  ),
805
  table_div_arx,
806
+ Details(
807
+ Summary("ArXiv Filtering Examples"),
808
+ arx_examples,
809
+ ),
810
  ),
811
  ),
812
  Section(
 
845
  Li("This data was part of paper domain which are combined together and minhash was generated and deduped together with all the datasets after doing local dedup"),
846
  ),
847
  table_div_s2o,
848
+ Details(
849
+ Summary("FreeLaw Filtering Examples -- need to update"),
850
+ freelaw_examples,
851
+ ),
852
  ),
853
  ),
854
  Section(
 
881
  Li("This data was part of paper domain which are combined together and minhash was generated and deduped together with all the datasets after doing local dedup."),
882
  ),
883
  table_div_med,
884
+ Details(
885
+ Summary("PubMed Filtering Examples"),
886
+ pubmed_examples,
887
+ ),
888
  ),
889
  ),
890
  Section(
 
898
  Li("Many filters were used to clean the phil papers like double whitespaces, new lines etc. All filter details are here: https://github.com/thoppe/The-Pile-PhilPapers/blob/master/pdf_filter.py"),
899
  ),
900
  table_div_phil,
901
+ Details(
902
+ Summary("Phil Papers Filtering Examples"),
903
+ phil_examples,
904
+ ),
905
  ),
906
  ),
907
  Section(
 
913
  H4("Filtering"),
914
  P("EuroParl was initially filtered during the download process. Documents with fewer than 200 characters were removed. The documents also contained 'TAGS' which were removed."),
915
  table_div_up,
916
+ Details(
917
+ Summary("EuroParl Filtering Examples"),
918
+ eu_examples,
919
+ ),
920
  ),
921
  ),
922
  Section(
 
1006
  Li("Minimum Word Count Filter: 10"),
1007
  ),
1008
  table_div_se,
1009
+ Details(
1010
+ Summary("StackExchange Filtering Examples"),
1011
+ se_examples,
1012
+ ),
1013
  ),
1014
  ),
1015
  Section(
 
1037
  Li("Minimum Word Count Filter: 10"),
1038
  Li("Unigram Log Probability"),
1039
  ),
1040
+ table_div_uirc,
1041
  ),
1042
  ),
1043
  Section(
 
1058
  Li("None"),
1059
  ),
1060
  table_div_dmm,
1061
+ Details(
1062
+ Summary("DM Math Filtering Examples"),
1063
+ dmm_examples,
1064
+ ),
1065
  ),
1066
  ),
1067
  Section(
 
1079
  Li("Unigram Log Probability"),
1080
  ),
1081
  table_div_pg19,
1082
+ Details(
1083
+ Summary("PG-19 Filtering Examples"),
1084
+ pg19_examples,
1085
+ ),
1086
  ),
1087
  ),
1088
  )
 
1208
 
1209
 
1210
 
1211
  def update(target: str, request):
1212
  params = request.query_params
1213
  if data_source := params.get(f"data_source_{target}"):
 
1331
  )
1332
 
1333
 
1334
  data_preprocessing_div = Div(
1335
  H2("Data Preprocessing"),
1336
  P("Data preprocessing is a crucial step in the data science pipeline. It involves cleaning and transforming raw data into a format that is suitable for analysis. This process includes handling missing values, normalizing data, encoding categorical variables, and more."),
 
1363
  plotly2fasthtml(diff2_stacked_bar),
1364
  P("The figure above provides a global view of the document filtering results. ~8% of documents were removed during these three steps."),
1365
  filtering_process,
 
1366
  data_preparation_div,
1367
  #H2("Local Deduplication"), are these numbers even right?
1368
  #local_dedup_text,