victormiller committed on
Commit: f9cb337
1 Parent(s): 0cd5893

Update curated.py

Files changed (1): curated.py (+4, -206)
curated.py CHANGED
@@ -645,101 +645,6 @@ def get_data(data_source: str = "Freelaw", doc_id: int = 3, target: str = "foo")
     )


-def get_chart_28168342():
-    fig = go.Figure()
-    filter_names = [
-        "Download",
-        "Language",
-        "Min word count",
-        "Title Abstract",
-        "Majority language",
-        "Paragraph count",
-        "Frequency",
-        "Unigram log probability",
-        "Local dedup",
-    ]
-
-    data_sources = [
-        ("Wikipedia", [61614907, 61614907, 60468491, 60468491, 60468491, 60468491, 60468491, 60468491, 20]),
-        ("Freelaw", [75971288, 73690766, 68171834, 68171834, 68171834, 68171834, 68171834, 68123174, 20]),
-        ("DM Maths", [112559888, 112559888, 112559888, 112559888, 112559888, 112559888, 112559888, 112559888, 20]),
-        ("USPTO", [6880276, 6878964, 6749922, 6749922, 6749922, 6749922, 6749922, 6749389, 20]),
-        ("PG19", [28752, 28683, 28682, 28682, 28682, 28682, 28682, 28632, 20]),
-        ("Hackernews", [2064931, 2010802, 2010488, 2010488, 2010488, 2010488, 2010488, 2003636, 20]),
-        ("Ubuntu IRC", [37966, 23501, 23468, 23468, 23468, 23468, 23468, 23205, 20]),
-        ("Europarl", [69814, 69814, 69814, 69814, 69814, 69814, 69814, 69814, 20]),
-        ("StackExchange", [23246548, 23246548, 23246352, 23246352, 23246352, 23246352, 23246352, 23246352, 20]),
-        ("Arxiv", [1911867, 1869441, 1763840, 1763840, 1763840, 1763840, 1763840, 1762661, 20]),
-        ("S2ORC", [12963563, 12963563, 12963563, 10731113, 9455620, 9306816, 8055147, 8055147, 20]),
-        ("S2ORC Abstract", [102324176, 83867601, 82889293, 82889293, 82889293, 82889293, 82889293, 82777912, 20]),
-        ("PubMed Central", [5230932, 4830486, 4768310, 4768310, 4768310, 4768310, 4768310, 4767474, 20]),
-        ("PubMed Central Abstract", [25787474, 25784374, 25747955, 25747955, 25747955, 25747955, 25747955, 25746724, 20]),
-        ("PhilPapers", [49389, 39175, 39175, 39175, 39175, 39175, 39175, 39128, 20]),
-    ]
-
-    for name, x_values in data_sources:
-        fig.add_trace(
-            go.Funnel(
-                name=name,
-                orientation="h",
-                y=filter_names,
-                x=x_values,
-                textinfo="value+percent total",
-                textposition="inside",
-            )
-        )
-
-    fig.update_layout(height=500, plot_bgcolor="rgba(0,0,0,0)")
-    return fig
-
-
-def get_chart_new():
-    fig = go.Figure()
-    filter_names = [
-        "Download",
-        "Language",
-        "Min word count",
-        "Title Abstract",
-        "Majority language",
-        "Paragraph count",
-        "Frequency",
-        "Unigram log probability",
-        "Local dedup",
-    ]
-
-    data_sources = [
-        ("Wikipedia", [61614907, 0, 1146416, 0, 0, 0, 0, 0, 20]),
-        ("Freelaw", [75971288, 2280522, 5518932, 0, 0, 0, 0, 48660, 20]),
-        ("DM Maths", [112559888, 0, 0, 0, 0, 0, 0, 0, 20]),
-        ("USPTO", [6880276, 1312, 129042, 0, 0, 0, 0, 533, 20]),
-        ("PG19", [28752, 69, 1, 0, 0, 0, 0, 50, 20]),
-        ("Hackernews", [2064931, 54129, 314, 0, 0, 0, 0, 6852, 20]),
-        ("Ubuntu IRC", [37966, 14465, 33, 0, 0, 0, 0, 263, 20]),
-        ("Europarl", [69814, 0, 0, 0, 0, 0, 0, 0, 20]),
-        ("StackExchange", [23246548, 0, 196, 0, 0, 0, 0, 0, 20]),
-        ("Arxiv", [1911867, 42426, 105601, 0, 0, 0, 0, 1179, 20]),
-        ("S2ORC", [12963563, 0, 0, 2232450, 1275493, 148804, 1251669, 0, 20]),
-        ("S2ORC Abstract", [102324176, 18456575, 978308, 0, 0, 0, 0, 111381, 20]),
-        ("PubMed Central", [5230932, 400446, 62176, 0, 0, 0, 0, 836, 20]),
-        ("PubMed Central Abstract", [25787474, 3100, 36419, 0, 0, 0, 0, 1231, 20]),
-        ("PhilPapers", [49389, 10214, 0, 0, 0, 0, 0, 47, 20]),
-    ]
-
-    for name, x_values in data_sources:
-        fig.add_trace(
-            go.Funnel(
-                name=name,
-                orientation="h",
-                y=filter_names,
-                x=x_values,
-                textinfo="value+percent total",
-                textposition="inside",
-            )
-        )
-
-    fig.update_layout(height=500, plot_bgcolor="rgba(0,0,0,0)")
-    return fig
-
 def update(target: str, request):
     params = request.query_params
     if data_source := params.get(f"data_source_{target}"):
@@ -749,113 +654,6 @@ def update(target: str, request):
     return get_data(
         params.get(f"data_source_{target}"), doc_id, target)

-
-# Creating the dataframe from the provided table data
-data = {
-    'Dataset': ['Wikipedia', 'Freelaw', 'DM Maths', 'USPTO', 'PG19', 'Hackernews', 'Ubuntu IRC', 'Europarl',
-                'StackExchange', 'Arxiv', 'S2ORC', 'S2ORC Abstract', 'Pubmed Central', 'Pubmed Abstract', 'Phil Papers'],
-    'Downloaded Lines': [61614907, 75971288, 112559888, 6880276, 28752, 2064931, 37966, 69814, 23246548, 1911867,
-                         12963563, 102324176, 5230932, 25787474, 49389],
-    'Language Filter': [0, 2280522, 0, 1312, 69, 54129, 14465, 0, 0, 42426, 0, 18456575, 400446, 3100, 10214],
-    'Min Word Count': [1146416, 5518932, 0, 129042, 1, 314, 33, 0, 196, 105601, 0, 978308, 62176, 36419, 0],
-    'Unigram log probability': [60468491, 68171834, 112559888, 6749922, 28682, 2010488, 23468, 69814, 23246352,
-                                1763840, 12963563, 82889293, 4768310, 25747955, 39175],
-    'Total Lines Remaining': [60468491, 68123174, 112559888, 6749389, 28632, 2003636, 23205, 69814, 23246352,
-                              1762661, 12963563, 82777912, 4767474, 25746724, 39128]
-}
-
-df = pd.DataFrame(data)
-
-# Create the stacked bar chart
-fig = go.Figure()
-
-# Adding traces for each filter stage
-fig.add_trace(go.Bar(
-    name='Language Filter',
-    x=df['Dataset'],
-    y=df['Language Filter']
-))
-
-fig.add_trace(go.Bar(
-    name='Min Word Count Filter',
-    x=df['Dataset'],
-    y=df['Min Word Count']
-))
-
-fig.add_trace(go.Bar(
-    name='Unigram log probability Filter',
-    x=df['Dataset'],
-    y=df['Unigram log probability']
-))
-
-fig.add_trace(go.Bar(
-    name='Total Lines Remaining',
-    x=df['Dataset'],
-    y=df['Total Lines Remaining']
-))
-
-# Update the layout
-fig.update_layout(
-    barmode='stack',
-    title='Stacked Bar Chart of Line Reductions by Dataset',
-    xaxis_title='Dataset',
-    yaxis_title='Number of Lines',
-    legend_title='Filters',
-    height=600,
-    width=1000
-)
-
-# Show the plot
-stacked_bar = fig
-
-# Aggregating the data for filters and datasets
-filter_data = {
-    'Filter': ['Language Filter', 'Min Word Count', 'Unigram log probability', 'Total Lines Remaining'],
-    'Wikipedia': [0, 1146416, 60468491, 60468491],
-    'Freelaw': [2280522, 5518932, 68171834, 68123174],
-    'DM Maths': [0, 0, 112559888, 112559888],
-    'USPTO': [1312, 129042, 6749922, 6749389],
-    'PG19': [69, 1, 28682, 28632],
-    'Hackernews': [54129, 314, 2010488, 2003636],
-    'Ubuntu IRC': [14465, 33, 23468, 23205],
-    'Europarl': [0, 0, 69814, 69814],
-    'StackExchange': [0, 196, 23246352, 23246352],
-    'Arxiv': [42426, 105601, 1763840, 1762661],
-    'S2ORC': [0, 0, 12963563, 12963563],
-    'S2ORC Abstract': [18456575, 978308, 82889293, 82777912],
-    'Pubmed Central': [400446, 62176, 4768310, 4767474],
-    'Pubmed Abstract': [3100, 36419, 25747955, 25746724],
-    'Phil Papers': [10214, 0, 39175, 39128]
-}
-
-# Creating a new dataframe for the filter data
-filter_df = pd.DataFrame(filter_data)
-
-# Creating the stacked bar chart
-fig = go.Figure()
-
-# Add trace for each dataset
-for dataset in filter_df.columns[1:]:
-    fig.add_trace(go.Bar(
-        name=dataset,
-        x=filter_df['Filter'],
-        y=filter_df[dataset]
-    ))
-
-# Update the layout
-fig.update_layout(
-    barmode='stack',
-    title='Stacked Bar Chart of Filters for Each Dataset',
-    xaxis_title='Filter',
-    yaxis_title='Number of Lines',
-    legend_title='Dataset',
-    height=600,
-    width=1000
-)
-
-# Show the plot
-diff_stacked_bar = fig
-
 # Data for the stacked bar chart
 data = {
     'Filter': ['Downloaded Lines', 'Language Filter', 'Min Word Count', 'Unigram Log Probability'],
@@ -1037,10 +835,10 @@ def curated(request):
         H2("Curated Sources Defined"),
         table_desc,
         data_preprocessing_div,
-        plotly2fasthtml(get_chart_28168342()),
-        plotly2fasthtml(get_chart_new()),
-        plotly2fasthtml(stacked_bar),
-        plotly2fasthtml(diff_stacked_bar),
+        # plotly2fasthtml(get_chart_28168342()),
+        # plotly2fasthtml(get_chart_new()),
+        # plotly2fasthtml(stacked_bar),
+        # plotly2fasthtml(diff_stacked_bar),
         plotly2fasthtml(diff2_stacked_bar),
         H2("Curated Sources Processing"),
         filtering_process,
 