omkarenator committed on
Commit
0698fac
1 Parent(s): 9a127b5

fix curated page layout. remove fixed width of the chart

Browse files
Files changed (1) hide show
  1. curated.py +39 -44
curated.py CHANGED
@@ -579,54 +579,49 @@ data_preprocessing_div = Div(
579
  ),
580
  )
581
 
582
- # Data for the stacked bar chart
583
- data = {
584
- "Filter": [
585
- "Downloaded Lines",
586
- "Language Filter",
587
- "Min Word Count",
588
- "Unigram Log Probability",
589
- ],
590
- "Wikipedia": [61614907, 61614907, 60468491, 60468491],
591
- "Freelaw": [75971288, 73690766, 68171834, 68123174],
592
- "DM Maths": [112559888, 112559888, 112559888, 112559888],
593
- "USPTO": [6880276, 6878964, 6749922, 6749389],
594
- "PG19": [28752, 28683, 28682, 28632],
595
- "Hackernews": [2064931, 2010802, 2010488, 2003636],
596
- "Ubuntu IRC": [37966, 23501, 23468, 23205],
597
- "Europarl": [69814, 69814, 69814, 69814],
598
- "StackExchange": [23246548, 23246548, 23246352, 23246352],
599
- "Arxiv": [1911867, 1869441, 1763840, 1762661],
600
- "S2ORC": [12963563, 12963563, 12963563, 12963563],
601
- "S2ORC Abstract": [102324176, 83867601, 82889293, 82777912],
602
- "Pubmed Central": [5230932, 4830486, 4768310, 4767474],
603
- "Pubmed Abstract": [25787474, 25784374, 25747955, 25746724],
604
- "Phil Papers": [49389, 39175, 39175, 39128],
605
- }
606
 
607
- # Creating a dataframe
608
- df = pd.DataFrame(data)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
609
 
610
- # Creating the stacked bar chart
611
- fig = go.Figure()
612
 
613
- # Add trace for each dataset
614
- for dataset in df.columns[1:]:
615
- fig.add_trace(go.Bar(name=dataset, x=df["Filter"], y=df[dataset]))
616
 
617
- # Update the layout
618
- fig.update_layout(
619
- barmode="stack",
620
- title="Document Reduction by Filter for Each Dataset",
621
- xaxis_title="Filter",
622
- yaxis_title="Number of Lines",
623
- legend_title="Dataset",
624
- height=600,
625
- width=1000,
626
- )
627
 
628
- # Show the plot
629
- diff2_stacked_bar = fig
 
 
 
 
 
 
 
630
 
631
 
632
  filtering_process = Div(
@@ -635,7 +630,7 @@ filtering_process = Div(
635
  P(
636
  "Below is a detail recount of how each dataset was extracted and filtered. If specific challenges were found with a dataset, they are included and discussed to the best of our abilities. The figure below provides a global view of the document filtering results. ~8% of documents were removed during these three steps."
637
  ),
638
- plotly2fasthtml(diff2_stacked_bar),
639
  H3(
640
  "This section continues below with the specific filtering steps taken for all 14 curated datasets."
641
  ),
 
579
  ),
580
  )
581
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
582
 
583
+ def diff2_stacked_bar():
584
+ # Data for the stacked bar chart
585
+ data = {
586
+ "Filter": [
587
+ "Downloaded Lines",
588
+ "Language Filter",
589
+ "Min Word Count",
590
+ "Unigram Log Probability",
591
+ ],
592
+ "Wikipedia": [61614907, 61614907, 60468491, 60468491],
593
+ "Freelaw": [75971288, 73690766, 68171834, 68123174],
594
+ "DM Maths": [112559888, 112559888, 112559888, 112559888],
595
+ "USPTO": [6880276, 6878964, 6749922, 6749389],
596
+ "PG19": [28752, 28683, 28682, 28632],
597
+ "Hackernews": [2064931, 2010802, 2010488, 2003636],
598
+ "Ubuntu IRC": [37966, 23501, 23468, 23205],
599
+ "Europarl": [69814, 69814, 69814, 69814],
600
+ "StackExchange": [23246548, 23246548, 23246352, 23246352],
601
+ "Arxiv": [1911867, 1869441, 1763840, 1762661],
602
+ "S2ORC": [12963563, 12963563, 12963563, 12963563],
603
+ "S2ORC Abstract": [102324176, 83867601, 82889293, 82777912],
604
+ "Pubmed Central": [5230932, 4830486, 4768310, 4767474],
605
+ "Pubmed Abstract": [25787474, 25784374, 25747955, 25746724],
606
+ "Phil Papers": [49389, 39175, 39175, 39128],
607
+ }
608
 
609
+ df = pd.DataFrame(data)
 
610
 
611
+ fig = go.Figure()
 
 
612
 
613
+ for dataset in df.columns[1:]:
614
+ fig.add_trace(go.Bar(name=dataset, x=df["Filter"], y=df[dataset]))
 
 
 
 
 
 
 
 
615
 
616
+ fig.update_layout(
617
+ barmode="stack",
618
+ title="Document Reduction by Filter for Each Dataset",
619
+ xaxis_title="Filter",
620
+ yaxis_title="Number of Lines",
621
+ legend_title="Dataset",
622
+ height=600,
623
+ )
624
+ return fig
625
 
626
 
627
  filtering_process = Div(
 
630
  P(
631
  "Below is a detail recount of how each dataset was extracted and filtered. If specific challenges were found with a dataset, they are included and discussed to the best of our abilities. The figure below provides a global view of the document filtering results. ~8% of documents were removed during these three steps."
632
  ),
633
+ plotly2fasthtml(diff2_stacked_bar()),
634
  H3(
635
  "This section continues below with the specific filtering steps taken for all 14 curated datasets."
636
  ),