victormiller committed on
Commit
858c4bf
1 Parent(s): 10a8615

Update curated.py

Browse files
Files changed (1) hide show
  1. curated.py +60 -1
curated.py CHANGED
@@ -9,7 +9,6 @@ from rich import print
9
  import uuid
10
  import plotly.express as px
11
 
12
-
13
  overview = Div(
14
  H2("Curated Source Processing Overview"),
15
  H3("What This Section Contains"),
@@ -751,6 +750,65 @@ def update(target: str, request):
751
  params.get(f"data_source_{target}"), doc_id, target)
752
 
753
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
754
  def curated(request):
755
 
756
  # Partial Updates
@@ -884,6 +942,7 @@ def curated(request):
884
  data_preprocessing_div,
885
  plotly2fasthtml(get_chart_28168342()),
886
  plotly2fasthtml(get_chart_new()),
 
887
  H2("Curated Sources Processing"),
888
  filtering_process,
889
  data_preparation_div,
 
9
  import uuid
10
  import plotly.express as px
11
 
 
12
  overview = Div(
13
  H2("Curated Source Processing Overview"),
14
  H3("What This Section Contains"),
 
750
  params.get(f"data_source_{target}"), doc_id, target)
751
 
752
 
753
+ # Creating the dataframe from the provided table data
754
+ data = {
755
+ 'Dataset': ['Wikipedia', 'Freelaw', 'DM Maths', 'USPTO', 'PG19', 'Hackernews', 'Ubuntu IRC', 'Europarl',
756
+ 'StackExchange', 'Arxiv', 'S2ORC', 'S2ORC Abstract', 'Pubmed Central', 'Pubmed Abstract', 'Phil Papers'],
757
+ 'Downloaded Lines': [61614907, 75971288, 112559888, 6880276, 28752, 2064931, 37966, 69814, 23246548, 1911867,
758
+ 12963563, 102324176, 5230932, 25787474, 49389],
759
+ 'Language Filter': [0, 2280522, 0, 1312, 69, 54129, 14465, 0, 0, 42426, 0, 18456575, 400446, 3100, 10214],
760
+ 'Min Word Count': [1146416, 5518932, 0, 129042, 1, 314, 33, 0, 196, 105601, 0, 978308, 62176, 36419, 0],
761
+ 'Unigram log probability': [60468491, 68171834, 112559888, 6749922, 28682, 2010488, 23468, 69814, 23246352,
762
+ 1763840, 12963563, 82889293, 4768310, 25747955, 39175],
763
+ 'Total Lines Remaining': [60468491, 68123174, 112559888, 6749389, 28632, 2003636, 23205, 69814, 23246352,
764
+ 1762661, 12963563, 82777912, 4767474, 25746724, 39128]
765
+ }
766
+
767
+ df = pd.DataFrame(data)
768
+
769
+ # Create the stacked bar chart
770
+ fig = go.Figure()
771
+
772
+ # Adding traces for each filter stage
773
+ fig.add_trace(go.Bar(
774
+ name='Language Filter',
775
+ x=df['Dataset'],
776
+ y=df['Language Filter']
777
+ ))
778
+
779
+ fig.add_trace(go.Bar(
780
+ name='Min Word Count Filter',
781
+ x=df['Dataset'],
782
+ y=df['Min Word Count']
783
+ ))
784
+
785
+ fig.add_trace(go.Bar(
786
+ name='Unigram log probability Filter',
787
+ x=df['Dataset'],
788
+ y=df['Unigram log probability']
789
+ ))
790
+
791
+ fig.add_trace(go.Bar(
792
+ name='Total Lines Remaining',
793
+ x=df['Dataset'],
794
+ y=df['Total Lines Remaining']
795
+ ))
796
+
797
+ # Update the layout
798
+ fig.update_layout(
799
+ barmode='stack',
800
+ title='Stacked Bar Chart of Line Reductions by Dataset',
801
+ xaxis_title='Dataset',
802
+ yaxis_title='Number of Lines',
803
+ legend_title='Filters',
804
+ height=600,
805
+ width=1000
806
+ )
807
+
808
+ # Keep a module-level reference to the figure (rendered later via plotly2fasthtml)
809
+ stacked_bar = fig
810
+
811
+
812
  def curated(request):
813
 
814
  # Partial Updates
 
942
  data_preprocessing_div,
943
  plotly2fasthtml(get_chart_28168342()),
944
  plotly2fasthtml(get_chart_new()),
945
+ plotly2fasthtml(stacked_bar),
946
  H2("Curated Sources Processing"),
947
  filtering_process,
948
  data_preparation_div,