victormiller committed on
Commit: f9cb337
1 Parent(s): 0cd5893

Update curated.py

Files changed (1): curated.py (+4, -206)
curated.py CHANGED
@@ -645,101 +645,6 @@ def get_data(data_source: str = "Freelaw", doc_id: int = 3, target: str = "foo")
     )


-def get_chart_28168342():
-    fig = go.Figure()
-    filter_names = [
-        "Download",
-        "Language",
-        "Min word count",
-        "Title Abstract",
-        "Majority language",
-        "Paragraph count",
-        "Frequency",
-        "Unigram log probability",
-        "Local dedup",
-    ]
-
-    data_sources = [
-        ("Wikipedia", [61614907, 61614907, 60468491, 60468491, 60468491, 60468491, 60468491, 60468491, 20]),
-        ("Freelaw", [75971288, 73690766, 68171834, 68171834, 68171834, 68171834, 68171834, 68123174, 20]),
-        ("DM Maths", [112559888, 112559888, 112559888, 112559888, 112559888, 112559888, 112559888, 112559888, 20]),
-        ("USPTO", [6880276, 6878964, 6749922, 6749922, 6749922, 6749922, 6749922, 6749389, 20]),
-        ("PG19", [28752, 28683, 28682, 28682, 28682, 28682, 28682, 28632, 20]),
-        ("Hackernews", [2064931, 2010802, 2010488, 2010488, 2010488, 2010488, 2010488, 2003636, 20]),
-        ("Ubuntu IRC", [37966, 23501, 23468, 23468, 23468, 23468, 23468, 23205, 20]),
-        ("Europarl", [69814, 69814, 69814, 69814, 69814, 69814, 69814, 69814, 20]),
-        ("StackExchange", [23246548, 23246548, 23246352, 23246352, 23246352, 23246352, 23246352, 23246352, 20]),
-        ("Arxiv", [1911867, 1869441, 1763840, 1763840, 1763840, 1763840, 1763840, 1762661, 20]),
-        ("S2ORC", [12963563, 12963563, 12963563, 10731113, 9455620, 9306816, 8055147, 8055147, 20]),
-        ("S2ORC Abstract", [102324176, 83867601, 82889293, 82889293, 82889293, 82889293, 82889293, 82777912, 20]),
-        ("PubMed Central", [5230932, 4830486, 4768310, 4768310, 4768310, 4768310, 4768310, 4767474, 20]),
-        ("PubMed Central Abstract", [25787474, 25784374, 25747955, 25747955, 25747955, 25747955, 25747955, 25746724, 20]),
-        ("PhilPapers", [49389, 39175, 39175, 39175, 39175, 39175, 39175, 39128, 20]),
-    ]
-
-    for name, x_values in data_sources:
-        fig.add_trace(
-            go.Funnel(
-                name=name,
-                orientation="h",
-                y=filter_names,
-                x=x_values,
-                textinfo="value+percent total",
-                textposition="inside",
-            )
-        )
-
-    fig.update_layout(height=500, plot_bgcolor="rgba(0,0,0,0)")
-    return fig
-
-
-def get_chart_new():
-    fig = go.Figure()
-    filter_names = [
-        "Download",
-        "Language",
-        "Min word count",
-        "Title Abstract",
-        "Majority language",
-        "Paragraph count",
-        "Frequency",
-        "Unigram log probability",
-        "Local dedup",
-    ]
-
-    data_sources = [
-        ("Wikipedia", [61614907, 0, 1146416, 0, 0, 0, 0, 0, 20]),
-        ("Freelaw", [75971288, 2280522, 5518932, 0, 0, 0, 0, 48660, 20]),
-        ("DM Maths", [112559888, 0, 0, 0, 0, 0, 0, 0, 20]),
-        ("USPTO", [6880276, 1312, 129042, 0, 0, 0, 0, 533, 20]),
-        ("PG19", [28752, 69, 1, 0, 0, 0, 0, 50, 20]),
-        ("Hackernews", [2064931, 54129, 314, 0, 0, 0, 0, 6852, 20]),
-        ("Ubuntu IRC", [37966, 14465, 33, 0, 0, 0, 0, 263, 20]),
-        ("Europarl", [69814, 0, 0, 0, 0, 0, 0, 0, 20]),
-        ("StackExchange", [23246548, 0, 196, 0, 0, 0, 0, 0, 20]),
-        ("Arxiv", [1911867, 42426, 105601, 0, 0, 0, 0, 1179, 20]),
-        ("S2ORC", [12963563, 0, 0, 2232450, 1275493, 148804, 1251669, 0, 20]),
-        ("S2ORC Abstract", [102324176, 18456575, 978308, 0, 0, 0, 0, 111381, 20]),
-        ("PubMed Central", [5230932, 400446, 62176, 0, 0, 0, 0, 836, 20]),
-        ("PubMed Central Abstract", [25787474, 3100, 36419, 0, 0, 0, 0, 1231, 20]),
-        ("PhilPapers", [49389, 10214, 0, 0, 0, 0, 0, 47, 20]),
-    ]
-
-    for name, x_values in data_sources:
-        fig.add_trace(
-            go.Funnel(
-                name=name,
-                orientation="h",
-                y=filter_names,
-                x=x_values,
-                textinfo="value+percent total",
-                textposition="inside",
-            )
-        )
-
-    fig.update_layout(height=500, plot_bgcolor="rgba(0,0,0,0)")
-    return fig
-
 def update(target: str, request):
     params = request.query_params
     if data_source := params.get(f"data_source_{target}"):
@@ -749,113 +654,6 @@ def update(target: str, request):
     return get_data(
         params.get(f"data_source_{target}"), doc_id, target)

-
-# Creating the dataframe from the provided table data
-data = {
-    'Dataset': ['Wikipedia', 'Freelaw', 'DM Maths', 'USPTO', 'PG19', 'Hackernews', 'Ubuntu IRC', 'Europarl',
-                'StackExchange', 'Arxiv', 'S2ORC', 'S2ORC Abstract', 'Pubmed Central', 'Pubmed Abstract', 'Phil Papers'],
-    'Downloaded Lines': [61614907, 75971288, 112559888, 6880276, 28752, 2064931, 37966, 69814, 23246548, 1911867,
-                         12963563, 102324176, 5230932, 25787474, 49389],
-    'Language Filter': [0, 2280522, 0, 1312, 69, 54129, 14465, 0, 0, 42426, 0, 18456575, 400446, 3100, 10214],
-    'Min Word Count': [1146416, 5518932, 0, 129042, 1, 314, 33, 0, 196, 105601, 0, 978308, 62176, 36419, 0],
-    'Unigram log probability': [60468491, 68171834, 112559888, 6749922, 28682, 2010488, 23468, 69814, 23246352,
-                                1763840, 12963563, 82889293, 4768310, 25747955, 39175],
-    'Total Lines Remaining': [60468491, 68123174, 112559888, 6749389, 28632, 2003636, 23205, 69814, 23246352,
-                              1762661, 12963563, 82777912, 4767474, 25746724, 39128]
-}
-
-df = pd.DataFrame(data)
-
-# Create the stacked bar chart
-fig = go.Figure()
-
-# Adding traces for each filter stage
-fig.add_trace(go.Bar(
-    name='Language Filter',
-    x=df['Dataset'],
-    y=df['Language Filter']
-))
-
-fig.add_trace(go.Bar(
-    name='Min Word Count Filter',
-    x=df['Dataset'],
-    y=df['Min Word Count']
-))
-
-fig.add_trace(go.Bar(
-    name='Unigram log probability Filter',
-    x=df['Dataset'],
-    y=df['Unigram log probability']
-))
-
-fig.add_trace(go.Bar(
-    name='Total Lines Remaining',
-    x=df['Dataset'],
-    y=df['Total Lines Remaining']
-))
-
-# Update the layout
-fig.update_layout(
-    barmode='stack',
-    title='Stacked Bar Chart of Line Reductions by Dataset',
-    xaxis_title='Dataset',
-    yaxis_title='Number of Lines',
-    legend_title='Filters',
-    height=600,
-    width=1000
-)
-
-# Show the plot
-stacked_bar = fig
-
-# Aggregating the data for filters and datasets
-filter_data = {
-    'Filter': ['Language Filter', 'Min Word Count', 'Unigram log probability', 'Total Lines Remaining'],
-    'Wikipedia': [0, 1146416, 60468491, 60468491],
-    'Freelaw': [2280522, 5518932, 68171834, 68123174],
-    'DM Maths': [0, 0, 112559888, 112559888],
-    'USPTO': [1312, 129042, 6749922, 6749389],
-    'PG19': [69, 1, 28682, 28632],
-    'Hackernews': [54129, 314, 2010488, 2003636],
-    'Ubuntu IRC': [14465, 33, 23468, 23205],
-    'Europarl': [0, 0, 69814, 69814],
-    'StackExchange': [0, 196, 23246352, 23246352],
-    'Arxiv': [42426, 105601, 1763840, 1762661],
-    'S2ORC': [0, 0, 12963563, 12963563],
-    'S2ORC Abstract': [18456575, 978308, 82889293, 82777912],
-    'Pubmed Central': [400446, 62176, 4768310, 4767474],
-    'Pubmed Abstract': [3100, 36419, 25747955, 25746724],
-    'Phil Papers': [10214, 0, 39175, 39128]
-}
-
-# Creating a new dataframe for the filter data
-filter_df = pd.DataFrame(filter_data)
-
-# Creating the stacked bar chart
-fig = go.Figure()
-
-# Add trace for each dataset
-for dataset in filter_df.columns[1:]:
-    fig.add_trace(go.Bar(
-        name=dataset,
-        x=filter_df['Filter'],
-        y=filter_df[dataset]
-    ))
-
-# Update the layout
-fig.update_layout(
-    barmode='stack',
-    title='Stacked Bar Chart of Filters for Each Dataset',
-    xaxis_title='Filter',
-    yaxis_title='Number of Lines',
-    legend_title='Dataset',
-    height=600,
-    width=1000
-)
-
-# Show the plot
-diff_stacked_bar = fig
-
 # Data for the stacked bar chart
 data = {
     'Filter': ['Downloaded Lines', 'Language Filter', 'Min Word Count', 'Unigram Log Probability'],
@@ -1037,10 +835,10 @@ def curated(request):
         H2("Curated Sources Defined"),
         table_desc,
         data_preprocessing_div,
-        plotly2fasthtml(get_chart_28168342()),
-        plotly2fasthtml(get_chart_new()),
-        plotly2fasthtml(stacked_bar),
-        plotly2fasthtml(diff_stacked_bar),
+        # plotly2fasthtml(get_chart_28168342()),
+        # plotly2fasthtml(get_chart_new()),
+        # plotly2fasthtml(stacked_bar),
+        # plotly2fasthtml(diff_stacked_bar),
         plotly2fasthtml(diff2_stacked_bar),
         H2("Curated Sources Processing"),
         filtering_process,
 