Spaces:
Running
Running
victormiller
committed on
Commit
•
10a8615
1
Parent(s):
d293ab8
Update curated.py
Browse files- curated.py +48 -0
curated.py
CHANGED
@@ -694,6 +694,53 @@ def get_chart_28168342():
|
|
694 |
return fig
|
695 |
|
696 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
697 |
def update(target: str, request):
|
698 |
params = request.query_params
|
699 |
if data_source := params.get(f"data_source_{target}"):
|
@@ -836,6 +883,7 @@ def curated(request):
|
|
836 |
table_desc,
|
837 |
data_preprocessing_div,
|
838 |
plotly2fasthtml(get_chart_28168342()),
|
|
|
839 |
H2("Curated Sources Processing"),
|
840 |
filtering_process,
|
841 |
data_preparation_div,
|
|
|
694 |
return fig
|
695 |
|
696 |
|
697 |
+
def get_chart_new():
    """Build a horizontal funnel chart with one trace per curated data source.

    Each trace plots nine values against the same nine filter-stage labels
    ("Download" through "Local dedup").

    NOTE(review): every series ends with 20 for "Local dedup" -- this looks
    like a placeholder value; confirm against the real pipeline numbers.
    NOTE(review): the per-stage values are not monotonically decreasing, so
    they may be per-filter counts rather than documents remaining -- confirm
    that a funnel is the intended presentation.

    Returns:
        The populated Plotly figure (height 500, transparent plot background).
    """
    stage_labels = [
        "Download",
        "Language",
        "Min word count",
        "Title Abstract",
        "Majority language",
        "Paragraph count",
        "Frequency",
        "Unigram log probability",
        "Local dedup",
    ]

    # Source name -> one value per entry of ``stage_labels`` (same order).
    # Dict insertion order fixes the order in which traces are added.
    counts_by_source = {
        "Wikipedia": [61614907, 0, 1146416, 0, 0, 0, 0, 0, 20],
        "Freelaw": [75971288, 2280522, 5518932, 0, 0, 0, 0, 48660, 20],
        "DM Maths": [112559888, 0, 0, 0, 0, 0, 0, 0, 20],
        "USPTO": [6880276, 1312, 129042, 0, 0, 0, 0, 533, 20],
        "PG19": [28752, 69, 1, 0, 0, 0, 0, 50, 20],
        "Hackernews": [2064931, 54129, 314, 0, 0, 0, 0, 6852, 20],
        "Ubuntu IRC": [37966, 14465, 33, 0, 0, 0, 0, 263, 20],
        "Europarl": [69814, 0, 0, 0, 0, 0, 0, 0, 20],
        "StackExchange": [23246548, 0, 196, 0, 0, 0, 0, 0, 20],
        "Arxiv": [1911867, 42426, 105601, 0, 0, 0, 0, 1179, 20],
        "S2ORC": [12963563, 0, 0, 2232450, 1275493, 148804, 1251669, 0, 20],
        "S2ORC Abstract": [102324176, 18456575, 978308, 0, 0, 0, 0, 111381, 20],
        "PubMed Central": [5230932, 400446, 62176, 0, 0, 0, 0, 836, 20],
        "PubMed Central Abstract": [25787474, 3100, 36419, 0, 0, 0, 0, 1231, 20],
        "PhilPapers": [49389, 10214, 0, 0, 0, 0, 0, 47, 20],
    }

    # Settings shared by every funnel trace.
    shared_trace_options = {
        "orientation": "h",
        "y": stage_labels,
        "textinfo": "value+percent total",
        "textposition": "inside",
    }

    chart = go.Figure()
    for source_name, stage_counts in counts_by_source.items():
        trace = go.Funnel(name=source_name, x=stage_counts, **shared_trace_options)
        chart.add_trace(trace)

    chart.update_layout(height=500, plot_bgcolor="rgba(0,0,0,0)")
    return chart
|
743 |
+
|
744 |
def update(target: str, request):
|
745 |
params = request.query_params
|
746 |
if data_source := params.get(f"data_source_{target}"):
|
|
|
883 |
table_desc,
|
884 |
data_preprocessing_div,
|
885 |
plotly2fasthtml(get_chart_28168342()),
|
886 |
+
plotly2fasthtml(get_chart_new()),
|
887 |
H2("Curated Sources Processing"),
|
888 |
filtering_process,
|
889 |
data_preparation_div,
|