|
from fasthtml.common import *
from fasthtml.components import *
from plotly import graph_objects as go
from fh_plotly import plotly2fasthtml
import pandas as pd
import json

from data_viewer import view_data, gen_random_id


data_sources = [
    "Freelaw",
    "Wikipedia",
    "PhilPapers",
    "Arxiv",
    "S2ORC",
    "S2ORC Abstract",
    "Pubmed",
    "USPTO",
    "Hackernews",
    "Ubuntu IRC",
    "StackExchange",
    "DM Maths",
    "PG19",
    "Europarl",
]


def get_data(data_source: str = "Freelaw", doc_id: int = 3, target: str = "foo"):
    # Each curated sample file holds 10 documents; clamp the index to 0..9.
    doc_id = max(0, min(int(doc_id), 9))

    # Map each source to its (raw, extracted) sample files. Sources with a
    # single entry use the same file for both views.
    sample_files = {
        "Freelaw": ("freelaw_raw.json", "freelaw_extract.json"),
        "Wikipedia": ("wiki.json",),
        "StackExchange": ("stackexchange_raw.json", "stackexchange_extract.json"),
        "PhilPapers": ("philpapers_raw.json",),
        "Arxiv": ("arxiv_raw.json", "arxiv_extract.json"),
        "S2ORC": ("s2orc_raw.json",),
        "S2ORC Abstract": ("s2orc_abstract_raw.json",),
        "Pubmed": ("pubmed_raw.json", "pubmed_extract.json"),
        "DM Maths": ("dm_maths_raw.json", "dm_maths_extract.json"),
        "PG19": ("pg19_raw.json",),
        "Europarl": ("europarl_raw.json",),
    }

    def load(filename):
        # Use a context manager so file handles are closed promptly.
        with open(f"data/curated_samples/{filename}") as f:
            return json.load(f)

    if data_source in sample_files:
        files = sample_files[data_source]
        raw_sample_doc = load(files[0])
        extracted_sample_doc = raw_sample_doc if len(files) == 1 else load(files[1])
    else:
        # Sources without curated samples render as empty documents.
        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]

    raw_json = raw_sample_doc[doc_id]
    extracted_json = extracted_sample_doc[doc_id]
    return view_data(
        raw_json,
        extracted_json,
        doc_id=doc_id,
        data_source=data_source,
        data_sources=data_sources,
        target=target,
    )
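

# `get_data` reads pre-extracted sample files from data/curated_samples/.
# Producing those files is the acquisition step described on the page: each
# source typically has its own downloader script. A minimal sketch of such a
# downloader follows; `download_sample`, its URL argument, and the output
# path are illustrative assumptions, not part of this app's pipeline.
def download_sample(url: str, out_path: str):
    """Fetch a raw dump over HTTP and store it locally (sketch only)."""
    import urllib.request

    with urllib.request.urlopen(url) as resp, open(out_path, "wb") as f:
        f.write(resp.read())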
|
|
|
|
|
def get_chart_28168342():
    fig = go.Figure()
    filter_names = [
        "Download",
        "Language",
        "Min word count",
        "Title Abstract",
        "Majority language",
        "Paragraph count",
        "Frequency",
        "Unigram log probability",
        "Local dedup",
    ]

    # Documents surviving each filter stage, per source. Named `funnel_data`
    # to avoid shadowing the module-level `data_sources` list.
    funnel_data = [
        ("Wikipedia", [100, 90, 80, 70, 60, 50, 40, 30, 20]),
        ("Freelaw", [100, 90, 80, 70, 60, 50, 40, 20, 20]),
        ("DM Maths", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
        ("USPTO", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
        ("PG19", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
        ("Hackernews", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
        ("Ubuntu IRC", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
        ("Europarl", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
        ("StackExchange", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
        ("Arxiv", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
        ("S2ORC", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
        ("S2ORC Abstract", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
        ("PubMed Central", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
        ("PubMed Central Abstract", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
        ("PhilPapers", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
    ]

    for name, x_values in funnel_data:
        fig.add_trace(
            go.Funnel(
                name=name,
                orientation="h",
                y=filter_names,
                x=x_values,
                textinfo="value+percent total",
                textposition="inside",
            )
        )

    fig.update_layout(height=500, plot_bgcolor="rgba(0,0,0,0)")
    return fig
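

# The funnel above tracks how many documents survive each filtering stage.
# As an illustration of one stage, here is a minimal sketch of the
# "Unigram log probability" filter: score each document by the mean log
# probability of its tokens under a corpus unigram model and drop low
# scorers. The tokenization, smoothing, and any threshold are assumptions,
# not the pipeline's actual settings.
def mean_unigram_log_prob(doc: str, unigram_counts: dict, total_tokens: int) -> float:
    """Mean log probability of a document's tokens under a unigram model."""
    import math

    tokens = doc.split()
    if not tokens:
        return float("-inf")
    # Add-one smoothing so unseen tokens don't send the score to -inf.
    return sum(
        math.log((unigram_counts.get(t, 0) + 1) / (total_tokens + 1))
        for t in tokens
    ) / len(tokens)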
|
|
|
|
|
def update(target: str, request):
    params = request.query_params
    if data_source := params.get(f"data_source_{target}"):
        return get_data(data_source, params.get(f"doc_id_{target}", 3), target)
    if doc_id := params.get(f"doc_id_{target}"):
        # Fall back to the default source: passing None through would bypass
        # get_data's default argument and render empty documents.
        return get_data(
            params.get(f"data_source_{target}") or "Freelaw", doc_id, target
        )
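

# `update` is driven by per-target query parameters: a request such as
#   /update/abc123?data_source_abc123=Arxiv&doc_id_abc123=5
# re-renders the viewer for target "abc123". (Illustrative URL; the actual
# route path depends on how the app registers this handler.)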
|
|
|
|
|
def curated(request):
    data_preparation_steps = pd.DataFrame(
        {
            "Method": [
                "HTTP/FTP dumps",
                "Web crawling",
                "Archive snapshot",
                "Generated",
                "Curated",
            ],
            "Description": [
                "Acquiring data from HTTP/FTP dumps",
                "Crawling websites to extract data",
                "Working with archive dumps",
                "Generating synthetic data",
                "High quality curated data",
            ],
            "Source": [
                "Freelaw | Wikipedia | PhilPapers | Arxiv | S2ORC | Pubmed",
                "USPTO | Hackernews | Ubuntu IRC",
                "StackExchange",
                "DM Maths",
                "PG19 | Europarl",
            ],
        }
    )

    table_html = data_preparation_steps.to_html(index=False, border=0)
    table_div = Div(NotStr(table_html), style="margin: 40px;")

    text = P("""This initial stage serves as the foundation for the entire
    process. Here, we focus on acquiring and extracting the raw data, which can
    come from various sources such as crawling websites, using HTTP/FTP dumps,
    or working with archive dumps. For instance, to download and prepare a
    dataset, we can use specific downloaders based on the data source. Each
    dataset might have its own downloader script, which can be updated in real
    time to handle changes in the data source. The table below gives a general
    outline of the acquisition method used for each source. It's worth noting
    that some pipelines might require invoking additional functions or scripts
    to handle specific data sources or formats. These helper scripts can be
    located within specific directories or modules dedicated to the dataset.""")

    data_preparation_div = Div(
        H3("Data Preparation"),
        text,
        table_div,
        Div(
            get_data(target=gen_random_id()),
            style="border: 1px solid #ccc; padding: 20px;",
        ),
    )

    text = P("""Data preprocessing is a crucial step in the data science
    pipeline. It involves cleaning and transforming raw data into a format
    suitable for analysis, including handling missing values, normalizing
    data, encoding categorical variables, and more. The filters applied to
    the curated sources are summarized in the table below.""")

    preprocessing_steps = pd.DataFrame(
        {
            "Step": [
                "Language Filter",
                "Min Word Count",
                "Title Abstract",
                "Majority Language",
                "Paragraph Count",
                "Frequency",
                "Unigram Log Probability",
            ],
            "Description": [
                "Filtering data based on language",
                "Setting a minimum word count threshold",
                "Extracting information from the title and abstract",
                "Identifying the majority language in the dataset",
                "Counting the number of paragraphs in each document",
                "Calculating the frequency of each word in the dataset",
                "Calculating the log probability of each unigram",
            ],
            "Need": [
                "To remove documents in unwanted languages",
                "To filter out documents with very few words",
                "To extract relevant information for analysis",
                "To understand the distribution of languages in the dataset",
                "To analyze the structure and length of documents",
                "To identify important words in the dataset",
                "To measure the significance of individual words",
            ],
            "Pros": [
                "Improves data quality by removing irrelevant documents",
                "Filters out low-quality or incomplete documents",
                "Provides additional information for analysis",
                "Enables language-specific analysis and insights",
                "Helps understand the complexity and content of documents",
                "Identifies important terms and topics in the dataset",
                "Quantifies the importance of individual words",
            ],
            "Cons": [
                "May exclude documents in less common languages",
                "May remove documents with valuable information",
                "May introduce bias in the analysis",
                "May not accurately represent the language distribution",
                "May not capture the complexity of document structure",
                "May be sensitive to noise and outliers",
                "May not capture the semantic meaning of words",
            ],
        }
    )

    table_html = preprocessing_steps.to_html(index=False, border=0)
    table_div = Div(NotStr(table_html), style="margin: 40px;")
    data_preprocessing_div = Div(H3("Data Preprocessing"), text, table_div)

    return Div(
        Section(
            H2("Curated Sources"),
            plotly2fasthtml(get_chart_28168342()),
            data_preparation_div,
            data_preprocessing_div,
            id="inner-text",
        )
    )
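

# `curated` returns a component tree rather than a full app. A minimal
# sketch of serving it with FastHTML follows; the app setup, route paths,
# and use of `serve()` are assumptions, not taken from this file:
#
#   app, rt = fast_app()
#
#   @rt("/curated")
#   def get(request):
#       return curated(request)
#
#   @rt("/update/{target}")
#   def get_update(target: str, request):
#       return update(target, request)
#
#   serve()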
|
|