omkarenator committed on
Commit
888beee
1 Parent(s): b6c56e9

add more stuff

Browse files
Files changed (1) hide show
  1. main.py +79 -10
main.py CHANGED
@@ -346,23 +346,92 @@ def curated(request):
346
  )
347
 
348
  table_html = data_preparation_steps.to_html(index=False, border=0)
349
- table_div = Div(NotStr(table_html), cls="l-body-outset")
350
 
351
- expander = Details(
352
- Summary("Raw Data Extraction"),
353
- get_data(),
354
- style="border: 1px solid #ccc; padding: 20px;",
355
- open=True,
 
 
 
 
 
 
 
 
 
 
 
 
356
  )
357
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
358
  return Div(
359
  Section(
360
  H2("Curated Sources"),
361
  plotly2fasthtml(get_chart_28168342()),
362
- H3("Data Preparation"),
363
- table_div,
364
- H3("Data Preprocessing"),
365
- expander,
366
  id="inner-text",
367
  )
368
  )
 
346
  )
347
 
348
  table_html = data_preparation_steps.to_html(index=False, border=0)
349
+ table_div = Div(NotStr(table_html), style="margin: 40px;")
350
 
351
+ text = P("""This initial stage serves as the foundation for the entire
352
+ process. Here, we focus on acquiring and extracting the raw data, which can
353
+ come from various sources such as crawling websites, using HTTP/FTP dumps,
354
+ or working with archive dumps. For instance, to download and prepare a
355
+ dataset, we can specific downloaders based on the data source. Each dataset
356
+ might have its own downloader script which can be updated in real time to
357
+ handle changes in the data source. Here is a general outline of the data
358
+ preparation process: It's worth noting that some pipelines might require
359
+ invoking additional functions or scripts to handle specific data sources or
360
+ formats. These helper scripts can be located within specific directories
361
+ or modules dedicated to the dataset.""")
362
+
363
+ data_preparation_div = Div(
364
+ H3("Data Preparation"),
365
+ text,
366
+ table_div,
367
+ Div(get_data(), style="border: 1px solid #ccc; padding: 20px;"),
368
  )
369
 
370
+ text = P("""Data preprocessing is a crucial step in the data science
371
+ pipeline. It involves cleaning and transforming raw data into a format that
372
+ is suitable for analysis. This process includes handling missing values,
373
+ normalizing data, encoding categorical variables, and more.""")
374
+
375
+ preprocessing_steps = pd.DataFrame(
376
+ {
377
+ "Step": [
378
+ "Language Filter",
379
+ "Min Word Count",
380
+ "Title Abstract",
381
+ "Majority Language",
382
+ "Paragraph Count",
383
+ "Frequency",
384
+ "Unigram Log Probability",
385
+ ],
386
+ "Description": [
387
+ "Filtering data based on language",
388
+ "Setting a minimum word count threshold",
389
+ "Extracting information from the title and abstract",
390
+ "Identifying the majority language in the dataset",
391
+ "Counting the number of paragraphs in each document",
392
+ "Calculating the frequency of each word in the dataset",
393
+ "Calculating the log probability of each unigram",
394
+ ],
395
+ "Need": [
396
+ "To remove documents in unwanted languages",
397
+ "To filter out documents with very few words",
398
+ "To extract relevant information for analysis",
399
+ "To understand the distribution of languages in the dataset",
400
+ "To analyze the structure and length of documents",
401
+ "To identify important words in the dataset",
402
+ "To measure the significance of individual words",
403
+ ],
404
+ "Pros": [
405
+ "Improves data quality by removing irrelevant documents",
406
+ "Filters out low-quality or incomplete documents",
407
+ "Provides additional information for analysis",
408
+ "Enables language-specific analysis and insights",
409
+ "Helps understand the complexity and content of documents",
410
+ "Identifies important terms and topics in the dataset",
411
+ "Quantifies the importance of individual words",
412
+ ],
413
+ "Cons": [
414
+ "May exclude documents in less common languages",
415
+ "May remove documents with valuable information",
416
+ "May introduce bias in the analysis",
417
+ "May not accurately represent the language distribution",
418
+ "May not capture the complexity of document structure",
419
+ "May be sensitive to noise and outliers",
420
+ "May not capture the semantic meaning of words",
421
+ ],
422
+ }
423
+ )
424
+
425
+ table_html = preprocessing_steps.to_html(index=False, border=0)
426
+ table_div = Div(NotStr(table_html), style="margin: 40px;")
427
+ data_preprocessing_div = Div(H3("Data Preprocessing"), text, table_div)
428
+
429
  return Div(
430
  Section(
431
  H2("Curated Sources"),
432
  plotly2fasthtml(get_chart_28168342()),
433
+ data_preparation_div,
434
+ data_preprocessing_div,
 
 
435
  id="inner-text",
436
  )
437
  )