victormiller committed on
Commit
d4c2068
1 Parent(s): 51e13a8

Update curated.py

Browse files
Files changed (1) hide show
  1. curated.py +5 -280
curated.py CHANGED
@@ -46,19 +46,12 @@ treemap_data = {
46
  'Deep Mind Maths dataset with generated questions.'
47
  ]
48
  }
49
- # Calculate percentage for each data source
50
  total_count = sum(treemap_data['Count'])
51
  treemap_data['Percentage'] = [count / total_count * 100 for count in treemap_data['Count']]
52
-
53
- # Create treemap
54
  fig = px.treemap(treemap_data, path=['Category', 'Source'], values='Count', hover_data=['Details', 'Percentage'], hover_name='Source')
55
-
56
- # Set the size of the chart
57
-
58
-
59
- # Display treemap; if you want to update the size, call fig.update_layout(width=800, height=600)
60
  treemap_chart = fig
61
 
 
62
  wikipedia_filter = pd.DataFrame(
63
  {
64
  "Dataset": [
@@ -438,291 +431,23 @@ phil_filter = pd.DataFrame(
438
 
439
  table_html_phil = phil_filter.to_html(index=False, border=0)
440
  table_div_phil = Div(NotStr(table_html_phil), style="margin: 40px;")
 
441
 
442
- data_sources = [
443
- "Freelaw",
444
- "Wikipedia",
445
- "PhilPapers",
446
- "Arxiv",
447
- "S2ORC",
448
- "S2ORC Abstract",
449
- "Pubmed",
450
- "USPTO",
451
- "Hackernews",
452
- "Ubuntu IRC",
453
- "StackExchange",
454
- "DM Maths",
455
- "PG19",
456
- "Europarl",
457
- ]
458
-
459
-
460
-
461
- def get_wiki_data(data_source: str = "Wikipedia", doc_id: int = 3, target: str = "foo"):
462
- doc_id = max(0, min(int(doc_id), 9))
463
-
464
- if data_source == "Wikipedia":
465
- raw_sample_doc = extracted_sample_doc = json.load(
466
- open("data/curated_samples/wiki.json")
467
- )
468
- else:
469
- raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
470
-
471
- raw_json = raw_sample_doc[doc_id]
472
- extracted_json = extracted_sample_doc[doc_id]
473
- return view_data(
474
- raw_json,
475
- extracted_json,
476
- doc_id=doc_id,
477
- data_source="Wikipedia",
478
- data_sources="Wikipedia",
479
- target=target,
480
- )
481
-
482
- wiki_examples = Div(
483
- Div(
484
- get_wiki_data(target=gen_random_id()),
485
- style="border: 1px solid #ccc; padding: 20px;",
486
- ),
487
- )
488
 
 
489
  wiki_examples = DV("data/curated_samples/wiki.json", 0, "Wikipedia")
490
-
491
-
492
- def get_freelaw_data(data_source: str = "Freelaw", doc_id: int = 3, target: str = "foo"):
493
- doc_id = max(0, min(int(doc_id), 9))
494
-
495
- if data_source == "Freelaw":
496
- raw_sample_doc = json.load(open("data/curated_samples/freelaw_raw.json"))
497
- extracted_sample_doc = json.load(
498
- open("data/curated_samples/freelaw_extract.json")
499
- )
500
- else:
501
- raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
502
-
503
- raw_json = raw_sample_doc[doc_id]
504
- extracted_json = extracted_sample_doc[doc_id]
505
- return view_data(
506
- raw_json,
507
- extracted_json,
508
- doc_id=doc_id,
509
- data_source="Freelaw",
510
- data_sources="Freelaw",
511
- target=target,
512
- )
513
-
514
  freelaw_examples = DV2("data/curated_samples/freelaw_raw.json", "data/curated_samples/freelaw_extract.json", 2)
515
-
516
- def get_se_data(data_source: str = "StackExchange", doc_id: int = 3, target: str = "foo"):
517
- doc_id = max(0, min(int(doc_id), 9))
518
-
519
- if data_source == "StackExchange":
520
- raw_sample_doc = json.load(open("data/curated_samples/stackexchange_raw.json"))
521
- extracted_sample_doc = json.load(
522
- open("data/curated_samples/stackexchange_extract.json")
523
- )
524
- else:
525
- raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
526
-
527
- raw_json = raw_sample_doc[doc_id]
528
- extracted_json = extracted_sample_doc[doc_id]
529
- return view_data(
530
- raw_json,
531
- extracted_json,
532
- doc_id=doc_id,
533
- data_source="StackExchange",
534
- data_sources="StackExchange",
535
- target=target,
536
- )
537
-
538
  se_examples = DV2("data/curated_samples/stackexchange_raw.json", "data/curated_samples/stackexchange_extract.json", 3)
539
-
540
- def get_phil_data(data_source: str = "PhilPapers", doc_id: int = 3, target: str = "foo"):
541
- doc_id = max(0, min(int(doc_id), 9))
542
-
543
- if data_source == "PhilPapers":
544
- raw_sample_doc = extracted_sample_doc = json.load(
545
- open("data/curated_samples/philpapers_raw.json")
546
- )
547
- else:
548
- raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
549
-
550
- raw_json = raw_sample_doc[doc_id]
551
- extracted_json = extracted_sample_doc[doc_id]
552
- return view_data(
553
- raw_json,
554
- extracted_json,
555
- doc_id=doc_id,
556
- data_source="PhilPapers",
557
- data_sources="PhilPapers",
558
- target=target,
559
- )
560
-
561
  phil_examples = DV("data/curated_samples/philpapers_raw.json", 2, "PhilPapers")
562
-
563
- def get_arx_data(data_source: str = "Arxiv", doc_id: int = 3, target: str = "foo"):
564
- doc_id = max(0, min(int(doc_id), 9))
565
-
566
- if data_source == "Arxiv":
567
- raw_sample_doc = json.load(open("data/curated_samples/arxiv_raw.json"))
568
- extracted_sample_doc = json.load(
569
- open("data/curated_samples/arxiv_extract.json")
570
- )
571
- else:
572
- raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
573
-
574
- raw_json = raw_sample_doc[doc_id]
575
- extracted_json = extracted_sample_doc[doc_id]
576
- return view_data(
577
- raw_json,
578
- extracted_json,
579
- doc_id=doc_id,
580
- data_source="Arxiv",
581
- data_sources="Arxiv",
582
- target=target,
583
- )
584
-
585
  arx_examples = DV2("data/curated_samples/arxiv_raw.json", "data/curated_samples/arxiv_extract.json", 3)
586
-
587
- def get_S2ORC_data(data_source: str = "S2ORC", doc_id: int = 3, target: str = "foo"):
588
- doc_id = max(0, min(int(doc_id), 9))
589
-
590
- if data_source == "S2ORC":
591
- raw_sample_doc = extracted_sample_doc = json.load(
592
- open("data/curated_samples/s2orc_raw.json")
593
- )
594
- else:
595
- raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
596
-
597
- raw_json = raw_sample_doc[doc_id]
598
- extracted_json = extracted_sample_doc[doc_id]
599
- return view_data(
600
- raw_json,
601
- extracted_json,
602
- doc_id=doc_id,
603
- data_source="S2ORC",
604
- data_sources="S2ORC",
605
- target=target,
606
- )
607
-
608
  s2o_examples = DV("data/curated_samples/s2orc_raw.json", 0, "S2ORC")
609
-
610
- def get_S2ORCA_data(data_source: str = "S2ORC Abstract", doc_id: int = 3, target: str = "foo"):
611
- doc_id = max(0, min(int(doc_id), 9))
612
-
613
- if data_source == "S2ORC":
614
- raw_sample_doc = extracted_sample_doc = json.load(
615
- open("data/curated_samples/s2orc_abstract_raw.json")
616
- )
617
- else:
618
- raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
619
-
620
- raw_json = raw_sample_doc[doc_id]
621
- extracted_json = extracted_sample_doc[doc_id]
622
- return view_data(
623
- raw_json,
624
- extracted_json,
625
- doc_id=doc_id,
626
- data_source="S2ORC Abstract",
627
- data_sources="S2ORC Abstract",
628
- target=target,
629
- )
630
-
631
  s2oa_examples = DV("data/curated_samples/s2orc_abstract_raw.json", 0, "S2ORC Abstract")
632
-
633
- def get_pubmed_data(data_source: str = "Pubmed", doc_id: int = 3, target: str = "foo"):
634
- doc_id = max(0, min(int(doc_id), 9))
635
-
636
- if data_source == "Pubmed":
637
- raw_sample_doc = json.load(open("data/curated_samples/pubmed_raw.json"))
638
- extracted_sample_doc = json.load(
639
- open("data/curated_samples/pubmed_extract.json")
640
- )
641
- else:
642
- raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
643
-
644
- raw_json = raw_sample_doc[doc_id]
645
- extracted_json = extracted_sample_doc[doc_id]
646
- return view_data(
647
- raw_json,
648
- extracted_json,
649
- doc_id=doc_id,
650
- data_source="Pubmed",
651
- data_sources="Pubmed",
652
- target=target,
653
- )
654
-
655
  pubmed_examples = DV2("data/curated_samples/pubmed_raw.json", "data/curated_samples/pubmed_extract.json", 3)
656
-
657
- def get_dmm_data(data_source: str = "DM Math", doc_id: int = 3, target: str = "foo"):
658
- doc_id = max(0, min(int(doc_id), 9))
659
-
660
- if data_source == "DM Math":
661
- raw_sample_doc = json.load(open("data/curated_samples/dm_maths_raw.json"))
662
- extracted_sample_doc = json.load(
663
- open("data/curated_samples/dm_maths_extract.json")
664
- )
665
- else:
666
- raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
667
-
668
- raw_json = raw_sample_doc[doc_id]
669
- extracted_json = extracted_sample_doc[doc_id]
670
- return view_data(
671
- raw_json,
672
- extracted_json,
673
- doc_id=doc_id,
674
- data_source="DM Math",
675
- data_sources="DM Math",
676
- target=target,
677
- )
678
-
679
  dmm_examples = DV2("data/curated_samples/dm_maths_raw.json", "data/curated_samples/dm_maths_extract.json", 3)
680
-
681
- def get_pg19_data(data_source: str = "PG19", doc_id: int = 3, target: str = "foo"):
682
- doc_id = max(0, min(int(doc_id), 9))
683
-
684
- if data_source == "PG19":
685
- raw_sample_doc = extracted_sample_doc = json.load(
686
- open("data/curated_samples/pg19_raw.json")
687
- )
688
- else:
689
- raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
690
-
691
- raw_json = raw_sample_doc[doc_id]
692
- extracted_json = extracted_sample_doc[doc_id]
693
- return view_data(
694
- raw_json,
695
- extracted_json,
696
- doc_id=doc_id,
697
- data_source="PG19",
698
- data_sources="PG19",
699
- target=target,
700
- )
701
-
702
  pg19_examples = DV("data/curated_samples/pg19_raw.json", 0, "PG19")
703
-
704
- def get_eu_data(data_source: str = "Europarl", doc_id: int = 3, target: str = "foo"):
705
- doc_id = max(0, min(int(doc_id), 9))
706
-
707
- if data_source == "Europarl":
708
- raw_sample_doc = extracted_sample_doc = json.load(
709
- open("data/curated_samples/europarl_raw.json")
710
- )
711
- else:
712
- raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
713
-
714
- raw_json = raw_sample_doc[doc_id]
715
- extracted_json = extracted_sample_doc[doc_id]
716
- return view_data(
717
- raw_json,
718
- extracted_json,
719
- doc_id=doc_id,
720
- data_source="Europarl",
721
- data_sources="Europarl",
722
- target=target,
723
- )
724
-
725
  eu_examples = DV("data/curated_samples/europarl_raw.json", 0, "Europarl")
 
 
726
 
727
  filtering_process = Div(
728
  Section(
 
46
  'Deep Mind Maths dataset with generated questions.'
47
  ]
48
  }
 
49
  total_count = sum(treemap_data['Count'])
50
  treemap_data['Percentage'] = [count / total_count * 100 for count in treemap_data['Count']]
 
 
51
  fig = px.treemap(treemap_data, path=['Category', 'Source'], values='Count', hover_data=['Details', 'Percentage'], hover_name='Source')
 
 
 
 
 
52
  treemap_chart = fig
53
 
54
+ #start individual tables showing filtering
55
  wikipedia_filter = pd.DataFrame(
56
  {
57
  "Dataset": [
 
431
 
432
  table_html_phil = phil_filter.to_html(index=False, border=0)
433
  table_div_phil = Div(NotStr(table_html_phil), style="margin: 40px;")
434
+ ## end individual tables showing filtering
435
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
436
 
437
+ ## start filtered examples
438
  wiki_examples = DV("data/curated_samples/wiki.json", 0, "Wikipedia")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
439
  freelaw_examples = DV2("data/curated_samples/freelaw_raw.json", "data/curated_samples/freelaw_extract.json", 2)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
440
  se_examples = DV2("data/curated_samples/stackexchange_raw.json", "data/curated_samples/stackexchange_extract.json", 3)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
441
  phil_examples = DV("data/curated_samples/philpapers_raw.json", 2, "PhilPapers")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
442
  arx_examples = DV2("data/curated_samples/arxiv_raw.json", "data/curated_samples/arxiv_extract.json", 3)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
443
  s2o_examples = DV("data/curated_samples/s2orc_raw.json", 0, "S2ORC")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
444
  s2oa_examples = DV("data/curated_samples/s2orc_abstract_raw.json", 0, "S2ORC Abstract")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
445
  pubmed_examples = DV2("data/curated_samples/pubmed_raw.json", "data/curated_samples/pubmed_extract.json", 3)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
446
  dmm_examples = DV2("data/curated_samples/dm_maths_raw.json", "data/curated_samples/dm_maths_extract.json", 3)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
447
  pg19_examples = DV("data/curated_samples/pg19_raw.json", 0, "PG19")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
448
  eu_examples = DV("data/curated_samples/europarl_raw.json", 0, "Europarl")
449
+ ## end filtered examples
450
+
451
 
452
  filtering_process = Div(
453
  Section(