victormiller committed on
Commit 5d321b6
1 Parent(s): 14abefa

Update results.py

Files changed (1)
  1. results.py +32 -19
results.py CHANGED
@@ -50,9 +50,9 @@ fig_val.add_trace(go.Scatter(x=steps, y=txt360, mode='lines', name='TxT360'))
 
 # Update layout
 fig_val.update_layout(
-    title='Perplexity Across Steps',
+    title='Validation Loss Over Steps: TxT360 vs FineWeb',
     xaxis_title='Steps',
-    yaxis_title='Perplexity',
+    yaxis_title='Loss',
     legend_title='Models'
 )
 
@@ -76,11 +76,10 @@ fig_loss.add_trace(go.Scatter(x=data['Step'], y=data['FineWeb'], mode='lines', n
 
 # Update layout
 fig_loss.update_layout(
-    title="Loss over Steps: TxT360 vs FineWeb",
+    title="LM Loss Over Steps: TxT360 vs FineWeb",
     xaxis_title="Steps",
     yaxis_title="Loss",
     legend_title="Models",
-    template="plotly_dark"
 )
 
 # Display the graph
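For context, here is a minimal, self-contained sketch of the validation-loss figure as it stands after this change. The step and loss values are placeholders; only the `fig_val` layout options come from the diff itself:

```python
import plotly.graph_objects as go

# Placeholder data standing in for the real training logs.
steps = [0, 1000, 2000, 3000]
txt360 = [4.2, 3.1, 2.7, 2.5]
fineweb = [4.3, 3.3, 2.9, 2.8]

fig_val = go.Figure()
fig_val.add_trace(go.Scatter(x=steps, y=txt360, mode='lines', name='TxT360'))
fig_val.add_trace(go.Scatter(x=steps, y=fineweb, mode='lines', name='FineWeb'))

# Layout as committed: the title and y-axis now say "Loss" rather than
# "Perplexity", matching what the figure actually plots.
fig_val.update_layout(
    title='Validation Loss Over Steps: TxT360 vs FineWeb',
    xaxis_title='Steps',
    yaxis_title='Loss',
    legend_title='Models'
)
fig_val.show()
```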
@@ -750,16 +749,19 @@ dataset_comparison = pd.DataFrame(
 )
 
 table_html = dataset_comparison.to_html(index=False, border=0)
-table_div_1 = Div(NotStr(table_html), style="margin: 40px;")
+table_div_1 = Div(Center(NotStr(table_html), style="margin: 40px;"))
 
 intro_div = Div(
-    H2("Perplexity Evaluation on Duplicate Data"),
-    H3("Model based Quality Estimation"),
-    P("We took one of the model-based data quality evaluation strategies adopted by [DataComp-LM](https://arxiv.org/abs/2406.11794), which used perplexity filtering as a candidate for quality filtering. DataComp-LM followed [CCNet’s](https://arxiv.org/abs/1911.00359) practice to use a 5-gram Kneser-Ney model as implemented in the [KenLM](https://github.com/kpu/kenlm) library for efficient perplexity calculation. Following this practice, we estimated data quality by taking a KenLM model (from [edugp/kenlm](https://huggingface.co/edugp/kenlm)) trained on English Wikipedia data to compute perplexity on data with different duplication patterns. Lower perplexity is regarded as a signal of higher quality."),
-    H3("Sampling Strategy"),
-    P("We started from a processed Common Crawl (CC) ablation dataset divided by the number of duplicates of each document. For each CC dump, we have different buckets each holding chunks of document with different duplicate count ranges (1-1, 2-5, 6-10, 11-100, 101-1000, 1001-30000000). We sampled the first 10k documents from each chunk with their meta data."),
+    H2("TxT360 Results"),
+    H3("What This Section Contains"),
+    P("This section provides brief upsampling ablation examples and a detailed perplexity analysis of Common Crawl snapshots. It is split into the following topic areas:"),
+    Ul(
+        Li("Upsampling Experiment: TxT360 vs FineWeb", style="margin-bottom: 5px"),
+        Li("Perplexity Evaluation on Duplicate Data", style="margin-bottom: 5px"),
+    ),
 )
 
+
 upsampling_exp = Div(
     H2("Upsampling Experiment: TxT360 vs FineWeb"),
     H3("Experiment Setup"),
@@ -772,11 +775,20 @@ upsampling_exp = Div(
     plotly2fasthtml(lm_loss_graph),
 )
 
+perplexity_intro_div = Div(
+    H2("Perplexity Evaluation on Duplicate Data"),
+    H3("Model-Based Quality Estimation"),
+    P("We took one of the model-based data quality evaluation strategies adopted by [DataComp-LM](https://arxiv.org/abs/2406.11794), which used perplexity filtering as a candidate for quality filtering. DataComp-LM followed [CCNet’s](https://arxiv.org/abs/1911.00359) practice of using a 5-gram Kneser-Ney model, as implemented in the [KenLM](https://github.com/kpu/kenlm) library, for efficient perplexity calculation. Following this practice, we estimated data quality with a KenLM model (from [edugp/kenlm](https://huggingface.co/edugp/kenlm)) trained on English Wikipedia to compute perplexity on data with different duplication patterns. Lower perplexity is regarded as a signal of higher quality."),
+    H3("Sampling Strategy"),
+    P("We started from a processed Common Crawl (CC) ablation dataset divided by the number of duplicates of each document. For each CC dump, we have buckets that each hold chunks of documents within a given duplicate-count range (1-1, 2-5, 6-10, 11-100, 101-1000, 1001-30000000). We sampled the first 10k documents from each chunk along with their metadata."),
+)
+
+
 perp1_div = Div(
     Section(
         H3("Perplexity vs Buckets"),
         P("For each bucket, we aggregated all the chunks that belong to a single year and calculated the average perplexity for each (bucket, year) data point."),
-        Img(src="images/prep-diff-buckets-global.png", height="300", width="600"),
+        # Img(src="images/prep-diff-buckets-global.png", height="300", width="600"),
         plotly2fasthtml(Perplexity_Across_Different_Buckets_global_graph),
     ),
     Section(
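As a concrete illustration of the quality-estimation step described in `perplexity_intro_div`, here is a minimal sketch of scoring one document with KenLM. The model filename is a placeholder for one of the English Wikipedia models from edugp/kenlm, and whitespace tokenization is an assumption rather than the exact preprocessing used here:

```python
import kenlm

# Placeholder path: a 5-gram English Wikipedia model from edugp/kenlm.
model = kenlm.Model("wikipedia_en.arpa.bin")

def perplexity(text: str) -> float:
    """Word-level perplexity derived from KenLM's log10 sentence probability."""
    # score() returns log10 P(text), including the end-of-sentence token,
    # so normalize by word count + 1.
    log10_prob = model.score(text, bos=True, eos=True)
    num_tokens = len(text.split()) + 1
    return 10.0 ** (-log10_prob / num_tokens)

# Lower perplexity is read as a signal of higher quality.
print(perplexity("the quick brown fox jumps over the lazy dog"))
```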
@@ -794,19 +806,19 @@ perp1_div = Div(
     Section(
         H3("Perplexity vs Dump Duplication"),
         P("We are also interested in how the number of dumps a document appears in affects data quality. From the graph below we can see that documents duplicated across roughly 40-60 dumps usually have lower perplexity."),
-        Img(src="images/prep-across-diff-dump-dup-counts-global.png", height="300", width="600"),
+        # Img(src="images/prep-across-diff-dump-dup-counts-global.png", height="300", width="600"),
         plotly2fasthtml(graph4),
     ),
     Section(
         H3("Perplexity vs Local Buckets"),
         P("Previously we saw that documents in recent dumps tend to have lower perplexity. This might be related to how global deduplication was implemented: during global deduplication, we keep only the copy in the latest dump, so documents duplicated across multiple dumps appear only in the latest one. To avoid the bias introduced by this strategy, we tried to recover the state before global deduplication by reading the metadata attached to each document."),
-        Img(src="images/prep-across-diff-buckets-local.png", height="300", width="600"),
+        # Img(src="images/prep-across-diff-buckets-local.png", height="300", width="600"),
         plotly2fasthtml(graph5),
     ),
     Section(
         H3("Perplexity vs Local Dump Duplication"),
         P("Following the same practice, we can plot the local version of the graph of average perplexity with respect to dump duplication."),
-        Img(src="images/prep-diff-dump-dump-counts-local.png", height="300", width="600"),
+        # Img(src="images/prep-diff-dump-dump-counts-local.png", height="300", width="600"),
         plotly2fasthtml(graph6),
     ),
 )
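The plots in `perp1_div` all reduce to the same aggregation: group the sampled documents by a duplication bucket (global, or local as reconstructed from metadata) and a second key such as year or dump count, then average their perplexities. A small pandas sketch of that computation, with hypothetical column names:

```python
import pandas as pd

# Hypothetical per-document records from the 10k-document samples.
docs = pd.DataFrame({
    "bucket": ["1-1", "1-1", "2-5", "2-5"],
    "year": [2020, 2021, 2020, 2021],
    "perplexity": [182.0, 175.5, 160.2, 151.8],
})

# One averaged perplexity value per (bucket, year) data point, as plotted above.
avg_ppl = docs.groupby(["bucket", "year"])["perplexity"].mean().reset_index()
print(avg_ppl)
```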
@@ -819,27 +831,27 @@ llama_div = Div(
     ),
     Section(
         H3("Perplexity vs Buckets"),
-        Img(src="images/perp-across-diff-buckets-global.png", height="300", width="600"),
+        # Img(src="images/perp-across-diff-buckets-global.png", height="300", width="600"),
         plotly2fasthtml(llama_graph1),
     ),
     Section(
         H3("Perplexity vs Years"),
-        Img(src="images/prep-across-diff-years-global.png", height="300", width="600"),
+        # Img(src="images/prep-across-diff-years-global.png", height="300", width="600"),
         plotly2fasthtml(llama_graph2),
     ),
     Section(
         H3("Perplexity vs Dump Duplication"),
-        Img(src="images/prep-vs-dump-dup-global.png", height="300", width="600"),
+        # Img(src="images/prep-vs-dump-dup-global.png", height="300", width="600"),
         plotly2fasthtml(llama_graph4),
     ),
     Section(
         H3("Perplexity vs Local Buckets"),
-        Img(src="images/prep-diff-buckets-local.png", height="300", width="600"),
+        # Img(src="images/prep-diff-buckets-local.png", height="300", width="600"),
         plotly2fasthtml(llama_graph5),
     ),
     Section(
         H3("Perplexity vs Local Dump Duplication"),
-        Img(src="images/prep-vs-dump-dup-global.png", height="300", width="600"),
+        # Img(src="images/prep-vs-dump-dup-global.png", height="300", width="600"),
         plotly2fasthtml(llama_graph6),
     ),
 )
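Each `Section` above pairs a heading with an interactive chart embedded via `plotly2fasthtml`; the static `Img` fallbacks are now commented out. A sketch of one such section in isolation, assuming `plotly2fasthtml` comes from the `fh_plotly` package (the import is not visible in this hunk):

```python
import plotly.graph_objects as go
from fasthtml.common import Section, H3
from fh_plotly import plotly2fasthtml  # assumed source of the helper used in results.py

# Hypothetical stand-in for one of the llama_graph* figures.
llama_graph1 = go.Figure(go.Scatter(x=[1, 2, 3], y=[172.0, 161.5, 155.0], mode="lines"))

# One heading plus one interactive chart, mirroring the pattern above.
section = Section(
    H3("Perplexity vs Buckets"),
    plotly2fasthtml(llama_graph1),
)
```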
@@ -850,6 +862,7 @@ def results():
     Section(
         intro_div,
         upsampling_exp,
+        perplexity_intro_div,
         perp1_div,
         llama_div,
         P("test plotly"),