victormiller committed on
Commit
e4314a9
1 Parent(s): 218198d

Update results.py

Browse files
Files changed (1) hide show
  1. results.py +6 -0
results.py CHANGED
@@ -624,6 +624,12 @@ intro_div = Div(
624
 
625
  )
626
 
 
 
 
 
 
 
627
  perp1_div = Div(
628
  Section(
629
  H3("Perplexity vs Buckets"),
 
624
 
625
  )
626
 
627
+ upsampling_exp = Div(
628
+ H2("Upsampling Experiment: TxT360 vs FineWeb"),
629
+ H3("Experiment Setup"),
630
+ P("We performed a comparison of 1.5T tokens from FineWeb and 1.5T tokens of TxT360 across 10 diverse evaluations. Our FineWeb evaluation is based on a random sample of 1.5T tokens from FineWeb (base). For TxT360, we also randomly sample 1.5T tokens by upsampling data instances with more duplicates. Concretely, the upsampling weight is set to 3 for data points with duplicates in the range from 2 to 5, 5 for the range from 5 to 100, 8 for the range from 101 to 1000, and 10 for more than 1000 duplicates."),
631
+ )
632
+
633
  perp1_div = Div(
634
  Section(
635
  H3("Perplexity vs Buckets"),