victormiller
committed on
Commit
•
e4314a9
1
Parent(s):
218198d
Update results.py
Browse files- results.py +6 -0
results.py
CHANGED
@@ -624,6 +624,12 @@ intro_div = Div(
|
|
624 |
|
625 |
)
|
626 |
|
|
|
|
|
|
|
|
|
|
|
|
|
627 |
perp1_div = Div(
|
628 |
Section(
|
629 |
H3("Perplexity vs Buckets"),
|
|
|
624 |
|
625 |
)
|
626 |
|
627 |
+
upsampling_exp = Div(
|
628 |
+
H2("Upsampling Experiment: TxT360 vs FineWeb"),
|
629 |
+
H3("Experiment Setup"),
|
630 |
+
P("We performed a comparison of 1.5T tokens from FineWeb and 1.5T tokens of TxT360 across 10 diverse evaluations. Our FineWeb evaluation is based on a random sample of 1.5T tokens from FineWeb (base). For TxT360, we also randomly sample 1.5T tokens by upsampling data instances with more duplicates. Concretely, the upsampling weight is set to 3 for data points with duplicates in the range from 2 to 5, 5 for the range from 5 to 100, 8 for that from 101 to 1000, and 10 for more than 1000 duplicates."),
|
631 |
+
)
|
632 |
+
|
633 |
perp1_div = Div(
|
634 |
Section(
|
635 |
H3("Perplexity vs Buckets"),
|