victormiller committed on
Commit
a89d144
1 Parent(s): 591cd18

Update overview.py

Browse files
Files changed (1) hide show
  1. overview.py +25 -4
overview.py CHANGED
@@ -11,7 +11,7 @@ import web
11
  import common
12
  import results
13
 
14
- dataset_comparison = pd.DataFrame(
15
  {
16
  "Dataset": [
17
  "TxT360",
@@ -83,6 +83,26 @@ dataset_comparison = pd.DataFrame(
83
  "-",
84
  "Included",
85
  ],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  "PG-19": [
87
  "Included",
88
  "-",
@@ -146,8 +166,8 @@ dataset_comparison = pd.DataFrame(
146
  }
147
  )
148
 
149
- table_html = dataset_comparison.to_html(index=False, border=0)
150
- table_div = Div(NotStr(table_html), style="margin: 40px;")
151
 
152
  dataset_sources = pd.DataFrame(
153
  {
@@ -259,7 +279,8 @@ both critical for effective LLM pre-training."""),
259
  P("By integrating the extensive reach of web data with the exceptional quality of curated sources, TxT360 is crafted to meet and surpass the rigorous standards required for state-of-the-art LLM pre-training."),
260
  H3("TxT360 combines both the web data and highly-curated sources, which none of the existing datasets have covered."),
261
  P("Table 1: The following table shows TxT360 and other well-known datasets on the coverage and size of data sources."),
262
- table_div,
 
263
  P("Table 2: Basic TxT360 Statistics."),
264
  table_div1,
265
  ),
 
11
  import common
12
  import results
13
 
14
+ dataset_comparison1 = pd.DataFrame(
15
  {
16
  "Dataset": [
17
  "TxT360",
 
83
  "-",
84
  "Included",
85
  ],
86
+
87
+ }
88
+ )
89
+
90
+ table_html = dataset_comparison1.to_html(index=False, border=0)
91
+ table_div_1 = Div(NotStr(table_html), style="margin: 40px;")
92
+
93
+ dataset_comparison2 = pd.DataFrame(
94
+ {
95
+ "Dataset": [
96
+ "TxT360",
97
+ "FineWeb",
98
+ "RefinedWeb",
99
+ "RedPajama-v2",
100
+ "C4",
101
+ "Dolma",
102
+ "RedPajama-v1",
103
+ "The Pile",
104
+ ],
105
+
106
  "PG-19": [
107
  "Included",
108
  "-",
 
166
  }
167
  )
168
 
169
+ table_html2 = dataset_comparison2.to_html(index=False, border=0)
170
+ table_div2 = Div(NotStr(table_html2), style="margin: 40px;")
171
 
172
  dataset_sources = pd.DataFrame(
173
  {
 
279
  P("By integrating the extensive reach of web data with the exceptional quality of curated sources, TxT360 is crafted to meet and surpass the rigorous standards required for state-of-the-art LLM pre-training."),
280
  H3("TxT360 combines both the web data and highly-curated sources, which none of the existing datasets have covered."),
281
  P("Table 1: The following table shows TxT360 and other well-known datasets on the coverage and size of data sources."),
282
+ table_div1,
283
+ table_div2,
284
  P("Table 2: Basic TxT360 Statistics."),
285
  table_div1,
286
  ),