victormiller committed on
Commit
7ab95df
1 Parent(s): 4e6ee79

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +3 -3
main.py CHANGED
@@ -748,7 +748,7 @@ dataset_sources = pd.DataFrame(
748
  "1.96B",
749
  "8.37B",
750
  ],
751
- "Cut-Off Date": [
752
  "2024-30",
753
  "Q4 2023",
754
  "-",
@@ -776,7 +776,7 @@ styled_table = dataset_sources.style.apply(
776
 
777
  table_html_data = styled_table._repr_html_()
778
  # table_html_data = dataset_sources.to_html(index=False, border=0)
779
- table_div_data = Div(NotStr(table_html_data), style="margin: 40px;"; align="center")
780
 
781
 
782
  @app.get("/intro")
@@ -822,7 +822,7 @@ def intro():
822
  "Web datasets are inherently noisy and varied. The TxT360 pipeline implements sophisticated filtering and deduplication techniques to clean and remove redundancies while preserving data integrity."
823
  ),
824
  P(
825
- "Curated datasets are typically structured and consistently formatted. TxT360 filters these sources with selective steps to maintain their integrity while providing seamless integration into the larger dataset. Both data source types are globally deduplicated together resulting in 5.7T tokens of high-quality data. The table below shows the final source distribution of TxT360 tokens."
826
  ),
827
  table_div_data,
828
  P(
 
748
  "1.96B",
749
  "8.37B",
750
  ],
751
+ "Information Cut-Off Date": [
752
  "2024-30",
753
  "Q4 2023",
754
  "-",
 
776
 
777
  table_html_data = styled_table._repr_html_()
778
  # table_html_data = dataset_sources.to_html(index=False, border=0)
779
+ table_div_data = Div(NotStr(table_html_data), style="margin: 40px;")
780
 
781
 
782
  @app.get("/intro")
 
822
  "Web datasets are inherently noisy and varied. The TxT360 pipeline implements sophisticated filtering and deduplication techniques to clean and remove redundancies while preserving data integrity."
823
  ),
824
  P(
825
+ "Curated datasets are typically structured and consistently formatted. TxT360 filters these sources with selective steps to maintain their integrity while providing seamless integration into the larger dataset. Both data source types are globally deduplicated together resulting in 5.7T tokens of high-quality data. The table below shows the source distribution of TxT360 tokens."
826
  ),
827
  table_div_data,
828
  P(