jsulz (HF staff) committed
Commit b385dd5 (parent: 51ba302)

removing trailing s from units

Files changed (1):
  app.py (+24, -24)
app.py CHANGED
@@ -54,14 +54,14 @@ def process_dataset():
         columns={
             "type": "Repository Type",
             "num_files": "Number of Files",
-            "total_size": "Total Size (PBs)",
+            "total_size": "Total Size (PB)",
         }
     )
     file_counts_and_sizes = file_counts_and_sizes.drop(columns=["Number of Files"])
 
     # sort the dataframe by total size in descending order
     file_counts_and_sizes = file_counts_and_sizes.sort_values(
-        by="Total Size (PBs)", ascending=False
+        by="Total Size (PB)", ascending=False
     )
 
     # drop nas from the extension column
@@ -121,9 +121,9 @@ def compare_last_10_months(_cumulative_df, _cumulative_df_deduped):
     last_10_months = last_10_months.rename(
         columns={
             "date": "Date",
-            "total_change": "Month-to-Month Growth (PBs)",
-            "deduped_change": "Growth with File-Level Deduplication (PBs)",
-            "savings": "Dedupe Savings (PBs)",
+            "total_change": "Month-to-Month Growth (PB)",
+            "deduped_change": "Growth with File-Level Deduplication (PB)",
+            "savings": "Dedupe Savings (PB)",
         }
     )
     return last_10_months
@@ -131,8 +131,8 @@ def compare_last_10_months(_cumulative_df, _cumulative_df_deduped):
 
 def tabular_analysis(repo_sizes, cumulative_df, cumulative_df_deduped):
     # create a new column in the repository sizes dataframe for "deduped size" and set it to empty at first
-    repo_sizes["Deduped Size (PBs)"] = ""
-    repo_sizes["Dedupe Savings (PBs)"] = ""
+    repo_sizes["Deduped Size (PB)"] = ""
+    repo_sizes["Dedupe Savings (PB)"] = ""
 
     for column in cumulative_df.columns:
         cum_repo_size = cumulative_df[column].iloc[-1]
@@ -140,10 +140,10 @@ def tabular_analysis(repo_sizes, cumulative_df, cumulative_df_deduped):
         repo_size_diff = cum_repo_size - comp_repo_size
         repo_sizes.loc[
             repo_sizes["Repository Type"] == column.capitalize(),
-            "Deduped Size (PBs)",
+            "Deduped Size (PB)",
         ] = comp_repo_size
         repo_sizes.loc[
-            repo_sizes["Repository Type"] == column.capitalize(), "Dedupe Savings (PBs)"
+            repo_sizes["Repository Type"] == column.capitalize(), "Dedupe Savings (PB)"
         ] = repo_size_diff
 
     # add a row that sums the total size and deduped size
@@ -207,7 +207,7 @@ def cumulative_growth_plot_analysis(cumulative_df, cumulative_df_deduped):
     fig.update_layout(
         title="Cumulative Growth of Models, Spaces, and Datasets Over Time<br><sup>Dotted lines represent growth with file-level deduplication</sup>",
         xaxis_title="Date",
-        yaxis_title="Cumulative Size (PBs)",
+        yaxis_title="Cumulative Size (PB)",
         legend_title="Type",
         yaxis=dict(tickformat=".2f"),  # Format y-axis labels to 2 decimal places
     )
@@ -254,7 +254,7 @@ def cumulative_growth_single(_df):
     fig.update_layout(
         title="Cumulative Growth of Models, Spaces, and Datasets",
         xaxis_title="Date",
-        yaxis_title="Size (PBs)",
+        yaxis_title="Size (PB)",
         legend_title="Type",
         yaxis=dict(tickformat=".2f"),  # Format y-axis labels to 2 decimal places
     )
@@ -280,9 +280,9 @@ def plot_total_sum(by_type_arr):
 
     # Update layout
     fig.update_layout(
-        title="Top 20 File Extensions by Total Size (in PBs)",
+        title="Top 20 File Extensions by Total Size (in PB)",
         xaxis_title="File Extension",
-        yaxis_title="Total Size (PBs)",
+        yaxis_title="Total Size (PB)",
         yaxis=dict(tickformat=".2f"),  # Format y-axis labels to 2 decimal places
         colorway=px.colors.qualitative.Alphabet,  # Use Plotly color palette
     )
@@ -350,11 +350,11 @@ def area_plot_by_extension_month(_df):
     fig = px.area(_df, x="date", y="total_size", color="extension")
     # Update layout
     fig.update_layout(
-        title="File Extension Monthly Additions (in PBs) Over Time",
+        title="File Extension Monthly Additions (in PB) Over Time",
         xaxis_title="Date",
-        yaxis_title="Size (PBs)",
+        yaxis_title="Size (PB)",
         legend_title="Type",
-        # format y-axis to be PBs (currently bytes) with two decimal places
+        # format y-axis to be PB (currently bytes) with two decimal places
         yaxis=dict(tickformat=".2f"),
     )
 
@@ -437,9 +437,9 @@ with gr.Blocks(theme="citrus") as demo:
     # Convert the total size to petabytes and format to two decimal places
     current_storage = format_dataframe_size_column(
         by_repo_type_analysis,
-        ["Total Size (PBs)", "Deduped Size (PBs)", "Dedupe Savings (PBs)"],
+        ["Total Size (PB)", "Deduped Size (PB)", "Dedupe Savings (PB)"],
     )
-    gr.Dataframe(current_storage[["Repository Type", "Total Size (PBs)"]])
+    gr.Dataframe(current_storage[["Repository Type", "Total Size (PB)"]])
 
     gr.HTML(div_px(25))
     # File Extension analysis
@@ -448,7 +448,7 @@ with gr.Blocks(theme="citrus") as demo:
         "What types of files are stored on the Hub? The Xet team's backend architecture allows for storage optimizations by file type, so seeing the breakdown of the most popular stored file types helps to prioritize our roadmap. The following sections filter the analysis to the top 20 file extensions stored (by bytes) using Git LFS. Taken together, these 20 file extensions account for 82% of the total bytes stored in LFS."
     )
     gr.Markdown(
-        "[Safetensors](https://huggingface.co/docs/safetensors/en/index) is quickly becoming the de facto standard on the Hub for storing tensor files, accounting for over 7 PBs (25%) of LFS storage. [GGUF (GPT-Generated Unified Format)](https://huggingface.co/docs/hub/gguf), a format for storing tensor files with a different set of optimizations, is also on the rise, accounting for 3.2 PBs (11%) of LFS storage."
+        "[Safetensors](https://huggingface.co/docs/safetensors/en/index) is quickly becoming the de facto standard on the Hub for storing tensor files, accounting for over 7 PB (25%) of LFS storage. [GGUF (GPT-Generated Unified Format)](https://huggingface.co/docs/hub/gguf), a format for storing tensor files with a different set of optimizations, is also on the rise, accounting for 3.2 PB (11%) of LFS storage."
     )
     # Get the top 20 file extensions by size
     by_extension_size = by_extension.sort_values(by="size", ascending=False).head(22)
@@ -473,7 +473,7 @@ with gr.Blocks(theme="citrus") as demo:
         columns={
             "extension": "File Extension",
             "count": "Number of Files",
-            "size": "Total Size (PBs)",
+            "size": "Total Size (PB)",
         }
     )
 
@@ -485,7 +485,7 @@ with gr.Blocks(theme="citrus") as demo:
     by_extension_size[
         [
             "File Extension",
-            "Total Size (PBs)",
+            "Total Size (PB)",
             "Number of Files",
             "Average File Size (MBs)",
         ]
@@ -493,7 +493,7 @@ with gr.Blocks(theme="citrus") as demo:
    )
 
     gr.HTML(div_px(5))
-    gr.Markdown("### Storage Growth by File Extension (Monthly PBs Added)")
+    gr.Markdown("### Storage Growth by File Extension (Monthly PB Added)")
     gr.Markdown(
         "The following area chart shows the number of bytes added to LFS storage each month, faceted by file extension."
     )
@@ -534,7 +534,7 @@ with gr.Blocks(theme="citrus") as demo:
         with gr.Column(scale=1):
            gr.Markdown("### Current Storage Usage + File-level Deduplication")
            gr.Markdown(
-                "This simple change to the storage backend will save 3.24 PBs (the equivalent of 7.2 Common Crawls)."
+                "This simple change to the storage backend will save 3.24 PB (the equivalent of 7.2 Common Crawls)."
            )
        with gr.Column(scale=3):
            # Convert the total size to petabytes and format to two decimal places
@@ -545,7 +545,7 @@ with gr.Blocks(theme="citrus") as demo:
        with gr.Column(scale=1):
            gr.Markdown("### Month-to-Month Growth + File-level Deduplication")
            gr.Markdown(
-                "This table shows month-to-month growth in model, dataset, and space storage. In 2024, the Hub has averaged nearly **2.3 PBs uploaded to Git LFS per month**. Deduplicating at the file level saves nearly 225 TB (half a Common Crawl) monthly."
+                "This table shows month-to-month growth in model, dataset, and space storage. In 2024, the Hub has averaged nearly **2.3 PB uploaded to Git LFS per month**. Deduplicating at the file level saves nearly 225 TB (half a Common Crawl) monthly."
            )
        with gr.Column(scale=3):
            gr.Dataframe(last_10_months)
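
Why does a one-character unit change touch 24 lines? Because the display label is a bare string literal repeated across app.py: it appears in rename maps, in sort_values(by=...), in .loc[...] writes, and in column selections, and every copy must stay in sync. A minimal sketch of the usual remedy, defining each label once as a module-level constant (the constant names and the summarize helper below are illustrative, not from the repo):

import pandas as pd

# Hypothetical refactor: define each display label once, so a future
# unit change ("PBs" -> "PB") is a one-line edit instead of 24.
TOTAL_SIZE = "Total Size (PB)"
DEDUPED_SIZE = "Deduped Size (PB)"      # would replace the other literals
DEDUPE_SAVINGS = "Dedupe Savings (PB)"  # used in tabular_analysis


def summarize(file_counts_and_sizes: pd.DataFrame) -> pd.DataFrame:
    # Same rename-then-sort pattern as process_dataset() above, but keyed
    # off the constant, so the rename map and the sort key cannot drift.
    out = file_counts_and_sizes.rename(
        columns={"type": "Repository Type", "total_size": TOTAL_SIZE}
    )
    return out.sort_values(by=TOTAL_SIZE, ascending=False)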
 
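One helper referenced above, format_dataframe_size_column, is not part of this diff; per the inline comment at its call sites, it converts raw byte totals to petabytes formatted to two decimal places. A plausible sketch of such a helper, assuming the decimal convention (1 PB = 10^15 bytes) and that the named columns hold byte counts — a guess at the behavior, not the repo's actual implementation:

import pandas as pd


def format_dataframe_size_column(df: pd.DataFrame, columns: list[str]) -> pd.DataFrame:
    # Assumed behavior, inferred from the call sites: the named columns
    # hold raw byte counts; convert to petabytes (1 PB = 1e15 bytes,
    # decimal convention assumed) and format to two decimal places.
    df = df.copy()
    for col in columns:
        df[col] = (pd.to_numeric(df[col], errors="coerce") / 1e15).map(
            lambda pb: f"{pb:.2f}"
        )
    return df

Note that tabular_analysis initializes the deduped columns to empty strings before filling them; errors="coerce" turns any leftover blanks into NaN rather than raising.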