Spaces:

xet-team
/

lfs-analysis

Running

App Files Files Community

Compressed -> Deduped column header

by erinys HF staff - opened Oct 10

base: refs/heads/main

←

from: refs/pr/4

Discussion Files changed

+57

-57

Files changed (1) hide show

app.py +57 -57

app.py CHANGED Viewed

@@ -54,21 +54,18 @@ def process_dataset():
         columns={
             "type": "Repository Type",
             "num_files": "Number of Files",
-            "total_size": "Total Size (PB)",
         }
     )
     file_counts_and_sizes = file_counts_and_sizes.drop(columns=["Number of Files"])
     # sort the dataframe by total size in descending order
     file_counts_and_sizes = file_counts_and_sizes.sort_values(
-        by="Total Size (PB)", ascending=False
     )
     # drop nas from the extension column
     file_extensions = file_extensions.dropna(subset=["extension"])
-    file_extensions_by_month = file_extensions_by_month[
-        file_extensions_by_month["extension"] != ""
-    ]
     return (
         repo_by_size_df,
@@ -91,19 +88,19 @@ def cumulative_growth_df(_df):
     return cumulative_df
-def compare_last_10_months(_cumulative_df, _cumulative_df_deduped):
     last_10_months = _cumulative_df.tail(10).copy()
     last_10_months["total"] = last_10_months.sum(axis=1)
     last_10_months["total_change"] = last_10_months["total"].diff()
-    last_10_months["deduped_change"] = (
-        _cumulative_df_deduped.tail(10).sum(axis=1).diff()
     )
     last_10_months["savings"] = (
-        last_10_months["total_change"] - last_10_months["deduped_change"]
     )
     last_10_months = format_dataframe_size_column(
-        last_10_months, ["total_change", "deduped_change", "savings"]
     )
     last_10_months["date"] = _cumulative_df.tail(10).index
@@ -115,50 +112,50 @@ def compare_last_10_months(_cumulative_df, _cumulative_df_deduped):
     last_10_months = last_10_months.drop(last_10_months.index[0])
     # keep only the date, total_change, deduped_change, and savings columns, in that order
     last_10_months = last_10_months[
-        ["date", "total_change", "deduped_change", "savings"]
     ]
     # rename the columns
     last_10_months = last_10_months.rename(
         columns={
             "date": "Date",
-            "total_change": "Month-to-Month Growth (PB)",
-            "deduped_change": "Growth with File-Level Deduplication (PB)",
-            "savings": "Dedupe Savings (PB)",
         }
     )
     return last_10_months
-def tabular_analysis(repo_sizes, cumulative_df, cumulative_df_deduped):
-    # create new columns in the repository sizes dataframe for the deduped size and the dedupe savings, and set them to empty strings at first
-    repo_sizes["Deduped Size (PB)"] = ""
-    repo_sizes["Dedupe Savings (PB)"] = ""
     for column in cumulative_df.columns:
         cum_repo_size = cumulative_df[column].iloc[-1]
-        comp_repo_size = cumulative_df_deduped[column].iloc[-1]
         repo_size_diff = cum_repo_size - comp_repo_size
         repo_sizes.loc[
             repo_sizes["Repository Type"] == column.capitalize(),
-            "Deduped Size (PB)",
         ] = comp_repo_size
         repo_sizes.loc[
-            repo_sizes["Repository Type"] == column.capitalize(), "Dedupe Savings (PB)"
         ] = repo_size_diff
-    # add a row that sums the total size and deduped size
     repo_sizes.loc["Total"] = repo_sizes.sum()
     repo_sizes.loc["Total", "Repository Type"] = "Total"
     return repo_sizes
-def cumulative_growth_plot_analysis(cumulative_df, cumulative_df_deduped):
     """
     Calculates the cumulative growth of models, spaces, and datasets over time and generates a plot and dataframe from the analysis.
     Args:
         cumulative_df (DataFrame): The input dataframe containing the data.
-        cumulative_df_deduped (DataFrame): The input dataframe containing the deduped data.
     Returns:
         tuple: A tuple containing two elements:
@@ -192,11 +189,11 @@ def cumulative_growth_plot_analysis(cumulative_df, cumulative_df_deduped):
         )
     # Add a scatter trace for each type
-    for column in cumulative_df_deduped.columns:
         fig.add_trace(
             go.Scatter(
-                x=cumulative_df_deduped.index,
-                y=cumulative_df_deduped[column] / 1e15,  # Convert to petabytes
                 mode="lines",
                 name=column.capitalize() + " (File-Level Deduplication)",
                 line=dict(color=color_map.get(column, "black"), dash="dash"),
@@ -207,7 +204,7 @@ def cumulative_growth_plot_analysis(cumulative_df, cumulative_df_deduped):
     fig.update_layout(
         title="Cumulative Growth of Models, Spaces, and Datasets Over Time<br><sup>Dotted lines represent growth with file-level deduplication</sup>",
         xaxis_title="Date",
-        yaxis_title="Cumulative Size (PB)",
         legend_title="Type",
         yaxis=dict(tickformat=".2f"),  # Format y-axis labels to 2 decimal places
     )
@@ -254,7 +251,7 @@ def cumulative_growth_single(_df):
     fig.update_layout(
         title="Cumulative Growth of Models, Spaces, and Datasets",
         xaxis_title="Date",
-        yaxis_title="Size (PB)",
         legend_title="Type",
         yaxis=dict(tickformat=".2f"),  # Format y-axis labels to 2 decimal places
     )
@@ -280,9 +277,9 @@ def plot_total_sum(by_type_arr):
     # Update layout
     fig.update_layout(
-        title="Top 20 File Extensions by Total Size (in PB)",
         xaxis_title="File Extension",
-        yaxis_title="Total Size (PB)",
         yaxis=dict(tickformat=".2f"),  # Format y-axis labels to 2 decimal places
         colorway=px.colors.qualitative.Alphabet,  # Use Plotly color palette
     )
@@ -333,9 +330,9 @@ def filter_by_extension_month(_df, _extension):
     # Update layout
     fig.update_layout(
-        title="Monthly Additions of LFS Files by Extension (in TB)",
         xaxis_title="Date",
-        yaxis_title="Size (TB)",
         legend_title="Type",
         yaxis=dict(tickformat=".2f"),  # Format y-axis labels to 2 decimal places
     )
@@ -350,11 +347,11 @@ def area_plot_by_extension_month(_df):
     fig = px.area(_df, x="date", y="total_size", color="extension")
     # Update layout
     fig.update_layout(
-        title="File Extension Monthly Additions (in PB) Over Time",
         xaxis_title="Date",
-        yaxis_title="Size (PB)",
         legend_title="Type",
-        # format y-axis to be PB (currently bytes) with two decimal places
         yaxis=dict(tickformat=".2f"),
     )
@@ -393,16 +390,16 @@ with gr.Blocks(theme="citrus") as demo:
     # Convert year and month into a datetime column
     df = month_year_to_date(df)
-    df_deduped = month_year_to_date(file_df)
     # Calculate the cumulative growth of models, spaces, and datasets over time
     cumulative_df = cumulative_growth_df(df)
-    cumulative_df_deduped = cumulative_growth_df(df_deduped)
-    last_10_months = compare_last_10_months(cumulative_df, cumulative_df_deduped)
     by_repo_type_analysis = tabular_analysis(
-        by_repo_type, cumulative_df, cumulative_df_deduped
     )
     # Add top level heading and introduction text
@@ -431,15 +428,15 @@ with gr.Blocks(theme="citrus") as demo:
         with gr.Column(scale=2):
             gr.Markdown("### Current Storage Usage")
             gr.Markdown(
-                "As of September 20, 2024, total files stored in Git LFS summed to almost 29 PB. To put this into perspective, the last [Common Crawl](https://commoncrawl.org/) download was [451 TB](https://github.com/commoncrawl/cc-crawl-statistics/blob/master/stats/crawler/CC-MAIN-2024-38.json#L31) - the Hub stores the equivalent of more than **64 Common Crawls** 🤯."
             )
         with gr.Column(scale=3):
             # Convert the total size to petabytes and format to two decimal places
             current_storage = format_dataframe_size_column(
                 by_repo_type_analysis,
-                ["Total Size (PB)", "Deduped Size (PB)", "Dedupe Savings (PB)"],
             )
-            gr.Dataframe(current_storage[["Repository Type", "Total Size (PB)"]])
     gr.HTML(div_px(25))
     # File Extension analysis
@@ -448,23 +445,24 @@ with gr.Blocks(theme="citrus") as demo:
         "What types of files are stored on the Hub? The Xet team's backend architecture allows for storage optimizations by file type, so seeing the breakdown of the most popular stored file types helps to prioritize our roadmap. The following sections filter the analysis to the top 20 file extensions stored (by bytes) using Git LFS. Taken together, these 20 file extensions account for 82% of the total bytes stored in LFS."
     )
     gr.Markdown(
-        "[Safetensors](https://huggingface.co/docs/safetensors/en/index) is quickly becoming the defacto standard on the Hub for storing tensor files, accounting for over 7PB (25%) of LFS storage. [GGUF (GPT-Generated Unified Format)](https://huggingface.co/docs/hub/gguf), a format for storing tensor files with a different set of optimizations, is also on the rise, accounting for 3.2 PB (11%) of LFS storage."
     )
     # Get the 22 largest file extensions by size
     by_extension_size = by_extension.sort_values(by="size", ascending=False).head(22)
     # make a bar chart of the by_extension_size dataframe
     gr.Plot(plot_total_sum(by_extension_size[["extension", "size"]].values))
     # drop the unnamed: 0 column
     by_extension_size = by_extension_size.drop(columns=["Unnamed: 0"])
     # average size
-    by_extension_size["Average File Size (MB)"] = (
         by_extension_size["size"].astype(float) / by_extension_size["count"]
     )
-    by_extension_size["Average File Size (MB)"] = (
-        by_extension_size["Average File Size (MB)"] / 1e6
     )
-    by_extension_size["Average File Size (MB)"] = by_extension_size[
-        "Average File Size (MB)"
     ].map("{:.2f}".format)
     # format the size column
     by_extension_size = format_dataframe_size_column(by_extension_size, ["size"])
@@ -473,7 +471,7 @@ with gr.Blocks(theme="citrus") as demo:
         columns={
             "extension": "File Extension",
             "count": "Number of Files",
-            "size": "Total Size (PB)",
         }
     )
@@ -485,15 +483,15 @@ with gr.Blocks(theme="citrus") as demo:
         by_extension_size[
             [
                 "File Extension",
-                "Total Size (PB)",
                 "Number of Files",
-                "Average File Size (MB)",
             ]
         ]
     )
     gr.HTML(div_px(5))
-    gr.Markdown("### Storage Growth by File Extension (Monthly PB Added)")
     gr.Markdown(
         "The following area chart shows the number of bytes added to LFS storage each month, faceted by file extension."
     )
@@ -501,7 +499,7 @@ with gr.Blocks(theme="citrus") as demo:
     gr.HTML(div_px(5))
     gr.Markdown(
-        "To dig deeper, use the dropdown to filter by file extension and see the bytes added (in TB) each month for specific file types."
     )
     # get the unique values in the extension column and remove any empty strings
@@ -525,7 +523,9 @@ with gr.Blocks(theme="citrus") as demo:
     gr.Markdown(
         "The first improvement we can make to Hub storage is to add file-level deduplication. Since forking any Hub repository makes copies of the files, a scan of existing files unsurprisingly shows that some files match exactly. The following chart shows the storage growth chart from above with additional dashed lines showing the potential savings from deduplicating at the file level."
     )
-    dedupe_fig = cumulative_growth_plot_analysis(cumulative_df, cumulative_df_deduped)
     gr.Plot(dedupe_fig)
     gr.HTML(div_px(5))
@@ -534,7 +534,7 @@ with gr.Blocks(theme="citrus") as demo:
         with gr.Column(scale=1):
             gr.Markdown("### Current Storage Usage + File-level Deduplication")
             gr.Markdown(
-                "This simple change to the storage backend will save 3.24 PB (the equivalent of 7.2 Common Crawls)."
             )
         with gr.Column(scale=3):
             # Convert the total size to petabytes and format to two decimal places
@@ -545,7 +545,7 @@ with gr.Blocks(theme="citrus") as demo:
         with gr.Column(scale=1):
             gr.Markdown("### Month-to-Month Growth + File-level Deduplication")
             gr.Markdown(
-                "This table shows month-to-month growth in model, dataset, and space storage. In 2024, the Hub has averaged nearly **2.3 PB uploaded to Git LFS per month**. Deduplicating at the file level saves nearly 225 TB (half a Common Crawl) monthly."
             )
         with gr.Column(scale=3):
             gr.Dataframe(last_10_months)

         columns={
             "type": "Repository Type",
             "num_files": "Number of Files",
+            "total_size": "Total Size (PBs)",
         }
     )
     file_counts_and_sizes = file_counts_and_sizes.drop(columns=["Number of Files"])
     # sort the dataframe by total size in descending order
     file_counts_and_sizes = file_counts_and_sizes.sort_values(
+        by="Total Size (PBs)", ascending=False
     )
     # drop nas from the extension column
     file_extensions = file_extensions.dropna(subset=["extension"])
     return (
         repo_by_size_df,
     return cumulative_df
+def compare_last_10_months(_cumulative_df, _cumulative_df_compressed):
     last_10_months = _cumulative_df.tail(10).copy()
     last_10_months["total"] = last_10_months.sum(axis=1)
     last_10_months["total_change"] = last_10_months["total"].diff()
+    last_10_months["compressed_change"] = (
+        _cumulative_df_compressed.tail(10).sum(axis=1).diff()
     )
     last_10_months["savings"] = (
+        last_10_months["total_change"] - last_10_months["compressed_change"]
     )
     last_10_months = format_dataframe_size_column(
+        last_10_months, ["total_change", "compressed_change", "savings"]
     )
     last_10_months["date"] = _cumulative_df.tail(10).index
     last_10_months = last_10_months.drop(last_10_months.index[0])
     # keep only the date, total_change, compressed_change, and savings columns, in that order
     last_10_months = last_10_months[
+        ["date", "total_change", "compressed_change", "savings"]
     ]
     # rename the columns
     last_10_months = last_10_months.rename(
         columns={
             "date": "Date",
+            "total_change": "Month-to-Month Growth (PBs)",
+            "compressed_change": "Growth with File-Level Deduplication (PBs)",
+            "savings": "Dedupe Savings (PBs)",
         }
     )
     return last_10_months
+def tabular_analysis(repo_sizes, cumulative_df, cumulative_df_compressed):
+    # create new columns in the repository sizes dataframe for the compressed (deduped) size and the dedupe savings, and set them to empty strings at first
+    repo_sizes["Deduped Size (PBs)"] = ""
+    repo_sizes["Dedupe Savings (PBs)"] = ""
     for column in cumulative_df.columns:
         cum_repo_size = cumulative_df[column].iloc[-1]
+        comp_repo_size = cumulative_df_compressed[column].iloc[-1]
         repo_size_diff = cum_repo_size - comp_repo_size
         repo_sizes.loc[
             repo_sizes["Repository Type"] == column.capitalize(),
+            "Deduped Size (PBs)",
         ] = comp_repo_size
         repo_sizes.loc[
+            repo_sizes["Repository Type"] == column.capitalize(), "Dedupe Savings (PBs)"
         ] = repo_size_diff
+    # add a row that sums the total size and compressed size
     repo_sizes.loc["Total"] = repo_sizes.sum()
     repo_sizes.loc["Total", "Repository Type"] = "Total"
     return repo_sizes
+def cumulative_growth_plot_analysis(cumulative_df, cumulative_df_compressed):
     """
     Calculates the cumulative growth of models, spaces, and datasets over time and generates a plot and dataframe from the analysis.
     Args:
         cumulative_df (DataFrame): The input dataframe containing the data.
+        cumulative_df_compressed (DataFrame): The input dataframe containing the compressed data.
     Returns:
         tuple: A tuple containing two elements:
         )
     # Add a scatter trace for each type
+    for column in cumulative_df_compressed.columns:
         fig.add_trace(
             go.Scatter(
+                x=cumulative_df_compressed.index,
+                y=cumulative_df_compressed[column] / 1e15,  # Convert to petabytes
                 mode="lines",
                 name=column.capitalize() + " (File-Level Deduplication)",
                 line=dict(color=color_map.get(column, "black"), dash="dash"),
     fig.update_layout(
         title="Cumulative Growth of Models, Spaces, and Datasets Over Time<br><sup>Dotted lines represent growth with file-level deduplication</sup>",
         xaxis_title="Date",
+        yaxis_title="Cumulative Size (PBs)",
         legend_title="Type",
         yaxis=dict(tickformat=".2f"),  # Format y-axis labels to 2 decimal places
     )
     fig.update_layout(
         title="Cumulative Growth of Models, Spaces, and Datasets",
         xaxis_title="Date",
+        yaxis_title="Size (PBs)",
         legend_title="Type",
         yaxis=dict(tickformat=".2f"),  # Format y-axis labels to 2 decimal places
     )
     # Update layout
     fig.update_layout(
+        title="Top 20 File Extensions by Total Size (in PBs)",
         xaxis_title="File Extension",
+        yaxis_title="Total Size (PBs)",
         yaxis=dict(tickformat=".2f"),  # Format y-axis labels to 2 decimal places
         colorway=px.colors.qualitative.Alphabet,  # Use Plotly color palette
     )
     # Update layout
     fig.update_layout(
+        title="Monthly Additions of LFS Files by Extension (in TBs)",
         xaxis_title="Date",
+        yaxis_title="Size (TBs)",
         legend_title="Type",
         yaxis=dict(tickformat=".2f"),  # Format y-axis labels to 2 decimal places
     )
     fig = px.area(_df, x="date", y="total_size", color="extension")
     # Update layout
     fig.update_layout(
+        title="File Extension Monthly Additions (in PBs) Over Time",
         xaxis_title="Date",
+        yaxis_title="Size (PBs)",
         legend_title="Type",
+        # format y-axis to be PBs (currently bytes) with two decimal places
         yaxis=dict(tickformat=".2f"),
     )
     # Convert year and month into a datetime column
     df = month_year_to_date(df)
+    df_compressed = month_year_to_date(file_df)
     # Calculate the cumulative growth of models, spaces, and datasets over time
     cumulative_df = cumulative_growth_df(df)
+    cumulative_df_compressed = cumulative_growth_df(df_compressed)
+    last_10_months = compare_last_10_months(cumulative_df, cumulative_df_compressed)
     by_repo_type_analysis = tabular_analysis(
+        by_repo_type, cumulative_df, cumulative_df_compressed
     )
     # Add top level heading and introduction text
         with gr.Column(scale=2):
             gr.Markdown("### Current Storage Usage")
             gr.Markdown(
+                "As of September 20, 2024, total files stored in Git LFS summed to almost 29 PB. To put this into perspective, the last [Common Crawl](https://commoncrawl.org/) download was [451 TBs](https://github.com/commoncrawl/cc-crawl-statistics/blob/master/stats/crawler/CC-MAIN-2024-38.json#L31) - the Hub stores the equivalent of more than **64 Common Crawls** 🤯."
             )
         with gr.Column(scale=3):
             # Convert the total size to petabytes and format to two decimal places
             current_storage = format_dataframe_size_column(
                 by_repo_type_analysis,
+                ["Total Size (PBs)", "Deduped Size (PBs)", "Dedupe Savings (PBs)"],
             )
+            gr.Dataframe(current_storage[["Repository Type", "Total Size (PBs)"]])
     gr.HTML(div_px(25))
     # File Extension analysis
         "What types of files are stored on the Hub? The Xet team's backend architecture allows for storage optimizations by file type, so seeing the breakdown of the most popular stored file types helps to prioritize our roadmap. The following sections filter the analysis to the top 20 file extensions stored (by bytes) using Git LFS. Taken together, these 20 file extensions account for 82% of the total bytes stored in LFS."
     )
     gr.Markdown(
+        "[Safetensors](https://huggingface.co/docs/safetensors/en/index) is quickly becoming the defacto standard on the Hub for storing tensor files, accounting for over 7PBs (25%) of LFS storage. [GGUF (GPT-Generated Unified Format)](https://huggingface.co/docs/hub/gguf), a format for storing tensor files with a different set of optimizations, is also on the rise, accounting for 3.2 PBs (11%) of LFS storage."
     )
     # Get the 22 largest file extensions by size
     by_extension_size = by_extension.sort_values(by="size", ascending=False).head(22)
     # make a bar chart of the by_extension_size dataframe
     gr.Plot(plot_total_sum(by_extension_size[["extension", "size"]].values))
     # drop the unnamed: 0 column
     by_extension_size = by_extension_size.drop(columns=["Unnamed: 0"])
     # average size
+    by_extension_size["Average File Size (MBs)"] = (
         by_extension_size["size"].astype(float) / by_extension_size["count"]
     )
+    by_extension_size["Average File Size (MBs)"] = (
+        by_extension_size["Average File Size (MBs)"] / 1e6
     )
+    by_extension_size["Average File Size (MBs)"] = by_extension_size[
+        "Average File Size (MBs)"
     ].map("{:.2f}".format)
     # format the size column
     by_extension_size = format_dataframe_size_column(by_extension_size, ["size"])
         columns={
             "extension": "File Extension",
             "count": "Number of Files",
+            "size": "Total Size (PBs)",
         }
     )
         by_extension_size[
             [
                 "File Extension",
+                "Total Size (PBs)",
                 "Number of Files",
+                "Average File Size (MBs)",
             ]
         ]
     )
     gr.HTML(div_px(5))
+    gr.Markdown("### Storage Growth by File Extension (Monthly PBs Added)")
     gr.Markdown(
         "The following area chart shows the number of bytes added to LFS storage each month, faceted by file extension."
     )
     gr.HTML(div_px(5))
     gr.Markdown(
+        "To dig deeper, use the dropdown to filter by file extension and see the bytes added (in TBs) each month for specific file types."
     )
     # get the unique values in the extension column and remove any empty strings
     gr.Markdown(
         "The first improvement we can make to Hub storage is to add file-level deduplication. Since forking any Hub repository makes copies of the files, a scan of existing files unsurprisingly shows that some files match exactly. The following chart shows the storage growth chart from above with additional dashed lines showing the potential savings from deduplicating at the file level."
     )
+    dedupe_fig = cumulative_growth_plot_analysis(
+        cumulative_df, cumulative_df_compressed
+    )
     gr.Plot(dedupe_fig)
     gr.HTML(div_px(5))
         with gr.Column(scale=1):
             gr.Markdown("### Current Storage Usage + File-level Deduplication")
             gr.Markdown(
+                "This simple change to the storage backend will save 3.24 PBs (the equivalent of 7.2 Common Crawls)."
             )
         with gr.Column(scale=3):
             # Convert the total size to petabytes and format to two decimal places
         with gr.Column(scale=1):
             gr.Markdown("### Month-to-Month Growth + File-level Deduplication")
             gr.Markdown(
+                "This table shows month-to-month growth in model, dataset, and space storage. In 2024, the Hub has averaged nearly **2.3 PBs uploaded to Git LFS per month**. Deduplicating at the file level saves nearly 225 TB (half a Common Crawl) monthly."
             )
         with gr.Column(scale=3):
             gr.Dataframe(last_10_months)