jsulz (HF staff) committed
Commit f6db30c
1 Parent(s): f68c1d3

added new chart

Files changed (1)
  1. app.py +75 -6
app.py CHANGED
@@ -21,18 +21,25 @@ def process_dataset():
     """
 
     file_counts_and_sizes = pd.read_parquet(
-        "hf://datasets/xet-team/lfs-analysis-data/file_counts_and_sizes.parquet"
+        "hf://datasets/xet-team/lfs-analysis-data/transformed/file_counts_and_sizes.parquet"
     )
     repo_by_size_df = pd.read_parquet(
-        "hf://datasets/xet-team/lfs-analysis-data/repo_by_size.parquet"
+        "hf://datasets/xet-team/lfs-analysis-data/transformed/repo_by_size.parquet"
     )
     unique_files_df = pd.read_parquet(
-        "hf://datasets/xet-team/lfs-analysis-data/repo_by_size_file_dedupe.parquet"
+        "hf://datasets/xet-team/lfs-analysis-data/transformed/repo_by_size_file_dedupe.parquet"
     )
     file_extensions = pd.read_parquet(
-        "hf://datasets/xet-team/lfs-analysis-data/file_extensions.parquet"
+        "hf://datasets/xet-team/lfs-analysis-data/transformed/file_extensions.parquet"
     )
 
+    # read the file_extensions_by_month.parquet file
+    file_extensions_by_month = pd.read_parquet(
+        "hf://datasets/xet-team/lfs-analysis-data/transformed/file_extensions_by_month.parquet"
+    )
+    # drop any nas
+    file_extensions_by_month = file_extensions_by_month.dropna()
+
     # Convert the total size to petabytes and format to two decimal places
     file_counts_and_sizes = format_dataframe_size_column(
         file_counts_and_sizes, "total_size"
@@ -55,7 +62,13 @@ def process_dataset():
     # drop nas from the extension column
     file_extensions = file_extensions.dropna(subset=["extension"])
 
-    return repo_by_size_df, unique_files_df, file_counts_and_sizes, file_extensions
+    return (
+        repo_by_size_df,
+        unique_files_df,
+        file_counts_and_sizes,
+        file_extensions,
+        file_extensions_by_month,
+    )
 
 
 def format_dataframe_size_column(_df, column_name):
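A quick note on the `hf://` paths introduced above: pandas hands these URLs to fsspec, and the `hf` protocol is registered by `huggingface_hub`, so the transformed parquet files are streamed straight from the dataset repo instead of being downloaded by hand. Below is a minimal sketch of reading one of them outside the app, assuming `pandas`, `pyarrow` (or `fastparquet`), and `huggingface_hub` are installed and the dataset is publicly readable; the column names in the comment reflect how the plotting code later in this commit uses the frame, not a documented schema.

import pandas as pd

# Read the monthly extension stats directly from the Hub; the "hf://" scheme
# is provided by huggingface_hub's fsspec filesystem, so no manual download is needed.
file_extensions_by_month = pd.read_parquet(
    "hf://datasets/xet-team/lfs-analysis-data/transformed/file_extensions_by_month.parquet"
)

# The plotting code added in this commit expects at least these columns:
# "extension", "year", "month", and "total_size" (in bytes).
print(file_extensions_by_month.columns.tolist())
print(file_extensions_by_month.head())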
 
@@ -196,9 +209,52 @@ def plot_total_sum(by_type_arr):
     return fig
 
 
+def filter_by_extension_month(_df, _extension):
+    """
+    Filters the given DataFrame (_df) by the specified extension(s) and creates a line plot using Plotly.
+
+    Parameters:
+    _df (DataFrame): The input DataFrame containing the data.
+    _extension (list[str]): The extension(s) to filter the DataFrame by. If it contains "All" (or is empty), no filtering is applied.
+
+    Returns:
+    fig (Figure): The Plotly figure object representing the line plot.
+    """
+    # Filter the DataFrame by the specified extension or extensions
+    if (len(_extension) == 1 and "All" in _extension) or len(_extension) == 0:
+        pass
+    else:
+        _df = _df[_df["extension"].isin(_extension)].copy()
+
+    # Convert year and month into a datetime column and sort by date
+    _df["date"] = pd.to_datetime(_df[["year", "month"]].assign(day=1))
+    _df = _df.sort_values(by="date")
+
+    # Pivot the DataFrame to get the total size for each extension and make this plottable as a time series
+    pivot_df = _df.pivot_table(
+        index="date", columns="extension", values="total_size"
+    ).fillna(0)
+
+    # Plot!!
+    fig = go.Figure()
+    for i, column in enumerate(pivot_df.columns):
+        if column != "":
+            fig.add_trace(
+                go.Scatter(
+                    x=pivot_df.index,
+                    y=pivot_df[column] / 1e15,  # Convert to petabytes
+                    mode="lines",
+                    name=column.capitalize(),
+                    line=dict(color=px.colors.qualitative.Alphabet[i]),
+                )
+            )
+
+    return fig
+
+
 # Create a gradio blocks interface and launch a demo
 with gr.Blocks() as demo:
-    df, file_df, by_type, by_extension = process_dataset()
+    df, file_df, by_type, by_extension, by_extension_month = process_dataset()
 
     # Add a heading
     gr.Markdown("# Git LFS Analysis Across the Hub")
@@ -258,5 +314,18 @@ with gr.Blocks() as demo:
     )
     gr.Dataframe(by_extension_size)
 
+    gr.Markdown("## File Extension Growth Over Time")
+    gr.Markdown(
+        "Want to dig a little deeper? Select a file extension to see how many bytes of that type were uploaded to the Hub each month."
+    )
 
+    # build a dropdown using the unique values in the extension column
+    extension = gr.Dropdown(
+        choices=by_extension["extension"].unique().tolist(),
+        value="All",
+        allow_custom_value=True,
+        multiselect=True,
+    )
+    _by_extension_month = gr.State(by_extension_month)
+    gr.Plot(filter_by_extension_month, inputs=[_by_extension_month, extension])
     demo.launch()
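To make the new plotting helper easier to sanity-check, here is a small standalone sketch of calling `filter_by_extension_month` outside Gradio. The toy DataFrame is invented for illustration and only mirrors the columns the function actually touches (`extension`, `year`, `month`, `total_size`); it assumes the function from this commit has been copied into the session (importing `app` directly would also launch the demo).

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Hypothetical stand-in for file_extensions_by_month; the real frame is read
# from the parquet file in process_dataset(). Sizes are in bytes.
toy = pd.DataFrame(
    {
        "extension": ["safetensors", "safetensors", "gguf", "gguf"],
        "year": [2023, 2024, 2023, 2024],
        "month": [12, 1, 12, 1],
        "total_size": [4.0e15, 6.0e15, 1.0e15, 2.0e15],
    }
)

# An empty selection or ["All"] plots every extension...
fig_all = filter_by_extension_month(toy, ["All"])
fig_all.show()

# ...while an explicit list plots only the chosen extensions.
fig_gguf = filter_by_extension_month(toy, ["gguf"])
fig_gguf.show()

In the Space itself the same function is wired into the UI: `gr.State(by_extension_month)` passes the preloaded DataFrame as a non-visible input, and `gr.Plot(filter_by_extension_month, inputs=[_by_extension_month, extension])` uses it together with the dropdown value to populate the plot.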