jsulz HF staff committed on
Commit
9d6f412
1 Parent(s): 34ae673
Files changed (1) hide show
  1. app.py +42 -34
app.py CHANGED
@@ -17,8 +17,10 @@ def apply_power_scaling(sizes: list, exponent=0.2) -> list:
17
  return [size**exponent if size is not None else 0 for size in sizes]
18
 
19
 
20
- def count_chunks(sizes: list) -> list:
21
  """Count the number of chunks, which are 64KB each in size; always roundup"""
 
 
22
  return [int(np.ceil(size / 64_000)) if size is not None else 0 for size in sizes]
23
 
24
 
@@ -99,7 +101,7 @@ def flatten_hierarchy(hierarchy, root_name="Repository"):
99
  return labels, parents, sizes, ids
100
 
101
 
102
- def visualize_repo_treemap(r_info):
103
  """Visualizes the repository as a treemap with directory sizes and human-readable tooltips."""
104
  siblings = r_info.siblings
105
  hierarchy = build_hierarchy(siblings)
@@ -108,7 +110,7 @@ def visualize_repo_treemap(r_info):
108
  calculate_directory_sizes(hierarchy)
109
 
110
  # Flatten the hierarchy for Plotly
111
- labels, parents, sizes, ids = flatten_hierarchy(hierarchy)
112
 
113
  # Scale for viz
114
  scaled_sizes = apply_power_scaling(sizes)
@@ -138,7 +140,7 @@ def visualize_repo_treemap(r_info):
138
  values=scaled_sizes,
139
  color=normalized_colors,
140
  color_continuous_scale=colorscale,
141
- title="Repo by Chunks",
142
  custom_data=[formatted_sizes, chunks],
143
  height=1000,
144
  ids=ids,
@@ -149,7 +151,7 @@ def visualize_repo_treemap(r_info):
149
  # Add subtitle by updating the layout
150
  fig.update_layout(
151
  title={
152
- "text": "Repo File Size Treemap<br><span style='font-size:14px;'>Hover over each directory or file to see the size of the file and its number of chunks</span>",
153
  "x": 0.5,
154
  "xanchor": "center",
155
  },
@@ -189,16 +191,18 @@ def format_repo_size(r_size: int) -> str:
189
 
190
  def repo_files(r_type: str, r_id: str) -> dict:
191
  r_info = HF_API.repo_info(repo_id=r_id, repo_type=r_type, files_metadata=True)
192
- fig = visualize_repo_treemap(r_info)
193
  files = {}
194
  for sibling in r_info.siblings:
195
  ext = sibling.rfilename.split(".")[-1]
196
  if ext in files:
197
  files[ext]["size"] += sibling.size
 
198
  files[ext]["count"] += 1
199
  else:
200
  files[ext] = {}
201
  files[ext]["size"] = sibling.size
 
202
  files[ext]["count"] = 1
203
  return files, fig
204
 
@@ -226,7 +230,11 @@ def repo_size(r_type, r_id):
226
  return {}
227
  size = response.get("size")
228
  if size is not None:
229
- repo_sizes[branch.name] = size
 
 
 
 
230
  return repo_sizes
231
 
232
 
@@ -246,40 +254,32 @@ def get_repo_info(r_type, r_id):
246
  gr.Dataframe(visible=False),
247
  )
248
 
249
- rf_sizes_df = (
250
- pd.DataFrame(repo_files_info)
251
- .T.reset_index(names="ext")
252
- .sort_values(by="size", ascending=False)
253
- )
254
  # check if repo_sizes is just {}
255
  if not repo_sizes:
256
  r_sizes_component = gr.Dataframe(visible=False)
257
  b_block = gr.Row(visible=False)
258
  else:
259
- r_sizes_df = pd.DataFrame(repo_sizes, index=["size"]).T.reset_index(
260
- names="branch"
 
261
  )
262
- r_sizes_df["formatted_size"] = r_sizes_df["size"].apply(format_repo_size)
263
- r_sizes_df.columns = ["Branch", "bytes", "Size"]
264
  r_sizes_component = gr.Dataframe(
265
- value=r_sizes_df[["Branch", "Size"]], visible=True
266
  )
267
  b_block = gr.Row(visible=True)
268
 
269
- rf_sizes_df["formatted_size"] = rf_sizes_df["size"].apply(format_repo_size)
270
- rf_sizes_df.columns = ["Extension", "bytes", "Count", "Size"]
271
- rf_sizes_plot = px.pie(
272
- rf_sizes_df,
273
- values="bytes",
274
- names="Extension",
275
- hover_data=["Size"],
276
- title=f"File Distribution in {r_id}",
277
- hole=0.3,
278
  )
 
 
279
  return (
280
  gr.Row(visible=True),
281
  gr.Dataframe(
282
- value=rf_sizes_df[["Extension", "Count", "Size"]],
283
  visible=True,
284
  ),
285
  # gr.Plot(rf_sizes_plot, visible=True),
@@ -290,9 +290,9 @@ def get_repo_info(r_type, r_id):
290
 
291
 
292
  with gr.Blocks(theme="ocean") as demo:
293
- gr.Markdown("# Repository Information")
294
  gr.Markdown(
295
- "Search for a model or dataset repository using the autocomplete below, select the repository type, and get back information about the repository's files and branches."
296
  )
297
  with gr.Blocks():
298
  # repo_id = gr.Textbox(label="Repository ID", placeholder="123456")
@@ -310,15 +310,23 @@ with gr.Blocks(theme="ocean") as demo:
310
  with gr.Blocks():
311
  with gr.Row(visible=False) as results_block:
312
  with gr.Column():
313
- gr.Markdown("## File Information")
314
  file_info_plot = gr.Plot(visible=False)
315
- with gr.Row():
316
- file_info = gr.Dataframe(visible=False)
317
- # file_info_plot = gr.Plot(visible=False)
318
  with gr.Row(visible=False) as branch_block:
319
  with gr.Column():
320
- gr.Markdown("## Branch Sizes")
 
 
 
321
  branch_sizes = gr.Dataframe(visible=False)
 
 
 
 
 
 
 
 
322
 
323
  search_button.click(
324
  get_repo_info,
 
17
  return [size**exponent if size is not None else 0 for size in sizes]
18
 
19
 
20
+ def count_chunks(sizes: list | int) -> list:
21
  """Count the number of chunks, which are 64KB each in size; always roundup"""
22
+ if isinstance(sizes, int):
23
+ return int(np.ceil(sizes / 64_000))
24
  return [int(np.ceil(size / 64_000)) if size is not None else 0 for size in sizes]
25
 
26
 
 
101
  return labels, parents, sizes, ids
102
 
103
 
104
+ def visualize_repo_treemap(r_info: dict, r_id: str) -> px.treemap:
105
  """Visualizes the repository as a treemap with directory sizes and human-readable tooltips."""
106
  siblings = r_info.siblings
107
  hierarchy = build_hierarchy(siblings)
 
110
  calculate_directory_sizes(hierarchy)
111
 
112
  # Flatten the hierarchy for Plotly
113
+ labels, parents, sizes, ids = flatten_hierarchy(hierarchy, r_id)
114
 
115
  # Scale for viz
116
  scaled_sizes = apply_power_scaling(sizes)
 
140
  values=scaled_sizes,
141
  color=normalized_colors,
142
  color_continuous_scale=colorscale,
143
+ title=f"{r_id} by Chunks",
144
  custom_data=[formatted_sizes, chunks],
145
  height=1000,
146
  ids=ids,
 
151
  # Add subtitle by updating the layout
152
  fig.update_layout(
153
  title={
154
+ "text": f"{r_id} file and chunk treemap<br><span style='font-size:14px;'>Hover over each directory/file to see its size and number of chunks it contains.</span>",
155
  "x": 0.5,
156
  "xanchor": "center",
157
  },
 
191
 
192
def repo_files(r_type: str, r_id: str) -> tuple[dict, object]:
    """Summarize a repository's files per extension and build its treemap.

    Args:
        r_type: Repository type, passed to ``HF_API.repo_info`` as ``repo_type``.
        r_id: Repository id, passed to ``HF_API.repo_info`` as ``repo_id`` and
            to ``visualize_repo_treemap``.

    Returns:
        A ``(files, fig)`` tuple. ``files`` maps each extension to a dict with
        the keys ``"size"`` (total bytes), ``"chunks"`` (total chunk count)
        and ``"count"`` (number of files). ``fig`` is whatever
        ``visualize_repo_treemap`` returns.
    """
    r_info = HF_API.repo_info(repo_id=r_id, repo_type=r_type, files_metadata=True)
    fig = visualize_repo_treemap(r_info, r_id)
    files = {}
    for sibling in r_info.siblings:
        # Text after the last "."; a name without any "." is used whole.
        ext = sibling.rfilename.split(".")[-1]
        # Keep the key order size, chunks, count. NOTE(review): callers that
        # turn this dict into a DataFrame and rename columns by position
        # appear to rely on it - confirm before reordering.
        stats = files.setdefault(ext, {"size": 0, "chunks": 0, "count": 0})
        # A file without size metadata contributes 0 bytes instead of
        # breaking the running total with a None.
        size = sibling.size if sibling.size is not None else 0
        stats["size"] += size
        stats["chunks"] += count_chunks(size)
        stats["count"] += 1
    return files, fig
208
 
 
230
  return {}
231
  size = response.get("size")
232
  if size is not None:
233
+ repo_sizes[branch.name] = {
234
+ "size_in_bytes": size,
235
+ "size_in_chunks": count_chunks(size),
236
+ }
237
+
238
  return repo_sizes
239
 
240
 
 
254
  gr.Dataframe(visible=False),
255
  )
256
 
 
 
 
 
 
257
  # check if repo_sizes is just {}
258
  if not repo_sizes:
259
  r_sizes_component = gr.Dataframe(visible=False)
260
  b_block = gr.Row(visible=False)
261
  else:
262
+ r_sizes_df = pd.DataFrame(repo_sizes).T.reset_index(names="branch")
263
+ r_sizes_df["formatted_size"] = r_sizes_df["size_in_bytes"].apply(
264
+ format_repo_size
265
  )
266
+ r_sizes_df.columns = ["Branch", "size_in_bytes", "Chunks", "Size"]
 
267
  r_sizes_component = gr.Dataframe(
268
+ value=r_sizes_df[["Branch", "Size", "Chunks"]], visible=True
269
  )
270
  b_block = gr.Row(visible=True)
271
 
272
+ rf_sizes_df = (
273
+ pd.DataFrame(repo_files_info)
274
+ .T.reset_index(names="ext")
275
+ .sort_values(by="size", ascending=False)
 
 
 
 
 
276
  )
277
+ rf_sizes_df["formatted_size"] = rf_sizes_df["size"].apply(format_repo_size)
278
+ rf_sizes_df.columns = ["Extension", "bytes", "Chunks", "Count", "Size"]
279
  return (
280
  gr.Row(visible=True),
281
  gr.Dataframe(
282
+ value=rf_sizes_df[["Extension", "Count", "Size", "Chunks"]],
283
  visible=True,
284
  ),
285
  # gr.Plot(rf_sizes_plot, visible=True),
 
290
 
291
 
292
  with gr.Blocks(theme="ocean") as demo:
293
+ gr.Markdown("# Chunking Repos")
294
  gr.Markdown(
295
+ "Search for a model or dataset repository using the autocomplete below, select the repository type, and get back information about the repository's contents including the [number of chunks each file might be split into with Xet backed storage](https://huggingface.co/blog/from-files-to-chunks)."
296
  )
297
  with gr.Blocks():
298
  # repo_id = gr.Textbox(label="Repository ID", placeholder="123456")
 
310
  with gr.Blocks():
311
  with gr.Row(visible=False) as results_block:
312
  with gr.Column():
313
+ gr.Markdown("## Repo Info")
314
  file_info_plot = gr.Plot(visible=False)
 
 
 
315
  with gr.Row(visible=False) as branch_block:
316
  with gr.Column():
317
+ gr.Markdown("### Branch Sizes")
318
+ gr.Markdown(
319
+ "The size of each branch in the repository and how many chunks it might need (assuming no dedupe)."
320
+ )
321
  branch_sizes = gr.Dataframe(visible=False)
322
+ with gr.Row():
323
+ with gr.Column():
324
+ gr.Markdown("### File Sizes")
325
+ gr.Markdown(
326
+ "The cumulative size of each filetype in the repository (in the `main` branch) and how many chunks they might need (assuming no dedupe)."
327
+ )
328
+ file_info = gr.Dataframe(visible=False)
329
+ # file_info_plot = gr.Plot(visible=False)
330
 
331
  search_button.click(
332
  get_repo_info,