jsulz HF staff committed on
Commit
adbb8fc
1 Parent(s): ed3daa4

visualizing by chunks

Browse files
Files changed (3) hide show
  1. app.py +144 -4
  2. poetry.lock +1 -1
  3. pyproject.toml +1 -0
app.py CHANGED
@@ -6,10 +6,146 @@ from huggingface_hub.errors import RepositoryNotFoundError
6
  import pandas as pd
7
  import plotly.express as px
8
  from gradio_huggingfacehub_search import HuggingfaceHubSearch
 
 
9
 
10
  HF_API = HfApi()
11
 
12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  def format_repo_size(r_size: int) -> str:
14
  units = {0: "B", 1: "KB", 2: "MB", 3: "GB", 4: "TB", 5: "PB"}
15
  order = 0
@@ -21,6 +157,7 @@ def format_repo_size(r_size: int) -> str:
21
 
22
  def repo_files(r_type: str, r_id: str) -> dict:
23
  r_info = HF_API.repo_info(repo_id=r_id, repo_type=r_type, files_metadata=True)
 
24
  files = {}
25
  for sibling in r_info.siblings:
26
  ext = sibling.rfilename.split(".")[-1]
@@ -31,7 +168,7 @@ def repo_files(r_type: str, r_id: str) -> dict:
31
  files[ext] = {}
32
  files[ext]["size"] = sibling.size
33
  files[ext]["count"] = 1
34
- return files
35
 
36
 
37
  def repo_size(r_type, r_id):
@@ -64,7 +201,7 @@ def repo_size(r_type, r_id):
64
  def get_repo_info(r_type, r_id):
65
  try:
66
  repo_sizes = repo_size(r_type, r_id)
67
- repo_files_info = repo_files(r_type, r_id)
68
  except RepositoryNotFoundError:
69
  gr.Warning(
70
  "Repository not found. Make sure you've entered a valid repo ID and type that corresponds to the repository."
@@ -76,6 +213,7 @@ def get_repo_info(r_type, r_id):
76
  gr.Row(visible=False),
77
  gr.Dataframe(visible=False),
78
  )
 
79
  rf_sizes_df = (
80
  pd.DataFrame(repo_files_info)
81
  .T.reset_index(names="ext")
@@ -112,7 +250,8 @@ def get_repo_info(r_type, r_id):
112
  value=rf_sizes_df[["Extension", "Count", "Size"]],
113
  visible=True,
114
  ),
115
- gr.Plot(rf_sizes_plot, visible=True),
 
116
  b_block,
117
  r_sizes_component,
118
  )
@@ -140,9 +279,10 @@ with gr.Blocks(theme="ocean") as demo:
140
  with gr.Row(visible=False) as results_block:
141
  with gr.Column():
142
  gr.Markdown("## File Information")
 
143
  with gr.Row():
144
  file_info = gr.Dataframe(visible=False)
145
- file_info_plot = gr.Plot(visible=False)
146
  with gr.Row(visible=False) as branch_block:
147
  with gr.Column():
148
  gr.Markdown("## Branch Sizes")
 
6
  import pandas as pd
7
  import plotly.express as px
8
  from gradio_huggingfacehub_search import HuggingfaceHubSearch
9
+ from collections import defaultdict
10
+ import numpy as np
11
 
12
  HF_API = HfApi()
13
 
14
 
15
def apply_power_scaling(sizes, exponent=0.2):
    """Apply custom power scaling to the sizes.

    Every size is raised to ``exponent``. An entry that is ``None`` is not
    skipped: it is emitted as ``0`` so the result stays aligned, index for
    index, with the input.

    Args:
        sizes: Iterable of numeric sizes; individual entries may be ``None``.
        exponent: Power applied to each size. Defaults to ``0.2``.

    Returns:
        A list with one scaled value per input entry.
    """
    return [0 if size is None else size**exponent for size in sizes]
19
+
20
+
21
def count_chunks(sizes, chunk_size=64_000):
    """Count how many fixed-size chunks each size occupies, rounding up.

    Args:
        sizes: Iterable of sizes in bytes; individual entries may be ``None``.
        chunk_size: Bytes per chunk. Defaults to ``64_000`` (decimal "64KB").
            NOTE(review): if 64 KiB (65_536 bytes) was intended, pass it
            explicitly - the intended unit cannot be confirmed from this file.

    Returns:
        A list of ints, one per input entry. ``None`` entries yield ``0``;
        any non-zero remainder counts as one additional chunk.
    """
    # Ceiling via floor division of the negated value keeps integer inputs in
    # exact integer arithmetic instead of round-tripping through a float.
    return [
        0 if size is None else int(-(-size // chunk_size))
        for size in sizes
    ]
25
+
26
+
27
def build_hierarchy(siblings):
    """Build a nested dict mirroring the directory layout of a repo's files.

    Each sibling's ``rfilename`` is split on ``"/"``. Every leading component
    becomes a nested dict (a directory) and the final component becomes a key
    whose value is the file size.

    Args:
        siblings: Iterable of objects exposing ``rfilename``, ``size`` and
            ``lfs`` attributes.

    Returns:
        A plain ``dict`` mapping directory names to nested dicts and file
        names to sizes. A size is whatever the sibling reports, so it can be
        ``None`` if the sibling carries no size.
    """
    hierarchy = {}

    for sibling in siblings:
        *directories, filename = sibling.rfilename.split("/")

        lfs = sibling.lfs
        if lfs:
            # NOTE(review): the LFS metadata appears to be dict-like in some
            # huggingface_hub releases and attribute-based in others - accept
            # both; confirm against the pinned version.
            size = lfs["size"] if isinstance(lfs, dict) else lfs.size
        else:
            size = sibling.size

        current_level = hierarchy
        for directory in directories:  # Descend, creating directories lazily
            current_level = current_level.setdefault(directory, {})
        current_level[filename] = size

    return hierarchy
41
+
42
+
43
def calculate_directory_sizes(hierarchy):
    """Recursively annotate every directory with the total size of its contents.

    Each nested dict (directory) in ``hierarchy`` is replaced by a new dict
    whose first key is ``"__size__"`` (the summed size of everything beneath
    it) followed by its original entries. The pass is idempotent: a
    ``"__size__"`` entry left by an earlier run is treated as metadata, not as
    a file, so calling this again yields the same totals.

    Args:
        hierarchy: Nested dict mapping directory names to dicts and file names
            to sizes (a size may be ``None``). Modified in place.

    Returns:
        The total size of all files under ``hierarchy``; ``None`` sizes count
        as 0.
    """
    total_size = 0

    # Iterate over a snapshot because directory entries are rebound below.
    for key, value in list(hierarchy.items()):
        if key == "__size__":
            # Annotation from a previous pass - counting it would double the
            # directory's size.
            continue
        if isinstance(value, dict):  # Directory
            dir_size = calculate_directory_sizes(value)
            children = {k: v for k, v in value.items() if k != "__size__"}
            hierarchy[key] = {"__size__": dir_size, **children}
            total_size += dir_size
        elif value is not None:  # File with a known size
            total_size += value

    return total_size
59
+
60
+
61
def flatten_hierarchy_with_directory_sizes(hierarchy, root_name="Repository"):
    """Flatten a nested dict into parallel label/parent/size lists for a treemap.

    The tree is walked once, depth first. Directory totals are computed during
    that walk, so the input does not need to be annotated beforehand; any
    ``"__size__"`` entries already present are ignored rather than trusted or
    removed. The input is not modified.

    Args:
        hierarchy: Nested dict mapping directory names to dicts and file names
            to sizes (a size may be ``None``).
        root_name: Label of the synthetic root node. Defaults to
            ``"Repository"``.

    Returns:
        A ``(labels, parents, sizes)`` tuple of equal-length lists. Index 0 is
        the root (parent ``""``, size = total of all files). Each directory
        precedes its children and carries the summed size of its contents;
        each file carries its own size unchanged, including ``None``.

    NOTE(review): nodes are identified by bare name only, so two entries with
    the same name in different directories are indistinguishable to a consumer
    that keys on labels - confirm how the plotting layer handles that.
    """
    labels = [root_name]
    parents = [""]  # Root has no parent
    sizes = [0]  # Placeholder, filled in once the walk has produced the total

    def process_level(current_hierarchy, current_parent):
        """Append every node below ``current_parent``; return their total size."""
        level_total = 0
        for key, value in current_hierarchy.items():
            if key == "__size__":
                continue  # Metadata from an earlier sizing pass, not a node
            labels.append(key)
            parents.append(current_parent)
            if isinstance(value, dict):  # Directory
                # Reserve the slot now so the directory precedes its children,
                # then back-fill it when the subtree total is known.
                slot = len(sizes)
                sizes.append(0)
                sizes[slot] = process_level(value, key)
                level_total += sizes[slot]
            else:  # File
                sizes.append(value)
                if value is not None:
                    level_total += value
        return level_total

    sizes[0] = process_level(hierarchy, root_name)

    return labels, parents, sizes
91
+
92
+
93
def visualize_repo_treemap(r_info):
    """Build a Plotly treemap of a repository's files and directories.

    Tile areas use power-scaled sizes, while the hover text shows the
    formatted original size and the chunk count of each node.

    Args:
        r_info: Repository info object whose ``siblings`` attribute lists the
            repo's files.

    Returns:
        The configured Plotly figure.
    """
    hierarchy = build_hierarchy(r_info.siblings)

    # flatten_hierarchy_with_directory_sizes derives the directory totals
    # itself, so a separate sizing pass beforehand is redundant.
    labels, parents, sizes = flatten_hierarchy_with_directory_sizes(hierarchy)

    # Tile area: compressed sizes, so small files stay visible next to huge ones
    scaled_sizes = apply_power_scaling(sizes)

    # Hover data: human-readable original sizes and chunk counts
    formatted_sizes = [
        format_repo_size(size) if size is not None else None for size in sizes
    ]
    chunks = count_chunks(sizes)

    # No ``title`` here: the layout update below sets the full title text.
    fig = px.treemap(
        names=labels,
        parents=parents,
        values=scaled_sizes,
        custom_data=[formatted_sizes, chunks],
    )

    fig.update_layout(
        title={
            "text": "Repo File Size Treemap<br><span style='font-size:14px;'>Hover over each directory or file to see the size of the file and its number of chunks</span>",
            "x": 0.5,  # Center the title and subtitle
            "xanchor": "center",
        },
        margin=dict(t=50, l=25, r=25, b=25),
    )

    fig.update_traces(
        hovertemplate=(
            "<b>%{label}</b><br>"  # File/Directory name
            "Size: %{customdata[0]}<br>"  # Formatted (unscaled) size
            "# of Chunks: %{customdata[1]}"  # Chunk count
        ),
        root_color="lightgrey",
    )

    return fig
147
+
148
+
149
  def format_repo_size(r_size: int) -> str:
150
  units = {0: "B", 1: "KB", 2: "MB", 3: "GB", 4: "TB", 5: "PB"}
151
  order = 0
 
157
 
158
  def repo_files(r_type: str, r_id: str) -> dict:
159
  r_info = HF_API.repo_info(repo_id=r_id, repo_type=r_type, files_metadata=True)
160
+ fig = visualize_repo_treemap(r_info)
161
  files = {}
162
  for sibling in r_info.siblings:
163
  ext = sibling.rfilename.split(".")[-1]
 
168
  files[ext] = {}
169
  files[ext]["size"] = sibling.size
170
  files[ext]["count"] = 1
171
+ return files, fig
172
 
173
 
174
  def repo_size(r_type, r_id):
 
201
  def get_repo_info(r_type, r_id):
202
  try:
203
  repo_sizes = repo_size(r_type, r_id)
204
+ repo_files_info, treemap_fig = repo_files(r_type, r_id)
205
  except RepositoryNotFoundError:
206
  gr.Warning(
207
  "Repository not found. Make sure you've entered a valid repo ID and type that corresponds to the repository."
 
213
  gr.Row(visible=False),
214
  gr.Dataframe(visible=False),
215
  )
216
+
217
  rf_sizes_df = (
218
  pd.DataFrame(repo_files_info)
219
  .T.reset_index(names="ext")
 
250
  value=rf_sizes_df[["Extension", "Count", "Size"]],
251
  visible=True,
252
  ),
253
+ # gr.Plot(rf_sizes_plot, visible=True),
254
+ gr.Plot(treemap_fig, visible=True),
255
  b_block,
256
  r_sizes_component,
257
  )
 
279
  with gr.Row(visible=False) as results_block:
280
  with gr.Column():
281
  gr.Markdown("## File Information")
282
+ file_info_plot = gr.Plot(visible=False)
283
  with gr.Row():
284
  file_info = gr.Dataframe(visible=False)
285
+ # file_info_plot = gr.Plot(visible=False)
286
  with gr.Row(visible=False) as branch_block:
287
  with gr.Column():
288
  gr.Markdown("## Branch Sizes")
poetry.lock CHANGED
@@ -1539,4 +1539,4 @@ files = [
1539
  [metadata]
1540
  lock-version = "2.0"
1541
  python-versions = "^3.12"
1542
- content-hash = "06e3ab80abee5517a984d2efc1f476e92534906505389d7d51f1e6f127b34de6"
 
1539
  [metadata]
1540
  lock-version = "2.0"
1541
  python-versions = "^3.12"
1542
+ content-hash = "44af67d58b93b4a2ee1847c1da93be8215965b566e188ec1aa04febb608388ea"
pyproject.toml CHANGED
@@ -10,6 +10,7 @@ python = "^3.12"
10
  gradio = "^5.5.0"
11
  huggingface-hub = "^0.26.2"
12
  plotly = "^5.24.1"
 
13
 
14
 
15
  [build-system]
 
10
  gradio = "^5.5.0"
11
  huggingface-hub = "^0.26.2"
12
  plotly = "^5.24.1"
13
+ numpy = "^2.1.3"
14
 
15
 
16
  [build-system]