visualizing by chunks
Browse files- app.py +144 -4
- poetry.lock +1 -1
- pyproject.toml +1 -0
app.py
CHANGED
@@ -6,10 +6,146 @@ from huggingface_hub.errors import RepositoryNotFoundError
|
|
6 |
import pandas as pd
|
7 |
import plotly.express as px
|
8 |
from gradio_huggingfacehub_search import HuggingfaceHubSearch
|
|
|
|
|
9 |
|
10 |
HF_API = HfApi()
|
11 |
|
12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
def format_repo_size(r_size: int) -> str:
|
14 |
units = {0: "B", 1: "KB", 2: "MB", 3: "GB", 4: "TB", 5: "PB"}
|
15 |
order = 0
|
@@ -21,6 +157,7 @@ def format_repo_size(r_size: int) -> str:
|
|
21 |
|
22 |
def repo_files(r_type: str, r_id: str) -> dict:
|
23 |
r_info = HF_API.repo_info(repo_id=r_id, repo_type=r_type, files_metadata=True)
|
|
|
24 |
files = {}
|
25 |
for sibling in r_info.siblings:
|
26 |
ext = sibling.rfilename.split(".")[-1]
|
@@ -31,7 +168,7 @@ def repo_files(r_type: str, r_id: str) -> dict:
|
|
31 |
files[ext] = {}
|
32 |
files[ext]["size"] = sibling.size
|
33 |
files[ext]["count"] = 1
|
34 |
-
return files
|
35 |
|
36 |
|
37 |
def repo_size(r_type, r_id):
|
@@ -64,7 +201,7 @@ def repo_size(r_type, r_id):
|
|
64 |
def get_repo_info(r_type, r_id):
|
65 |
try:
|
66 |
repo_sizes = repo_size(r_type, r_id)
|
67 |
-
repo_files_info = repo_files(r_type, r_id)
|
68 |
except RepositoryNotFoundError:
|
69 |
gr.Warning(
|
70 |
"Repository not found. Make sure you've entered a valid repo ID and type that corresponds to the repository."
|
@@ -76,6 +213,7 @@ def get_repo_info(r_type, r_id):
|
|
76 |
gr.Row(visible=False),
|
77 |
gr.Dataframe(visible=False),
|
78 |
)
|
|
|
79 |
rf_sizes_df = (
|
80 |
pd.DataFrame(repo_files_info)
|
81 |
.T.reset_index(names="ext")
|
@@ -112,7 +250,8 @@ def get_repo_info(r_type, r_id):
|
|
112 |
value=rf_sizes_df[["Extension", "Count", "Size"]],
|
113 |
visible=True,
|
114 |
),
|
115 |
-
gr.Plot(rf_sizes_plot, visible=True),
|
|
|
116 |
b_block,
|
117 |
r_sizes_component,
|
118 |
)
|
@@ -140,9 +279,10 @@ with gr.Blocks(theme="ocean") as demo:
|
|
140 |
with gr.Row(visible=False) as results_block:
|
141 |
with gr.Column():
|
142 |
gr.Markdown("## File Information")
|
|
|
143 |
with gr.Row():
|
144 |
file_info = gr.Dataframe(visible=False)
|
145 |
-
file_info_plot = gr.Plot(visible=False)
|
146 |
with gr.Row(visible=False) as branch_block:
|
147 |
with gr.Column():
|
148 |
gr.Markdown("## Branch Sizes")
|
|
|
6 |
import pandas as pd
|
7 |
import plotly.express as px
|
8 |
from gradio_huggingfacehub_search import HuggingfaceHubSearch
|
9 |
+
from collections import defaultdict
|
10 |
+
import numpy as np
|
11 |
|
12 |
HF_API = HfApi()
|
13 |
|
14 |
|
15 |
+
def apply_power_scaling(sizes, exponent=0.2):
    """Compress a list of sizes by raising each one to ``exponent``.

    Entries that are ``None`` (size unknown) are not dropped: they are emitted
    as ``0`` so the result stays aligned index-for-index with ``sizes``.
    """
    scaled = []
    for raw_size in sizes:
        if raw_size is None:
            scaled.append(0)
        else:
            scaled.append(raw_size**exponent)
    return scaled
|
19 |
+
|
20 |
+
|
21 |
+
def count_chunks(sizes, chunk_size=64_000):
    """Return how many chunks each size occupies, always rounding up.

    Args:
        sizes: Iterable of sizes in bytes. ``None`` entries (size unknown)
            are reported as ``0`` chunks so the result stays aligned with
            the input.
        chunk_size: Bytes per chunk. Defaults to 64,000 (decimal "64KB",
            not 65,536).

    Returns:
        list[int]: One chunk count per input entry.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    # Ceiling division on integers: -(-a // b). Going through float division
    # and ceil() loses precision for very large byte counts and can
    # under-report by one chunk.
    return [
        0 if size is None else int(-(-size // chunk_size))
        for size in sizes
    ]
|
25 |
+
|
26 |
+
|
27 |
+
def _sibling_size(sibling):
    """Return the size recorded for one sibling, preferring its LFS size."""
    lfs = sibling.lfs
    if not lfs:
        return sibling.size
    # NOTE(review): depending on the installed huggingface_hub version the
    # LFS info appears to be either an attribute-style object or a plain
    # mapping -- confirm against the pinned release. Both are accepted here.
    lfs_size = getattr(lfs, "size", None)
    if lfs_size is None and isinstance(lfs, dict):
        lfs_size = lfs.get("size")
    return sibling.size if lfs_size is None else lfs_size


def build_hierarchy(siblings):
    """Build a nested dict mirroring the directory layout of ``siblings``.

    Args:
        siblings: Iterable of objects exposing ``rfilename`` (a
            "/"-separated path), ``size`` and ``lfs``.

    Returns:
        dict: Directory names map to nested dicts; file names map to the
        file's size (the LFS size when ``lfs`` is set, else ``size``).
    """
    hierarchy = {}

    for sibling in siblings:
        *directories, filename = sibling.rfilename.split("/")

        # Walk down, creating each intermediate directory on first sight.
        current_level = hierarchy
        for directory in directories:
            current_level = current_level.setdefault(directory, {})
        current_level[filename] = _sibling_size(sibling)

    return hierarchy
|
41 |
+
|
42 |
+
|
43 |
+
def calculate_directory_sizes(hierarchy):
    """Annotate every directory with the total size of its contents.

    Each nested dict (directory) in ``hierarchy`` is replaced in place by a
    copy whose first key, ``"__size__"``, holds the recursive sum of the
    files beneath it. ``"__size__"`` is therefore a reserved key.

    The function is safe to call repeatedly on the same tree: previously
    stored ``"__size__"`` entries are ignored when summing and are replaced
    by the freshly computed value. Files whose size is ``None`` count as 0.

    Args:
        hierarchy: Nested dict mapping directory names to dicts and file
            names to sizes.

    Returns:
        The total size of everything under ``hierarchy``.
    """
    total_size = 0

    # Re-binding existing keys while iterating is fine: the dict never
    # changes size during the loop.
    for key, value in hierarchy.items():
        if isinstance(value, dict):  # Directory
            dir_size = calculate_directory_sizes(value)
            contents = {k: v for k, v in value.items() if k != "__size__"}
            hierarchy[key] = {"__size__": dir_size, **contents}
            total_size += dir_size
        elif key != "__size__" and value is not None:  # File with known size
            total_size += value

    return total_size
|
59 |
+
|
60 |
+
|
61 |
+
def flatten_hierarchy_with_directory_sizes(hierarchy, root_name="Repository"):
    """Flatten a nested dict into parallel treemap lists under one root node.

    Directory sizes are summed from the files beneath them during the walk,
    so the input may be a raw hierarchy or one that already carries
    ``"__size__"`` annotations (those entries are skipped, not emitted).
    The input is not modified.

    Labels and parents are bare names, not full paths, so two entries with
    the same name in different directories produce identical labels.

    Args:
        hierarchy: Nested dict mapping directory names to dicts and file
            names to sizes (``None`` for unknown).
        root_name: Label of the synthetic root node.

    Returns:
        tuple[list, list, list]: ``(labels, parents, sizes)``. The root comes
        first with parent ``""``; each directory precedes its contents.
        Files of unknown size keep ``None`` in ``sizes`` and contribute 0 to
        directory totals.
    """
    labels = [root_name]
    parents = [""]  # Root has no parent
    sizes = [0]  # Patched with the grand total once the walk is done

    def process_level(current_hierarchy, current_parent):
        """Emit rows for one level and return that level's total size."""
        level_total = 0
        for key, value in current_hierarchy.items():
            if isinstance(value, dict):  # Directory
                slot = len(sizes)
                labels.append(key)
                parents.append(current_parent)
                sizes.append(0)  # Reserve the row so it precedes its children
                sizes[slot] = process_level(value, key)
                level_total += sizes[slot]
            elif key == "__size__":  # Stored annotation, not a file
                continue
            else:  # File
                labels.append(key)
                parents.append(current_parent)
                sizes.append(value)
                if value is not None:
                    level_total += value
        return level_total

    sizes[0] = process_level(hierarchy, root_name)

    return labels, parents, sizes
|
91 |
+
|
92 |
+
|
93 |
+
def visualize_repo_treemap(r_info):
    """Build a Plotly treemap of the repository's files and directories.

    Tile areas use power-scaled sizes so small files stay visible; the hover
    text shows the real (formatted) size and the number of chunks.

    Args:
        r_info: Repository info object whose ``siblings`` list the files.

    Returns:
        The Plotly figure.
    """
    hierarchy = build_hierarchy(r_info.siblings)

    # The flattening step derives directory and root totals on its own.
    # Annotating the hierarchy here as well made the root total count
    # directory contents twice, so no separate size pass is done.
    labels, parents, sizes = flatten_hierarchy_with_directory_sizes(hierarchy)

    # Tile area: power-scaled so large files do not swamp the small ones.
    scaled_sizes = apply_power_scaling(sizes)

    # Hover data: human-readable real size and chunk count per node.
    formatted_sizes = [
        format_repo_size(size) if size is not None else None for size in sizes
    ]
    chunks = count_chunks(sizes)

    # NOTE(review): labels double as node identifiers here; files sharing a
    # name in different directories would collide -- confirm, and consider
    # passing explicit unique ``ids`` (e.g. full paths).
    fig = px.treemap(
        names=labels,
        parents=parents,
        values=scaled_sizes,
        custom_data=[formatted_sizes, chunks],
    )

    fig.update_layout(
        title={
            "text": "Repo File Size Treemap<br><span style='font-size:14px;'>Hover over each directory or file to see the size of the file and its number of chunks</span>",
            "x": 0.5,  # Center the title and subtitle
            "xanchor": "center",
        },
        margin=dict(t=50, l=25, r=25, b=25),
    )

    fig.update_traces(
        hovertemplate=(
            "<b>%{label}</b><br>"  # File/Directory name
            "Size: %{customdata[0]}<br>"  # Real size, human-readable
            "# of Chunks: %{customdata[1]}"  # Chunk count
        ),
        root_color="lightgrey",
    )

    return fig
|
147 |
+
|
148 |
+
|
149 |
def format_repo_size(r_size: int) -> str:
|
150 |
units = {0: "B", 1: "KB", 2: "MB", 3: "GB", 4: "TB", 5: "PB"}
|
151 |
order = 0
|
|
|
157 |
|
158 |
def repo_files(r_type: str, r_id: str) -> dict:
|
159 |
r_info = HF_API.repo_info(repo_id=r_id, repo_type=r_type, files_metadata=True)
|
160 |
+
fig = visualize_repo_treemap(r_info)
|
161 |
files = {}
|
162 |
for sibling in r_info.siblings:
|
163 |
ext = sibling.rfilename.split(".")[-1]
|
|
|
168 |
files[ext] = {}
|
169 |
files[ext]["size"] = sibling.size
|
170 |
files[ext]["count"] = 1
|
171 |
+
return files, fig
|
172 |
|
173 |
|
174 |
def repo_size(r_type, r_id):
|
|
|
201 |
def get_repo_info(r_type, r_id):
|
202 |
try:
|
203 |
repo_sizes = repo_size(r_type, r_id)
|
204 |
+
repo_files_info, treemap_fig = repo_files(r_type, r_id)
|
205 |
except RepositoryNotFoundError:
|
206 |
gr.Warning(
|
207 |
"Repository not found. Make sure you've entered a valid repo ID and type that corresponds to the repository."
|
|
|
213 |
gr.Row(visible=False),
|
214 |
gr.Dataframe(visible=False),
|
215 |
)
|
216 |
+
|
217 |
rf_sizes_df = (
|
218 |
pd.DataFrame(repo_files_info)
|
219 |
.T.reset_index(names="ext")
|
|
|
250 |
value=rf_sizes_df[["Extension", "Count", "Size"]],
|
251 |
visible=True,
|
252 |
),
|
253 |
+
# gr.Plot(rf_sizes_plot, visible=True),
|
254 |
+
gr.Plot(treemap_fig, visible=True),
|
255 |
b_block,
|
256 |
r_sizes_component,
|
257 |
)
|
|
|
279 |
with gr.Row(visible=False) as results_block:
|
280 |
with gr.Column():
|
281 |
gr.Markdown("## File Information")
|
282 |
+
file_info_plot = gr.Plot(visible=False)
|
283 |
with gr.Row():
|
284 |
file_info = gr.Dataframe(visible=False)
|
285 |
+
# file_info_plot = gr.Plot(visible=False)
|
286 |
with gr.Row(visible=False) as branch_block:
|
287 |
with gr.Column():
|
288 |
gr.Markdown("## Branch Sizes")
|
poetry.lock
CHANGED
@@ -1539,4 +1539,4 @@ files = [
|
|
1539 |
[metadata]
|
1540 |
lock-version = "2.0"
|
1541 |
python-versions = "^3.12"
|
1542 |
-
content-hash = "
|
|
|
1539 |
[metadata]
|
1540 |
lock-version = "2.0"
|
1541 |
python-versions = "^3.12"
|
1542 |
+
content-hash = "44af67d58b93b4a2ee1847c1da93be8215965b566e188ec1aa04febb608388ea"
|
pyproject.toml
CHANGED
@@ -10,6 +10,7 @@ python = "^3.12"
|
|
10 |
gradio = "^5.5.0"
|
11 |
huggingface-hub = "^0.26.2"
|
12 |
plotly = "^5.24.1"
|
|
|
13 |
|
14 |
|
15 |
[build-system]
|
|
|
10 |
gradio = "^5.5.0"
|
11 |
huggingface-hub = "^0.26.2"
|
12 |
plotly = "^5.24.1"
|
13 |
+
numpy = "^2.1.3"
|
14 |
|
15 |
|
16 |
[build-system]
|