wrapping
app.py
CHANGED
@@ -17,8 +17,10 @@ def apply_power_scaling(sizes: list, exponent=0.2) -> list:
     return [size**exponent if size is not None else 0 for size in sizes]


-def count_chunks(sizes: list) -> list:
+def count_chunks(sizes: list | int) -> list:
     """Count the number of chunks, which are 64KB each in size; always roundup"""
+    if isinstance(sizes, int):
+        return int(np.ceil(sizes / 64_000))
     return [int(np.ceil(size / 64_000)) if size is not None else 0 for size in sizes]


@@ -99,7 +101,7 @@ def flatten_hierarchy(hierarchy, root_name="Repository"):
     return labels, parents, sizes, ids


-def visualize_repo_treemap(r_info):
+def visualize_repo_treemap(r_info: dict, r_id: str) -> px.treemap:
     """Visualizes the repository as a treemap with directory sizes and human-readable tooltips."""
     siblings = r_info.siblings
     hierarchy = build_hierarchy(siblings)
@@ -108,7 +110,7 @@ def visualize_repo_treemap(r_info):
     calculate_directory_sizes(hierarchy)

     # Flatten the hierarchy for Plotly
-    labels, parents, sizes, ids = flatten_hierarchy(hierarchy)
+    labels, parents, sizes, ids = flatten_hierarchy(hierarchy, r_id)

     # Scale for vix
     scaled_sizes = apply_power_scaling(sizes)
@@ -138,7 +140,7 @@ def visualize_repo_treemap(r_info):
         values=scaled_sizes,
         color=normalized_colors,
         color_continuous_scale=colorscale,
-        title="
+        title=f"{r_id} by Chunks",
         custom_data=[formatted_sizes, chunks],
         height=1000,
         ids=ids,
@@ -149,7 +151,7 @@ def visualize_repo_treemap(r_info):
     # Add subtitle by updating the layout
     fig.update_layout(
         title={
-            "text": "
+            "text": f"{r_id} file and chunk treemap<br><span style='font-size:14px;'>Hover over each directory/file to see its size and number of chunks it contains.</span>",
             "x": 0.5,
             "xanchor": "center",
         },
@@ -189,16 +191,18 @@ def format_repo_size(r_size: int) -> str:

 def repo_files(r_type: str, r_id: str) -> dict:
     r_info = HF_API.repo_info(repo_id=r_id, repo_type=r_type, files_metadata=True)
-    fig = visualize_repo_treemap(r_info)
+    fig = visualize_repo_treemap(r_info, r_id)
     files = {}
     for sibling in r_info.siblings:
         ext = sibling.rfilename.split(".")[-1]
         if ext in files:
             files[ext]["size"] += sibling.size
+            files[ext]["chunks"] += count_chunks(sibling.size)
             files[ext]["count"] += 1
         else:
             files[ext] = {}
             files[ext]["size"] = sibling.size
+            files[ext]["chunks"] = count_chunks(sibling.size)
             files[ext]["count"] = 1
     return files, fig

@@ -226,7 +230,11 @@ def repo_size(r_type, r_id):
             return {}
         size = response.get("size")
         if size is not None:
-            repo_sizes[branch.name] =
+            repo_sizes[branch.name] = {
+                "size_in_bytes": size,
+                "size_in_chunks": count_chunks(size),
+            }
+
     return repo_sizes


@@ -246,40 +254,32 @@ def get_repo_info(r_type, r_id):
             gr.Dataframe(visible=False),
         )

-    rf_sizes_df = (
-        pd.DataFrame(repo_files_info)
-        .T.reset_index(names="ext")
-        .sort_values(by="size", ascending=False)
-    )
     # check if repo_sizes is just {}
     if not repo_sizes:
         r_sizes_component = gr.Dataframe(visible=False)
         b_block = gr.Row(visible=False)
     else:
-        r_sizes_df = pd.DataFrame(repo_sizes
-
+        r_sizes_df = pd.DataFrame(repo_sizes).T.reset_index(names="branch")
+        r_sizes_df["formatted_size"] = r_sizes_df["size_in_bytes"].apply(
+            format_repo_size
         )
-        r_sizes_df["
-        r_sizes_df.columns = ["Branch", "bytes", "Size"]
+        r_sizes_df.columns = ["Branch", "size_in_bytes", "Chunks", "Size"]
         r_sizes_component = gr.Dataframe(
-            value=r_sizes_df[["Branch", "Size"]], visible=True
+            value=r_sizes_df[["Branch", "Size", "Chunks"]], visible=True
         )
         b_block = gr.Row(visible=True)

-    rf_sizes_df
-
-
-
-        values="bytes",
-        names="Extension",
-        hover_data=["Size"],
-        title=f"File Distribution in {r_id}",
-        hole=0.3,
+    rf_sizes_df = (
+        pd.DataFrame(repo_files_info)
+        .T.reset_index(names="ext")
+        .sort_values(by="size", ascending=False)
     )
+    rf_sizes_df["formatted_size"] = rf_sizes_df["size"].apply(format_repo_size)
+    rf_sizes_df.columns = ["Extension", "bytes", "Chunks", "Count", "Size"]
     return (
         gr.Row(visible=True),
         gr.Dataframe(
-            value=rf_sizes_df[["Extension", "Count", "Size"]],
+            value=rf_sizes_df[["Extension", "Count", "Size", "Chunks"]],
             visible=True,
         ),
         # gr.Plot(rf_sizes_plot, visible=True),
@@ -290,9 +290,9 @@ def get_repo_info(r_type, r_id):


 with gr.Blocks(theme="ocean") as demo:
-    gr.Markdown("#
+    gr.Markdown("# Chunking Repos")
     gr.Markdown(
-        "Search for a model or dataset repository using the autocomplete below, select the repository type, and get back information about the repository's
+        "Search for a model or dataset repository using the autocomplete below, select the repository type, and get back information about the repository's contents including the [number of chunks each file might be split into with Xet backed storage](https://huggingface.co/blog/from-files-to-chunks)."
     )
     with gr.Blocks():
         # repo_id = gr.Textbox(label="Repository ID", placeholder="123456")
@@ -310,15 +310,23 @@ with gr.Blocks(theme="ocean") as demo:
     with gr.Blocks():
         with gr.Row(visible=False) as results_block:
             with gr.Column():
-                gr.Markdown("##
+                gr.Markdown("## Repo Info")
                 file_info_plot = gr.Plot(visible=False)
-                with gr.Row():
-                    file_info = gr.Dataframe(visible=False)
-                    # file_info_plot = gr.Plot(visible=False)
         with gr.Row(visible=False) as branch_block:
             with gr.Column():
-                gr.Markdown("
+                gr.Markdown("### Branch Sizes")
+                gr.Markdown(
+                    "The size of each branch in the repository and how many chunks it might need (assuming no dedupe)."
+                )
                 branch_sizes = gr.Dataframe(visible=False)
+        with gr.Row():
+            with gr.Column():
+                gr.Markdown("### File Sizes")
+                gr.Markdown(
+                    "The cumulative size of each filetype in the repository (in the `main` branch) and how many chunks they might need (assuming no dedupe)."
+                )
+                file_info = gr.Dataframe(visible=False)
+                # file_info_plot = gr.Plot(visible=False)

     search_button.click(
         get_repo_info,