Spaces:
Running
Running
added new chart
Browse files
app.py
CHANGED
@@ -21,18 +21,25 @@ def process_dataset():
|
|
21 |
"""
|
22 |
|
23 |
file_counts_and_sizes = pd.read_parquet(
|
24 |
-
"hf://datasets/xet-team/lfs-analysis-data/file_counts_and_sizes.parquet"
|
25 |
)
|
26 |
repo_by_size_df = pd.read_parquet(
|
27 |
-
"hf://datasets/xet-team/lfs-analysis-data/repo_by_size.parquet"
|
28 |
)
|
29 |
unique_files_df = pd.read_parquet(
|
30 |
-
"hf://datasets/xet-team/lfs-analysis-data/repo_by_size_file_dedupe.parquet"
|
31 |
)
|
32 |
file_extensions = pd.read_parquet(
|
33 |
-
"hf://datasets/xet-team/lfs-analysis-data/file_extensions.parquet"
|
34 |
)
|
35 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
# Convert the total size to petabytes and format to two decimal places
|
37 |
file_counts_and_sizes = format_dataframe_size_column(
|
38 |
file_counts_and_sizes, "total_size"
|
@@ -55,7 +62,13 @@ def process_dataset():
|
|
55 |
# drop nas from the extension column
|
56 |
file_extensions = file_extensions.dropna(subset=["extension"])
|
57 |
|
58 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
59 |
|
60 |
|
61 |
def format_dataframe_size_column(_df, column_name):
|
@@ -196,9 +209,52 @@ def plot_total_sum(by_type_arr):
|
|
196 |
return fig
|
197 |
|
198 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
199 |
# Create a gradio blocks interface and launch a demo
|
200 |
with gr.Blocks() as demo:
|
201 |
-
df, file_df, by_type, by_extension = process_dataset()
|
202 |
|
203 |
# Add a heading
|
204 |
gr.Markdown("# Git LFS Analysis Across the Hub")
|
@@ -258,5 +314,18 @@ with gr.Blocks() as demo:
|
|
258 |
)
|
259 |
gr.Dataframe(by_extension_size)
|
260 |
|
|
|
|
|
|
|
|
|
261 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
262 |
demo.launch()
|
|
|
21 |
"""
|
22 |
|
23 |
file_counts_and_sizes = pd.read_parquet(
|
24 |
+
"hf://datasets/xet-team/lfs-analysis-data/transformed/file_counts_and_sizes.parquet"
|
25 |
)
|
26 |
repo_by_size_df = pd.read_parquet(
|
27 |
+
"hf://datasets/xet-team/lfs-analysis-data/transformed/repo_by_size.parquet"
|
28 |
)
|
29 |
unique_files_df = pd.read_parquet(
|
30 |
+
"hf://datasets/xet-team/lfs-analysis-data/transformed/repo_by_size_file_dedupe.parquet"
|
31 |
)
|
32 |
file_extensions = pd.read_parquet(
|
33 |
+
"hf://datasets/xet-team/lfs-analysis-data/transformed/file_extensions.parquet"
|
34 |
)
|
35 |
|
36 |
+
# read the file_extensions_by_month.parquet file
|
37 |
+
file_extensions_by_month = pd.read_parquet(
|
38 |
+
"hf://datasets/xet-team/lfs-analysis-data/transformed/file_extensions_by_month.parquet"
|
39 |
+
)
|
40 |
+
# drop any nas
|
41 |
+
file_extensions_by_month = file_extensions_by_month.dropna()
|
42 |
+
|
43 |
# Convert the total size to petabytes and format to two decimal places
|
44 |
file_counts_and_sizes = format_dataframe_size_column(
|
45 |
file_counts_and_sizes, "total_size"
|
|
|
62 |
# drop nas from the extension column
|
63 |
file_extensions = file_extensions.dropna(subset=["extension"])
|
64 |
|
65 |
+
return (
|
66 |
+
repo_by_size_df,
|
67 |
+
unique_files_df,
|
68 |
+
file_counts_and_sizes,
|
69 |
+
file_extensions,
|
70 |
+
file_extensions_by_month,
|
71 |
+
)
|
72 |
|
73 |
|
74 |
def format_dataframe_size_column(_df, column_name):
|
|
|
209 |
return fig
|
210 |
|
211 |
|
212 |
+
def filter_by_extension_month(_df, _extension):
    """
    Filter the monthly upload data by file extension and build a Plotly
    line plot of bytes uploaded to the Hub per month, in petabytes.

    Parameters:
        _df (DataFrame): Input data; assumed to carry "year", "month",
            "extension", and "total_size" (bytes) columns — confirm
            against the file_extensions_by_month parquet schema.
        _extension (list[str]): Extensions to keep. If empty, or exactly
            ["All"], no filtering is applied.

    Returns:
        fig (Figure): The Plotly figure object representing the line plot.
    """
    # Always work on a copy: _df is held in gr.State and shared across
    # interactions, so mutating it in place (the "date" column below)
    # would leak state between calls. The original filtered branch
    # already copied; the pass-through branch did not.
    if (len(_extension) == 1 and "All" in _extension) or len(_extension) == 0:
        _df = _df.copy()
    else:
        _df = _df[_df["extension"].isin(_extension)].copy()

    # Convert year and month into a datetime column and sort by date
    _df["date"] = pd.to_datetime(_df[["year", "month"]].assign(day=1))
    _df = _df.sort_values(by="date")

    # Pivot so each extension becomes a column of total sizes indexed by
    # date, which makes the data plotable as a time series.
    pivot_df = _df.pivot_table(
        index="date", columns="extension", values="total_size"
    ).fillna(0)

    # Plot one line per extension, skipping empty-string labels.
    palette = px.colors.qualitative.Alphabet
    fig = go.Figure()
    for i, column in enumerate(pivot_df.columns):
        if column != "":
            fig.add_trace(
                go.Scatter(
                    x=pivot_df.index,
                    y=pivot_df[column] / 1e15,  # bytes -> petabytes
                    mode="lines",
                    name=column.capitalize(),
                    # Wrap around the palette instead of raising
                    # IndexError when there are more extensions than
                    # colors (Alphabet has 26 entries).
                    line=dict(color=palette[i % len(palette)]),
                )
            )

    return fig
|
253 |
+
|
254 |
+
|
255 |
# Create a gradio blocks interface and launch a demo
|
256 |
with gr.Blocks() as demo:
|
257 |
+
df, file_df, by_type, by_extension, by_extension_month = process_dataset()
|
258 |
|
259 |
# Add a heading
|
260 |
gr.Markdown("# Git LFS Analysis Across the Hub")
|
|
|
314 |
)
|
315 |
gr.Dataframe(by_extension_size)
|
316 |
|
317 |
+
gr.Markdown("## File Extension Growth Over Time")
|
318 |
+
gr.Markdown(
|
319 |
+
"Want to dig a little deeper? Select a file extension to see how many bytes of that type were uploaded to the Hub each month."
|
320 |
+
)
|
321 |
|
322 |
+
# build a dropdown using the unique values in the extension column
|
323 |
+
extension = gr.Dropdown(
|
324 |
+
choices=by_extension["extension"].unique().tolist(),
|
325 |
+
value="All",
|
326 |
+
allow_custom_value=True,
|
327 |
+
multiselect=True,
|
328 |
+
)
|
329 |
+
_by_extension_month = gr.State(by_extension_month)
|
330 |
+
gr.Plot(filter_by_extension_month, inputs=[_by_extension_month, extension])
|
331 |
demo.launch()
|