osanseviero committed on
Commit 46c15e8
1 Parent(s): de86128
Files changed (2)
  1. models.py +312 -16
  2. requirements.txt +2 -1
models.py CHANGED
@@ -3,14 +3,16 @@ import pandas as pd
 from datasets import load_dataset
 from ast import literal_eval
 import altair as alt
+import plotly.graph_objs as go
+import matplotlib.pyplot as plt
 
-nlp_tasks = ["text-classification", "text-generation", "text2text-generation", "token-classification", "fill-mask", "question-answering"
+nlp_tasks = ["text-classification", "text-generation", "text2text-generation", "token-classification", "fill-mask", "question-answering",
     "translation", "conversational", "sentence-similarity", "summarization", "multiple-choice", "zero-shot-classification", "table-question-answering"
 ]
 audio_tasks = ["automatic-speech-recognition", "audio-classification", "text-to-speech", "audio-to-audio", "voice-activity-detection"]
 cv_tasks = ["image-classification", "image-segmentation", "zero-shot-image-classification", "image-to-image", "unconditional-image-generation", "object-detection"]
 multimodal = ["feature-extraction", "text-to-image", "visual-question-answering", "image-to-text", "document-question-answering"]
-tabular = ["tabular-clasification", "tabular-regression"]
+tabular = ["tabular-classification", "tabular-regression"]
 
 modalities = {
     "nlp": nlp_tasks,
@@ -52,10 +54,23 @@ base = st.selectbox(
     supported_revisions)
 data = process_dataset(base)
 
+def eval_tags(row):
+    tags = row["tags"]
+    if tags == "none" or tags == [] or tags == "{}":
+        return []
+    if tags[0] != "[":
+        tags = str([tags])
+    val = literal_eval(tags)
+    if isinstance(val, dict):
+        return []
+    return val
+
+data["tags"] = data.apply(eval_tags, axis=1)
+
 total_samples = data.shape[0]
 st.metric(label="Total models", value=total_samples)
 
-tab1, tab2, tab3, tab4, tab5, tab6, tab7 = st.tabs(["Language", "License", "Pipeline", "Discussion Features", "Libraries", "Model Cards", "Super users"])
+tab1, tab2, tab3, tab4, tab5, tab6, tab7, tab8 = st.tabs(["Language", "License", "Pipeline", "Discussion Features", "Libraries", "Model Cards", "Super users", "Raw Data"])
 
 with tab1:
     st.header("Languages info")
@@ -78,9 +93,9 @@ with tab1:
         return leng
 
     data["languages"] = data.apply(make_list, axis=1)
-    data["repos_count"] = data.apply(language_count, axis=1)
+    data["language_count"] = data.apply(language_count, axis=1)
 
-    models_with_langs = data[data["repos_count"] > 0]
+    models_with_langs = data[data["language_count"] > 0]
     langs = models_with_langs["languages"].explode()
     langs = langs[langs != {}]
     total_langs = len(langs.unique())
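`repos_count` is renamed to `language_count`, which matches what the column stores: how many languages each repo declares. The explode pattern used throughout this tab, on a toy frame:

    import pandas as pd

    df = pd.DataFrame({"languages": [["en"], ["en", "fr", "de"], []]})
    df["language_count"] = df["languages"].apply(len)

    multilingual = df[df["language_count"] > 1]       # repos with 2+ languages
    per_language = df["languages"].explode().value_counts()
    # per_language: en -> 2, fr -> 1, de -> 1  (the empty list explodes to NaN)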
@@ -93,7 +108,8 @@ with tab1:
     with col3:
         st.metric(label="Total Unique Languages", value=total_langs)
 
-    st.subheader("Distribution of languages per model repo")
+    st.subheader("Count of languages per model repo")
+    st.text("Some repos are for multiple languages, so the count is greater than 1")
     linguality = st.selectbox(
         'All or just Multilingual',
         ["All", "Just Multilingual", "Three or more languages"])
@@ -104,11 +120,11 @@ with tab1:
     elif linguality == "Three or more languages":
         filter = 2
 
-    models_with_langs = data[data["repos_count"] > filter]
-    df1 = models_with_langs['repos_count'].value_counts()
+    models_with_langs = data[data["language_count"] > filter]
+    df1 = models_with_langs['language_count'].value_counts()
     st.bar_chart(df1)
 
-    st.subheader("Distribution of repos per language")
+    st.subheader("Most frequent languages")
     linguality_2 = st.selectbox(
         'All or filtered',
         ["All", "No English", "Remove top 10"])
@@ -121,7 +137,7 @@ with tab1:
     else:
         filter = 2
 
-    models_with_langs = data[data["repos_count"] > 0]
+    models_with_langs = data[data["language_count"] > 0]
     langs = models_with_langs["languages"].explode()
    langs = langs[langs != {}]
 
@@ -187,9 +203,8 @@ with tab2:
         x='counts',
         y=alt.X('license', sort=None)
     ))
-    st.text("There are some edge cases, as old repos using lists of licenses. We are working on fixing this.")
+    st.text("There are some edge cases, as old repos using lists of licenses.")
 
-
     st.subheader("Raw Data")
     d = data["license"].value_counts().rename_axis("license").to_frame('counts').reset_index()
     st.dataframe(d)
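This tab uses the file's recurring charting pattern: a value_counts() Series reshaped into a two-column frame for Altair. As a sketch (it uses alt.Y for the y channel, the usual channel class there):

    import altair as alt

    d = (
        data["license"]
        .value_counts()          # Series: license -> count, sorted descending
        .rename_axis("license")  # name the index so reset_index keeps it
        .to_frame("counts")
        .reset_index()           # columns: ["license", "counts"]
    )
    chart = alt.Chart(d).mark_bar().encode(
        x="counts",
        y=alt.Y("license", sort=None),   # sort=None keeps the descending order
    )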
@@ -197,18 +212,23 @@ with tab2:
 with tab3:
     st.header("Pipeline info")
 
+    tags = data["tags"].explode()
+    tags = tags[tags.notna()].value_counts().rename_axis("tag").to_frame('counts').reset_index()
+    s = tags["tag"]
+    s = s[s.apply(type) == str]
+    unique_tags = len(s.unique())
+
     no_pipeline_count = data["pipeline"].isna().sum()
     col1, col2, col3 = st.columns(3)
     with col1:
-        st.metric(label="Pipeline Specified", value=total_samples-no_pipeline_count)
+        st.metric(label="# models that have any pipeline", value=total_samples-no_pipeline_count)
     with col2:
         st.metric(label="No pipeline Specified", value=no_pipeline_count)
     with col3:
         st.metric(label="Total Unique Pipelines", value=len(data["pipeline"].unique()))
 
-    st.subheader("Distribution of pipelines per model repo")
     pipeline_filter = st.selectbox(
-        'All or filtered',
+        'Modalities',
         ["All", "NLP", "CV", "Audio", "RL", "Multimodal", "Tabular"])
 
     filter = 0
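The new tag metrics explode the list-valued `tags` column, so each (repo, tag) pair becomes a row before counting. On toy data:

    import pandas as pd

    data = pd.DataFrame({"tags": [["pytorch", "en"], ["pytorch"], []]})
    exploded = data["tags"].explode()        # the empty list becomes NaN
    counts = exploded[exploded.notna()].value_counts()
    # counts: pytorch -> 2, en -> 1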
@@ -227,30 +247,306 @@ with tab3:
     elif pipeline_filter == "Tabular":
         filter = 6
 
-    d = data["pipeline"].value_counts().rename_axis("pipeline").to_frame('counts').reset_index()
+    st.subheader("High-level metrics")
+    filtered_data = data[data['pipeline'].notna()]
+
+    if filter == 1:
+        filtered_data = data[data["modality"] == "nlp"]
+    elif filter == 2:
+        filtered_data = data[data["modality"] == "cv"]
+    elif filter == 3:
+        filtered_data = data[data["modality"] == "audio"]
+    elif filter == 4:
+        filtered_data = data[data["modality"] == "rl"]
+    elif filter == 5:
+        filtered_data = data[data["modality"] == "multimodal"]
+    elif filter == 6:
+        filtered_data = data[data["modality"] == "tabular"]
+
+    col1, col2, col3 = st.columns(3)
+    with col1:
+        p = st.selectbox(
+            'What pipeline do you want to see?',
+            ["all", *filtered_data["pipeline"].unique()]
+        )
+    with col2:
+        l = st.selectbox(
+            'What library do you want to see?',
+            ["all", *filtered_data["library"].unique()]
+        )
+    with col3:
+        f = st.selectbox(
+            'What framework support? (transformers)',
+            ["all", "py", "tf", "jax"]
+        )
+
+    col1, col2 = st.columns(2)
+    with col1:
+        filt = st.multiselect(
+            label="Tags (All by default)",
+            options=s.unique(),
+            default=None)
+    with col2:
+        o = st.selectbox(
+            label="Operation (for tags)",
+            options=["Any", "All", "None"]
+        )
+
+    def filter_fn(row):
+        tags = row["tags"]
+        tags[:] = [d for d in tags if isinstance(d, str)]
+        if o == "All":
+            if all(elem in tags for elem in filt):
+                return True
+
+        s1 = set(tags)
+        s2 = set(filt)
+        if o == "Any":
+            if bool(s1 & s2):
+                return True
+        if o == "None":
+            if len(s1.intersection(s2)) == 0:
+                return True
+        return False
+
+    if p != "all":
+        filtered_data = filtered_data[filtered_data["pipeline"] == p]
+    if l != "all":
+        filtered_data = filtered_data[filtered_data["library"] == l]
+    if f != "all":
+        if f == "py":
+            filtered_data = filtered_data[filtered_data["pytorch"] == 1]
+        elif f == "tf":
+            filtered_data = filtered_data[filtered_data["tensorflow"] == 1]
+        elif f == "jax":
+            filtered_data = filtered_data[filtered_data["jax"] == 1]
+    if filt != []:
+        filtered_data = filtered_data[filtered_data.apply(filter_fn, axis=1)]
+
+    d = filtered_data["pipeline"].value_counts().rename_axis("pipeline").to_frame('counts').reset_index()
+    columns_of_interest = ["downloads_30d", "likes", "pytorch", "tensorflow", "jax"]
+    grouped_data = filtered_data.groupby("pipeline").sum()[columns_of_interest]
+    final_data = pd.merge(
+        d, grouped_data, how="outer", on="pipeline"
+    )
+    sums = grouped_data.sum()
+
+    col1, col2, col3 = st.columns(3)
+    with col1:
+        st.metric(label="Total models", value=filtered_data.shape[0])
+    with col2:
+        st.metric(label="Cumulative Downloads (30d)", value=sums["downloads_30d"])
+    with col3:
+        st.metric(label="Cumulative likes", value=sums["likes"])
+
+    col1, col2, col3 = st.columns(3)
+    with col1:
+        st.metric(label="Total in PT", value=sums["pytorch"])
+    with col2:
+        st.metric(label="Total in TF", value=sums["tensorflow"])
+    with col3:
+        st.metric(label="Total in JAX", value=sums["jax"])
+
+    st.metric(label="Unique Tags", value=unique_tags)
 
+    st.subheader("Count of models per pipeline")
     st.write(alt.Chart(d).mark_bar().encode(
         x='counts',
         y=alt.X('pipeline', sort=None)
     ))
 
+    st.subheader("Aggregated data")
+    st.dataframe(final_data)
 
+    st.subheader("Most common model types (specific to transformers)")
+    d = filtered_data["model_type"].value_counts().rename_axis("model_type").to_frame('counts').reset_index()
+    d = d.iloc[:15]
+    st.write(alt.Chart(d).mark_bar().encode(
+        x='counts',
+        y=alt.X('model_type', sort=None)
+    ))
 
+    st.subheader("Most common library types (Learn more in library tab)")
+    d = filtered_data["library"].value_counts().rename_axis("library").to_frame('counts').reset_index().head(15)
+    st.write(alt.Chart(d).mark_bar().encode(
+        x='counts',
+        y=alt.X('library', sort=None)
+    ))
+
+    st.subheader("Tags by count")
+    tags = filtered_data["tags"].explode()
+    tags = tags[tags.notna()].value_counts().rename_axis("tag").to_frame('counts').reset_index()
+    st.write(alt.Chart(tags.head(30)).mark_bar().encode(
+        x='counts',
+        y=alt.X('tag', sort=None)
+    ))
+
+    st.subheader("Raw Data")
+    columns_of_interest = [
+        "repo_id", "author", "model_type", "files_per_repo", "library",
+        "downloads_30d", "likes", "pytorch", "tensorflow", "jax"]
+    raw_data = filtered_data[columns_of_interest]
+    st.dataframe(raw_data)
 
+# todo : add activity metric
 
+with tab4:
+    st.header("Discussions Tab info")
 
+    columns_of_interest = ["prs_count", "prs_open", "prs_merged", "prs_closed", "discussions_count", "discussions_open", "discussions_closed"]
+    sums = data[columns_of_interest].sum()
 
+    col1, col2, col3, col4 = st.columns(4)
+    with col1:
+        st.metric(label="Total PRs", value=sums["prs_count"])
+    with col2:
+        st.metric(label="PRs opened", value=sums["prs_open"])
+    with col3:
+        st.metric(label="PRs merged", value=sums["prs_merged"])
+    with col4:
+        st.metric(label="PRs closed", value=sums["prs_closed"])
 
+    col1, col2, col3 = st.columns(3)
+    with col1:
+        st.metric(label="Total discussions", value=sums["discussions_count"])
+    with col2:
+        st.metric(label="Discussions open", value=sums["discussions_open"])
+    with col3:
+        st.metric(label="Discussions closed", value=sums["discussions_closed"])
 
+    filtered_data = data[["repo_id", "prs_count", "prs_open", "prs_merged", "prs_closed", "discussions_count", "discussions_open", "discussions_closed"]].sort_values("prs_count", ascending=False).reset_index(drop=True)
+    st.dataframe(filtered_data)
 
+with tab5:
+    st.header("Library info")
 
+    no_library_count = data["library"].isna().sum()
+    col1, col2, col3 = st.columns(3)
+    with col1:
+        st.metric(label="# models that have any library", value=total_samples-no_library_count)
+    with col2:
+        st.metric(label="No library Specified", value=no_library_count)
+    with col3:
+        st.metric(label="Total Unique library", value=len(data["library"].unique()))
 
+    st.subheader("High-level metrics")
+    filtered_data = data[data['library'].notna()]
+
+    col1, col2 = st.columns(2)
+    with col1:
+        lib = st.selectbox(
+            'What library do you want to see? ',
+            ["all", *filtered_data["library"].unique()]
+        )
+    with col2:
+        pip = st.selectbox(
+            'What pipeline do you want to see? ',
+            ["all", *filtered_data["pipeline"].unique()]
+        )
+
+    if pip != "all":
+        filtered_data = filtered_data[filtered_data["pipeline"] == pip]
+    if lib != "all":
+        filtered_data = filtered_data[filtered_data["library"] == lib]
 
+    d = filtered_data["library"].value_counts().rename_axis("library").to_frame('counts').reset_index()
+    grouped_data = filtered_data.groupby("library").sum()[["downloads_30d", "likes"]]
+    final_data = pd.merge(
+        d, grouped_data, how="outer", on="library"
+    )
+    sums = grouped_data.sum()
 
+    col1, col2, col3 = st.columns(3)
+    with col1:
+        st.metric(label="Total models", value=filtered_data.shape[0])
+    with col2:
+        st.metric(label="Cumulative Downloads (30d)", value=sums["downloads_30d"])
+    with col3:
+        st.metric(label="Cumulative likes", value=sums["likes"])
+
+    st.subheader("Most common library types (Learn more in library tab)")
+    d = filtered_data["library"].value_counts().rename_axis("library").to_frame('counts').reset_index().head(15)
+    st.write(alt.Chart(d).mark_bar().encode(
+        x='counts',
+        y=alt.X('library', sort=None)
+    ))
 
+    st.subheader("Aggregated Data")
+    st.dataframe(final_data)
+
+    st.subheader("Raw Data")
+    columns_of_interest = ["repo_id", "author", "files_per_repo", "library", "downloads_30d", "likes"]
+    filtered_data = filtered_data[columns_of_interest]
+    st.dataframe(filtered_data)
+
+with tab6:
+    st.header("Model cards")
+
+    columns_of_interest = ["has_model_index", "has_metadata", "has_text", "text_length"]
+    rows = data.shape[0]
+
+    cond = data["has_model_index"] | data["has_text"]
+    with_model_card = data[cond]
+    c_model_card = with_model_card.shape[0]
+    st.subheader("High-level metrics")
+    col1, col2, col3 = st.columns(3)
+    with col1:
+        st.metric(label="# models with model card file", value=c_model_card)
+    with col2:
+        st.metric(label="# models without model card file", value=rows-c_model_card)
+
+    with_index = data["has_model_index"].sum()
+    with col1:
+        st.metric(label="# models with model index", value=with_index)
+    with col2:
+        st.metric(label="# models without model index", value=rows-with_index)
+
+    with_text = data["has_text"]
+    with col1:
+        st.metric(label="# models with model card text", value=with_text.sum())
+    with col2:
+        st.metric(label="# models without model card text", value=rows-with_text.sum())
+
+    st.subheader("Length (chars) of model card content")
+    fig, ax = plt.subplots()
+    ax = data["length_bins"].value_counts().plot.bar()
+    st.metric(label="# average length of model card (chars)", value=data[with_text]["text_length"].mean())
+    st.pyplot(fig)
+
+    st.subheader("Tags (Read more in Pipeline tab)")
+    tags = data["tags"].explode()
+    tags = tags[tags.notna()].value_counts().rename_axis("tag").to_frame('counts').reset_index()
+    st.write(alt.Chart(tags.head(30)).mark_bar().encode(
+        x='counts',
+        y=alt.X('tag', sort=None)
+    ))
+
+with tab7:
+    st.header("Authors")
+    st.text("This info corresponds to the repos owned by the authors")
+    authors = data.groupby("author").sum().drop(["text_length", "Unnamed: 0", "language_count"], axis=1).sort_values("downloads_30d", ascending=False)
+    d = data["author"].value_counts().rename_axis("author").to_frame('counts').reset_index()
+    final_data = pd.merge(
+        d, authors, how="outer", on="author"
+    )
+    st.dataframe(final_data)
+
+with tab8:
+    st.header("Raw Data")
+    d = data.astype(str)
+    st.dataframe(d)
 
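`filter_fn` above implements the three tag-matching modes with set operations. A standalone sketch (`matches` is illustrative; `filt` and `o` stand in for the widget values):

    def matches(tags, filt, o):
        # Same logic as filter_fn, without the DataFrame row plumbing.
        tags = [t for t in tags if isinstance(t, str)]
        s1, s2 = set(tags), set(filt)
        if o == "All":
            return s2 <= s1           # every selected tag present
        if o == "Any":
            return bool(s1 & s2)      # at least one selected tag present
        if o == "None":
            return not (s1 & s2)      # no selected tag present
        return False

    assert matches(["pytorch", "en"], ["pytorch"], "Any")
    assert not matches(["tf"], ["pytorch", "en"], "All")
    assert matches(["tf"], ["pytorch"], "None")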
requirements.txt CHANGED
@@ -1 +1,2 @@
-datasets
+datasets
+plotly