osanseviero committed on
Commit
1581d20
1 Parent(s): 44bb8e2
Files changed (1)
  1. models.py +553 -547
models.py CHANGED
@@ -6,564 +6,570 @@ import altair as alt
import plotly.graph_objs as go
import matplotlib.pyplot as plt

- nlp_tasks = ["text-classification", "text-generation", "text2text-generation", "token-classification", "fill-mask", "question-answering",
- "translation", "conversational", "sentence-similarity", "summarization", "multiple-choice", "zero-shot-classification", "table-question-answering"
- ]
- audio_tasks = ["automatic-speech-recognition", "audio-classification", "text-to-speech", "audio-to-audio", "voice-activity-detection"]
- cv_tasks = ["image-classification", "image-segmentation", "zero-shot-image-classification", "image-to-image", "unconditional-image-generation", "object-detection"]
- multimodal = ["feature-extraction", "text-to-image", "visual-question-answering", "image-to-text", "document-question-answering"]
- tabular = ["tabular-classification", "tabular-regression"]
-
- modalities = {
- "nlp": nlp_tasks,
- "audio": audio_tasks,
- "cv": cv_tasks,
- "multimodal": multimodal,
- "tabular": tabular,
- "rl": ["reinforcement-learning"]
- }
-
- def modality(row):
- pipeline = row["pipeline"]
- for modality, tasks in modalities.items():
- if pipeline in tasks:
- return modality
- if type(pipeline) == "str":
- return "unk_modality"
- return None
-
- supported_revisions = ["27_09_22"]
-
- st.cache(allow_output_mutation=True)
- def process_dataset(version):
- # Load dataset at specified revision
- dataset = load_dataset("open-source-metrics/model-repos-stats", revision=version)
-
- # Convert to pandas dataframe
- data = dataset["train"].to_pandas()
-
- # Add modality column
- data["modality"] = data.apply(modality, axis=1)
-
- # Bin the model card length into some bins
- data["length_bins"] = pd.cut(data["text_length"], [0, 200, 1000, 2000, 3000, 4000, 5000, 7500, 10000, 20000, 50000])
-
- return data
-
- base = st.selectbox(
- 'What revision do you want to use',
- supported_revisions)
- data = process_dataset(base)
-
- def eval_tags(row):
- tags = row["tags"]
- if tags == "none" or tags == [] or tags == "{}":
- return []
- if tags[0] != "[":
- tags = str([tags])
- val = literal_eval(tags)
- if isinstance(val, dict):
- return []
- return val
-
- data["tags"] = data.apply(eval_tags, axis=1)
-
- total_samples = data.shape[0]
- st.metric(label="Total models", value=total_samples)
-
- # Tabs don't work in Spaces st version
- #tab1, tab2, tab3, tab4, tab5, tab6, tab7, tab8 = st.tabs(["Language", "License", "Pipeline", "Discussion Features", "Libraries", "Model Cards", "Super users", "Raw Data"])
-
- tab = st.selectbox(
- 'Topic of interest',
- ["Language", "License", "Pipeline", "Discussion Features", "Libraries", "Model Cards", "Super users", "Raw Data"])
-
- # with tab1:
- if tab == "Language":
- st.header("Languages info")
-
- data.loc[data.languages == "False", 'languages'] = None
- data.loc[data.languages == {}, 'languages'] = None
-
- no_lang_count = data["languages"].isna().sum()
- data["languages"] = data["languages"].fillna('none')
-
- def make_list(row):
- languages = row["languages"]
- if languages == "none":
return []
- return literal_eval(languages)
-
- def language_count(row):
- languages = row["languages"]
- leng = len(languages)
- return leng
-
- data["languages"] = data.apply(make_list, axis=1)
- data["language_count"] = data.apply(language_count, axis=1)
-
- models_with_langs = data[data["language_count"] > 0]
- langs = models_with_langs["languages"].explode()
- langs = langs[langs != {}]
- total_langs = len(langs.unique())
-
- col1, col2, col3 = st.columns(3)
- with col1:
- st.metric(label="Language Specified", value=total_samples-no_lang_count)
- with col2:
- st.metric(label="No Language Specified", value=no_lang_count)
- with col3:
- st.metric(label="Total Unique Languages", value=total_langs)
-
- st.subheader("Count of languages per model repo")
- st.text("Some repos are for multiple languages, so the count is greater than 1")
- linguality = st.selectbox(
- 'All or just Multilingual',
- ["All", "Just Multilingual", "Three or more languages"])
-
- filter = 0
- if linguality == "Just Multilingual":
- filter = 1
- elif linguality == "Three or more languages":
- filter = 2
-
- models_with_langs = data[data["language_count"] > filter]
- df1 = models_with_langs['language_count'].value_counts()
- st.bar_chart(df1)
-
- st.subheader("Most frequent languages")
- linguality_2 = st.selectbox(
- 'All or filtered',
- ["All", "No English", "Remove top 10"])
-
- filter = 0
- if linguality_2 == "All":
filter = 0
- elif linguality_2 == "No English":
- filter = 1
- else:
- filter = 2
-
- models_with_langs = data[data["language_count"] > 0]
- langs = models_with_langs["languages"].explode()
- langs = langs[langs != {}]
-
- d = langs.value_counts().rename_axis("language").to_frame('counts').reset_index()
- if filter == 1:
- d = d.iloc[1:]
- elif filter == 2:
- d = d.iloc[10:]
-
- # Just keep top 25 to avoid vertical scroll
- d = d.iloc[:25]
-
- st.write(alt.Chart(d).mark_bar().encode(
- x='counts',
- y=alt.X('language', sort=None)
- ))
-
- st.subheader("Raw Data")
- col1, col2 = st.columns(2)
- with col1:
- st.dataframe(df1)
- with col2:
d = langs.value_counts().rename_axis("language").to_frame('counts').reset_index()
- st.dataframe(d)
-
-
-
- #with tab2:
- if tab == "License":
- st.header("License info")
-
- no_license_count = data["license"].isna().sum()
- col1, col2, col3 = st.columns(3)
- with col1:
- st.metric(label="License Specified", value=total_samples-no_license_count)
- with col2:
- st.metric(label="No license Specified", value=no_license_count)
- with col3:
- st.metric(label="Total Unique Licenses", value=len(data["license"].unique()))
-
- st.subheader("Distribution of licenses per model repo")
- license_filter = st.selectbox(
- 'All or filtered',
- ["All", "No Apache 2.0", "Remove top 10"])
-
- filter = 0
- if license_filter == "All":
filter = 0
- elif license_filter == "No Apache 2.0":
- filter = 1
- else:
- filter = 2
-
- d = data["license"].value_counts().rename_axis("license").to_frame('counts').reset_index()
- if filter == 1:
- d = d.iloc[1:]
- elif filter == 2:
- d = d.iloc[10:]
-
- # Just keep top 25 to avoid vertical scroll
- d = d.iloc[:25]
-
- st.write(alt.Chart(d).mark_bar().encode(
- x='counts',
- y=alt.X('license', sort=None)
- ))
- st.text("There are some edge cases, as old repos using lists of licenses.")
-
- st.subheader("Raw Data")
- d = data["license"].value_counts().rename_axis("license").to_frame('counts').reset_index()
- st.dataframe(d)
-
- #with tab3:
- if tab == "Pipeline":
- st.header("Pipeline info")
-
- tags = data["tags"].explode()
- tags = tags[tags.notna()].value_counts().rename_axis("tag").to_frame('counts').reset_index()
- s = tags["tag"]
- s = s[s.apply(type) == str]
- unique_tags = len(s.unique())
-
- no_pipeline_count = data["pipeline"].isna().sum()
- col1, col2, col3 = st.columns(3)
- with col1:
- st.metric(label="# models that have any pipeline", value=total_samples-no_pipeline_count)
- with col2:
- st.metric(label="No pipeline Specified", value=no_pipeline_count)
- with col3:
- st.metric(label="Total Unique Pipelines", value=len(data["pipeline"].unique()))
-
- pipeline_filter = st.selectbox(
- 'Modalities',
- ["All", "NLP", "CV", "Audio", "RL", "Multimodal", "Tabular"])
-
- filter = 0
- if pipeline_filter == "All":
filter = 0
- elif pipeline_filter == "NLP":
- filter = 1
- elif pipeline_filter == "CV":
- filter = 2
- elif pipeline_filter == "Audio":
- filter = 3
- elif pipeline_filter == "RL":
- filter = 4
- elif pipeline_filter == "Multimodal":
- filter = 5
- elif pipeline_filter == "Tabular":
- filter = 6
-
- st.subheader("High-level metrics")
- filtered_data = data[data['pipeline'].notna()]
-
- if filter == 1:
- filtered_data = data[data["modality"] == "nlp"]
- elif filter == 2:
- filtered_data = data[data["modality"] == "cv"]
- elif filter == 3:
- filtered_data = data[data["modality"] == "audio"]
- elif filter == 4:
- filtered_data = data[data["modality"] == "rl"]
- elif filter == 5:
- filtered_data = data[data["modality"] == "multimodal"]
- elif filter == 6:
- filtered_data = data[data["modality"] == "tabular"]
-
- col1, col2, col3 = st.columns(3)
- with col1:
- p = st.selectbox(
- 'What pipeline do you want to see?',
- ["all", *filtered_data["pipeline"].unique()]
- )
- with col2:
- l = st.selectbox(
- 'What library do you want to see?',
- ["all", *filtered_data["library"].unique()]
- )
- with col3:
- f = st.selectbox(
- 'What framework support? (transformers)',
- ["all", "py", "tf", "jax"]
- )
-
- col1, col2 = st.columns(2)
- with col1:
- filt = st.multiselect(
- label="Tags (All by default)",
- options=s.unique(),
- default=None)
- with col2:
- o = st.selectbox(
- label="Operation (for tags)",
- options=["Any", "All", "None"]
)
-
- def filter_fn(row):
- tags = row["tags"]
- tags[:] = [d for d in tags if isinstance(d, str)]
- if o == "All":
- if all(elem in tags for elem in filt):
- return True
-
- s1 = set(tags)
- s2 = set(filt)
- if o == "Any":
- if bool(s1 & s2):
- return True
- if o == "None":
- if len(s1.intersection(s2)) == 0:
- return True
- return False
-
-
- if p != "all":
- filtered_data = filtered_data[filtered_data["pipeline"] == p]
- if l != "all":
- filtered_data = filtered_data[filtered_data["library"] == l]
- if f != "all":
- if f == "py":
- filtered_data = filtered_data[filtered_data["pytorch"] == 1]
- elif f == "tf":
- filtered_data = filtered_data[filtered_data["tensorflow"] == 1]
- elif f == "jax":
- filtered_data = filtered_data[filtered_data["jax"] == 1]
- if filt != []:
- filtered_data = filtered_data[filtered_data.apply(filter_fn, axis=1)]
-
-
- d = filtered_data["pipeline"].value_counts().rename_axis("pipeline").to_frame('counts').reset_index()
- columns_of_interest = ["downloads_30d", "likes", "pytorch", "tensorflow", "jax"]
- grouped_data = filtered_data.groupby("pipeline").sum()[columns_of_interest]
- final_data = pd.merge(
- d, grouped_data, how="outer", on="pipeline"
- )
- sums = grouped_data.sum()
-
- col1, col2, col3 = st.columns(3)
- with col1:
- st.metric(label="Total models", value=filtered_data.shape[0])
- with col2:
- st.metric(label="Cumulative Downloads (30d)", value=sums["downloads_30d"])
- with col3:
- st.metric(label="Cumulative likes", value=sums["likes"])
-
- col1, col2, col3 = st.columns(3)
- with col1:
- st.metric(label="Total in PT", value=sums["pytorch"])
- with col2:
- st.metric(label="Total in TF", value=sums["tensorflow"])
- with col3:
- st.metric(label="Total in JAX", value=sums["jax"])
-
- st.metric(label="Unique Tags", value=unique_tags)
-
-
-
- st.subheader("Count of models per pipeline")
- st.write(alt.Chart(d).mark_bar().encode(
- x='counts',
- y=alt.X('pipeline', sort=None)
- ))
-
- st.subheader("Aggregated data")
- st.dataframe(final_data)
-
- st.subheader("Most common model types (specific to transformers")
- d = filtered_data["model_type"].value_counts().rename_axis("model_type").to_frame('counts').reset_index()
- d = d.iloc[:15]
- st.write(alt.Chart(d).mark_bar().encode(
- x='counts',
- y=alt.X('model_type', sort=None)
- ))
-
- st.subheader("Most common library types (Learn more in library tab)")
- d = filtered_data["library"].value_counts().rename_axis("library").to_frame('counts').reset_index().head(15)
- st.write(alt.Chart(d).mark_bar().encode(
- x='counts',
- y=alt.X('library', sort=None)
- ))
-
- st.subheader("Tags by count")
- tags = filtered_data["tags"].explode()
- tags = tags[tags.notna()].value_counts().rename_axis("tag").to_frame('counts').reset_index()
- st.write(alt.Chart(tags.head(30)).mark_bar().encode(
- x='counts',
- y=alt.X('tag', sort=None)
- ))
-
- st.subheader("Raw Data")
- columns_of_interest = [
- "repo_id", "author", "model_type", "files_per_repo", "library",
- "downloads_30d", "likes", "pytorch", "tensorflow", "jax"]
- raw_data = filtered_data[columns_of_interest]
- st.dataframe(raw_data)
-
-
-
- # todo : add activity metric
-
-
- #with tab4:
- if tab == "Discussion Features":
- st.header("Discussions Tab info")
-
- columns_of_interest = ["prs_count", "prs_open", "prs_merged", "prs_closed", "discussions_count", "discussions_open", "discussions_closed"]
- sums = data[columns_of_interest].sum()
-
- col1, col2, col3, col4 = st.columns(4)
- with col1:
- st.metric(label="Total PRs", value=sums["prs_count"])
- with col2:
- st.metric(label="PRs opened", value=sums["prs_open"])
- with col3:
- st.metric(label="PRs merged", value=sums["prs_merged"])
- with col4:
- st.metric(label="PRs closed", value=sums["prs_closed"])
-
- col1, col2, col3 = st.columns(3)
- with col1:
- st.metric(label="Total discussions", value=sums["discussions_count"])
- with col2:
- st.metric(label="Discussions open", value=sums["discussions_open"])
- with col3:
- st.metric(label="Discussions closed", value=sums["discussions_closed"])
-
- filtered_data = data[["repo_id", "prs_count", "prs_open", "prs_merged", "prs_closed", "discussions_count", "discussions_open", "discussions_closed"]].sort_values("prs_count", ascending=False).reset_index(drop=True)
- st.dataframe(filtered_data)
-
- #with tab5:
- if tab == "Libraries":
- st.header("Library info")
-
- no_library_count = data["library"].isna().sum()
- col1, col2, col3 = st.columns(3)
- with col1:
- st.metric(label="# models that have any library", value=total_samples-no_library_count)
- with col2:
- st.metric(label="No library Specified", value=no_library_count)
- with col3:
- st.metric(label="Total Unique library", value=len(data["library"].unique()))
-
-
- st.subheader("High-level metrics")
- filtered_data = data[data['library'].notna()]
-
- col1, col2 = st.columns(2)
- with col1:
- lib = st.selectbox(
- 'What library do you want to see? ',
- ["all", *filtered_data["library"].unique()]
)
- with col2:
- pip = st.selectbox(
- 'What pipeline do you want to see? ',
- ["all", *filtered_data["pipeline"].unique()]
)
- if pip != "all":
- filtered_data = filtered_data[filtered_data["pipeline"] == pip]
- if lib != "all":
- filtered_data = filtered_data[filtered_data["library"] == lib]
-
-
- d = filtered_data["library"].value_counts().rename_axis("library").to_frame('counts').reset_index()
- grouped_data = filtered_data.groupby("library").sum()[["downloads_30d", "likes"]]
- final_data = pd.merge(
- d, grouped_data, how="outer", on="library"
- )
- sums = grouped_data.sum()
-
- col1, col2, col3 = st.columns(3)
- with col1:
- st.metric(label="Total models", value=filtered_data.shape[0])
- with col2:
- st.metric(label="Cumulative Downloads (30d)", value=sums["downloads_30d"])
- with col3:
- st.metric(label="Cumulative likes", value=sums["likes"])
-
- st.subheader("Most common library types (Learn more in library tab)")
- d = filtered_data["library"].value_counts().rename_axis("library").to_frame('counts').reset_index().head(15)
- st.write(alt.Chart(d).mark_bar().encode(
- x='counts',
- y=alt.X('library', sort=None)
- ))
-
-
-
- st.subheader("Aggregated Data")
- st.dataframe(final_data)
-
- st.subheader("Raw Data")
- columns_of_interest = ["repo_id", "author", "files_per_repo", "library", "downloads_30d", "likes"]
- filtered_data = filtered_data[columns_of_interest]
- st.dataframe(filtered_data)
-
- #with tab6:
- if tab == "Model Cards":
- st.header("Model cards")
-
- columns_of_interest = ["has_model_index", "has_metadata", "has_text", "text_length"]
- rows = data.shape[0]
-
- cond = data["has_model_index"] | data["has_text"]
- with_model_card = data[cond]
- c_model_card = with_model_card.shape[0]
- st.subheader("High-level metrics")
- col1, col2, col3 = st.columns(3)
- with col1:
- st.metric(label="# models with model card file", value=c_model_card)
- with col2:
- st.metric(label="# models without model card file", value=rows-c_model_card)
-
- with_index = data["has_model_index"].sum()
- with col1:
- st.metric(label="# models with model index", value=with_index)
- with col2:
- st.metric(label="# models without model index", value=rows-with_index)
-
- with_text = data["has_text"]
- with col1:
- st.metric(label="# models with model card text", value=with_text.sum())
- with col2:
- st.metric(label="# models without model card text", value=rows-with_text.sum())
-
-
- st.subheader("Length (chars) of model card content")
- fig, ax = plt.subplots()
- ax = data["length_bins"].value_counts().plot.bar()
- st.metric(label="# average length of model card (chars)", value=data[with_text]["text_length"].mean())
- st.pyplot(fig)
-
- st.subheader("Tags (Read more in Pipeline tab)")
- tags = data["tags"].explode()
- tags = tags[tags.notna()].value_counts().rename_axis("tag").to_frame('counts').reset_index()
- st.write(alt.Chart(tags.head(30)).mark_bar().encode(
- x='counts',
- y=alt.X('tag', sort=None)
- ))
-
- #with tab7:
- if tab == "Super Users":
- st.header("Authors")
- st.text("This info corresponds to the repos owned by the authors")
- authors = data.groupby("author").sum().drop(["text_length", "Unnamed: 0", "language_count"], axis=1).sort_values("downloads_30d", ascending=False)
- d = data["author"].value_counts().rename_axis("author").to_frame('counts').reset_index()
- final_data = pd.merge(
- d, authors, how="outer", on="author"
- )
- st.dataframe(final_data)
-
- #with tab2:
- if tab == "Raw Data":
- st.header("Raw Data")
- d = data.astype(str)
- st.dataframe(d)
-
+ def main():
+ print("Build")
+ nlp_tasks = ["text-classification", "text-generation", "text2text-generation", "token-classification", "fill-mask", "question-answering",
+ "translation", "conversational", "sentence-similarity", "summarization", "multiple-choice", "zero-shot-classification", "table-question-answering"
+ ]
+ audio_tasks = ["automatic-speech-recognition", "audio-classification", "text-to-speech", "audio-to-audio", "voice-activity-detection"]
+ cv_tasks = ["image-classification", "image-segmentation", "zero-shot-image-classification", "image-to-image", "unconditional-image-generation", "object-detection"]
+ multimodal = ["feature-extraction", "text-to-image", "visual-question-answering", "image-to-text", "document-question-answering"]
+ tabular = ["tabular-classification", "tabular-regression"]
+
+ modalities = {
+ "nlp": nlp_tasks,
+ "audio": audio_tasks,
+ "cv": cv_tasks,
+ "multimodal": multimodal,
+ "tabular": tabular,
+ "rl": ["reinforcement-learning"]
+ }
+
+ def modality(row):
+ pipeline = row["pipeline"]
+ for modality, tasks in modalities.items():
+ if pipeline in tasks:
+ return modality
+ if isinstance(pipeline, str):
+ return "unk_modality"
+ return None
+
+ supported_revisions = ["27_09_22"]
+
+ @st.cache(allow_output_mutation=True)
+ def process_dataset(version):
+ # Load dataset at specified revision
+ dataset = load_dataset("open-source-metrics/model-repos-stats", revision=version)
+
+ # Convert to pandas dataframe
+ data = dataset["train"].to_pandas()
+
+ # Add modality column
+ data["modality"] = data.apply(modality, axis=1)
+
+ # Bin the model card length into some bins
+ data["length_bins"] = pd.cut(data["text_length"], [0, 200, 1000, 2000, 3000, 4000, 5000, 7500, 10000, 20000, 50000])
+
+ return data
+
+ base = st.selectbox(
+ 'What revision do you want to use',
+ supported_revisions)
+ data = process_dataset(base)
+
+ def eval_tags(row):
+ tags = row["tags"]
+ if tags == "none" or tags == [] or tags == "{}":
+ return []
+ if tags[0] != "[":
+ tags = str([tags])
+ val = literal_eval(tags)
+ if isinstance(val, dict):
return []
+ return val
+
+ data["tags"] = data.apply(eval_tags, axis=1)
+
+ total_samples = data.shape[0]
+ st.metric(label="Total models", value=total_samples)
+
+ # Tabs don't work in Spaces st version
+ #tab1, tab2, tab3, tab4, tab5, tab6, tab7, tab8 = st.tabs(["Language", "License", "Pipeline", "Discussion Features", "Libraries", "Model Cards", "Super users", "Raw Data"])
+
+ tab = st.selectbox(
+ 'Topic of interest',
+ ["Language", "License", "Pipeline", "Discussion Features", "Libraries", "Model Cards", "Super Users", "Raw Data"])
+
+ # with tab1:
+ if tab == "Language":
+ st.header("Languages info")
+
+ data.loc[data.languages == "False", 'languages'] = None
+ data.loc[data.languages == {}, 'languages'] = None
+
+ no_lang_count = data["languages"].isna().sum()
+ data["languages"] = data["languages"].fillna('none')
+
+ def make_list(row):
+ languages = row["languages"]
+ if languages == "none":
+ return []
+ return literal_eval(languages)
+
+ def language_count(row):
+ languages = row["languages"]
+ leng = len(languages)
+ return leng
+
+ data["languages"] = data.apply(make_list, axis=1)
+ data["language_count"] = data.apply(language_count, axis=1)
+
+ models_with_langs = data[data["language_count"] > 0]
+ langs = models_with_langs["languages"].explode()
+ langs = langs[langs != {}]
+ total_langs = len(langs.unique())
+
+ col1, col2, col3 = st.columns(3)
+ with col1:
+ st.metric(label="Language Specified", value=total_samples-no_lang_count)
+ with col2:
+ st.metric(label="No Language Specified", value=no_lang_count)
+ with col3:
+ st.metric(label="Total Unique Languages", value=total_langs)
+
+ st.subheader("Count of languages per model repo")
+ st.text("Some repos are for multiple languages, so the count is greater than 1")
+ linguality = st.selectbox(
+ 'All or just Multilingual',
+ ["All", "Just Multilingual", "Three or more languages"])
+
filter = 0
+ st.text("To fix: this only takes the language count into account; it misses the multilingual tag")
+ if linguality == "Just Multilingual":
+ filter = 1
+ elif linguality == "Three or more languages":
+ filter = 2
+
+ models_with_langs = data[data["language_count"] > filter]
+ df1 = models_with_langs['language_count'].value_counts()
+ st.bar_chart(df1)
+
+ st.subheader("Most frequent languages")
+ linguality_2 = st.selectbox(
+ 'All or filtered',
+ ["All", "No English", "Remove top 10"])
+
+ filter = 0
+ if linguality_2 == "All":
+ filter = 0
+ elif linguality_2 == "No English":
+ filter = 1
+ else:
+ filter = 2
+
+ models_with_langs = data[data["language_count"] > 0]
+ langs = models_with_langs["languages"].explode()
+ langs = langs[langs != {}]
+
d = langs.value_counts().rename_axis("language").to_frame('counts').reset_index()
+ if filter == 1:
+ d = d.iloc[1:]
+ elif filter == 2:
+ d = d.iloc[10:]
+
+ # Just keep top 25 to avoid vertical scroll
+ d = d.iloc[:25]
+
+ st.write(alt.Chart(d).mark_bar().encode(
+ x='counts',
+ y=alt.X('language', sort=None)
+ ))
+
+ st.subheader("Raw Data")
+ col1, col2 = st.columns(2)
+ with col1:
+ l = df1.rename_axis("lang_count").reset_index().rename(columns={"language_count": "repos_count"})
+ print(l)
+ st.dataframe(l)
+ with col2:
+ d = langs.value_counts().rename_axis("language").to_frame('counts').reset_index()
+ st.dataframe(d)
+
+
+
+ #with tab2:
+ if tab == "License":
+ st.header("License info")
+
+ no_license_count = data["license"].isna().sum()
+ col1, col2, col3 = st.columns(3)
+ with col1:
+ st.metric(label="License Specified", value=total_samples-no_license_count)
+ with col2:
+ st.metric(label="No license Specified", value=no_license_count)
+ with col3:
+ st.metric(label="Total Unique Licenses", value=len(data["license"].unique()))
+
+ st.subheader("Distribution of licenses per model repo")
+ license_filter = st.selectbox(
+ 'All or filtered',
+ ["All", "No Apache 2.0", "Remove top 10"])
+
filter = 0
+ if license_filter == "All":
+ filter = 0
+ elif license_filter == "No Apache 2.0":
+ filter = 1
+ else:
+ filter = 2
+
+ d = data["license"].value_counts().rename_axis("license").to_frame('counts').reset_index()
+ if filter == 1:
+ d = d.iloc[1:]
+ elif filter == 2:
+ d = d.iloc[10:]
+
+ # Just keep top 25 to avoid vertical scroll
+ d = d.iloc[:25]
+
+ st.write(alt.Chart(d).mark_bar().encode(
+ x='counts',
+ y=alt.X('license', sort=None)
+ ))
+ st.text("There are some edge cases, as old repos using lists of licenses.")
+
+ st.subheader("Raw Data")
+ d = data["license"].value_counts().rename_axis("license").to_frame('counts').reset_index()
+ st.dataframe(d)
+
+ #with tab3:
+ if tab == "Pipeline":
+ st.header("Pipeline info")
+
+ tags = data["tags"].explode()
+ tags = tags[tags.notna()].value_counts().rename_axis("tag").to_frame('counts').reset_index()
+ s = tags["tag"]
+ s = s[s.apply(type) == str]
+ unique_tags = len(s.unique())
+
+ no_pipeline_count = data["pipeline"].isna().sum()
+ col1, col2, col3 = st.columns(3)
+ with col1:
+ st.metric(label="# models that have any pipeline", value=total_samples-no_pipeline_count)
+ with col2:
+ st.metric(label="No pipeline Specified", value=no_pipeline_count)
+ with col3:
+ st.metric(label="Total Unique Pipelines", value=len(data["pipeline"].unique()))
+
+ pipeline_filter = st.selectbox(
+ 'Modalities',
+ ["All", "NLP", "CV", "Audio", "RL", "Multimodal", "Tabular"])
+
filter = 0
+ if pipeline_filter == "All":
+ filter = 0
+ elif pipeline_filter == "NLP":
+ filter = 1
+ elif pipeline_filter == "CV":
+ filter = 2
+ elif pipeline_filter == "Audio":
+ filter = 3
+ elif pipeline_filter == "RL":
+ filter = 4
+ elif pipeline_filter == "Multimodal":
+ filter = 5
+ elif pipeline_filter == "Tabular":
+ filter = 6
+
+ st.subheader("High-level metrics")
+ filtered_data = data[data['pipeline'].notna()]
+
+ if filter == 1:
+ filtered_data = data[data["modality"] == "nlp"]
+ elif filter == 2:
+ filtered_data = data[data["modality"] == "cv"]
+ elif filter == 3:
+ filtered_data = data[data["modality"] == "audio"]
+ elif filter == 4:
+ filtered_data = data[data["modality"] == "rl"]
+ elif filter == 5:
+ filtered_data = data[data["modality"] == "multimodal"]
+ elif filter == 6:
+ filtered_data = data[data["modality"] == "tabular"]
+
+ col1, col2, col3 = st.columns(3)
+ with col1:
+ p = st.selectbox(
+ 'What pipeline do you want to see?',
+ ["all", *filtered_data["pipeline"].unique()]
+ )
+ with col2:
+ l = st.selectbox(
+ 'What library do you want to see?',
+ ["all", *filtered_data["library"].unique()]
+ )
+ with col3:
+ f = st.selectbox(
+ 'What framework support? (transformers)',
+ ["all", "py", "tf", "jax"]
+ )
+
+ col1, col2 = st.columns(2)
+ with col1:
+ filt = st.multiselect(
+ label="Tags (All by default)",
+ options=s.unique(),
+ default=None)
+ with col2:
+ o = st.selectbox(
+ label="Operation (for tags)",
+ options=["Any", "All", "None"]
+ )
+
+ def filter_fn(row):
+ tags = row["tags"]
+ tags[:] = [d for d in tags if isinstance(d, str)]
+ if o == "All":
+ if all(elem in tags for elem in filt):
+ return True
+
+ s1 = set(tags)
+ s2 = set(filt)
+ if o == "Any":
+ if bool(s1 & s2):
+ return True
+ if o == "None":
+ if len(s1.intersection(s2)) == 0:
+ return True
+ return False
+
+
+ if p != "all":
+ filtered_data = filtered_data[filtered_data["pipeline"] == p]
+ if l != "all":
+ filtered_data = filtered_data[filtered_data["library"] == l]
+ if f != "all":
+ if f == "py":
+ filtered_data = filtered_data[filtered_data["pytorch"] == 1]
+ elif f == "tf":
+ filtered_data = filtered_data[filtered_data["tensorflow"] == 1]
+ elif f == "jax":
+ filtered_data = filtered_data[filtered_data["jax"] == 1]
+ if filt != []:
+ filtered_data = filtered_data[filtered_data.apply(filter_fn, axis=1)]
+
+
+ d = filtered_data["pipeline"].value_counts().rename_axis("pipeline").to_frame('counts').reset_index()
+ columns_of_interest = ["downloads_30d", "likes", "pytorch", "tensorflow", "jax"]
+ grouped_data = filtered_data.groupby("pipeline").sum()[columns_of_interest]
+ final_data = pd.merge(
+ d, grouped_data, how="outer", on="pipeline"
)
+ sums = grouped_data.sum()
+
+ col1, col2, col3 = st.columns(3)
+ with col1:
+ st.metric(label="Total models", value=filtered_data.shape[0])
+ with col2:
+ st.metric(label="Cumulative Downloads (30d)", value=sums["downloads_30d"])
+ with col3:
+ st.metric(label="Cumulative likes", value=sums["likes"])
+
+ col1, col2, col3 = st.columns(3)
+ with col1:
+ st.metric(label="Total in PT", value=sums["pytorch"])
+ with col2:
+ st.metric(label="Total in TF", value=sums["tensorflow"])
+ with col3:
+ st.metric(label="Total in JAX", value=sums["jax"])
+
+ st.metric(label="Unique Tags", value=unique_tags)
+
+
+
+ st.subheader("Count of models per pipeline")
+ st.write(alt.Chart(d).mark_bar().encode(
+ x='counts',
+ y=alt.X('pipeline', sort=None)
+ ))
+
+ st.subheader("Aggregated data")
+ st.dataframe(final_data)
+
+ st.subheader("Most common model types (specific to transformers)")
+ d = filtered_data["model_type"].value_counts().rename_axis("model_type").to_frame('counts').reset_index()
+ d = d.iloc[:15]
+ st.write(alt.Chart(d).mark_bar().encode(
+ x='counts',
+ y=alt.X('model_type', sort=None)
+ ))
+
+ st.subheader("Most common library types (Learn more in library tab)")
+ d = filtered_data["library"].value_counts().rename_axis("library").to_frame('counts').reset_index().head(15)
+ st.write(alt.Chart(d).mark_bar().encode(
+ x='counts',
+ y=alt.X('library', sort=None)
+ ))
+
+ st.subheader("Tags by count")
+ tags = filtered_data["tags"].explode()
+ tags = tags[tags.notna()].value_counts().rename_axis("tag").to_frame('counts').reset_index()
+ st.write(alt.Chart(tags.head(30)).mark_bar().encode(
+ x='counts',
+ y=alt.X('tag', sort=None)
+ ))
+
+ st.subheader("Raw Data")
+ columns_of_interest = [
+ "repo_id", "author", "model_type", "files_per_repo", "library",
+ "downloads_30d", "likes", "pytorch", "tensorflow", "jax"]
+ raw_data = filtered_data[columns_of_interest]
+ st.dataframe(raw_data)
+
+
+
+ # todo : add activity metric
+
+
+ #with tab4:
+ if tab == "Discussion Features":
+ st.header("Discussions Tab info")
+
+ columns_of_interest = ["prs_count", "prs_open", "prs_merged", "prs_closed", "discussions_count", "discussions_open", "discussions_closed"]
+ sums = data[columns_of_interest].sum()
+
+ col1, col2, col3, col4 = st.columns(4)
+ with col1:
+ st.metric(label="Total PRs", value=sums["prs_count"])
+ with col2:
+ st.metric(label="PRs opened", value=sums["prs_open"])
+ with col3:
+ st.metric(label="PRs merged", value=sums["prs_merged"])
+ with col4:
+ st.metric(label="PRs closed", value=sums["prs_closed"])
+
+ col1, col2, col3 = st.columns(3)
+ with col1:
+ st.metric(label="Total discussions", value=sums["discussions_count"])
+ with col2:
+ st.metric(label="Discussions open", value=sums["discussions_open"])
+ with col3:
+ st.metric(label="Discussions closed", value=sums["discussions_closed"])
+
+ filtered_data = data[["repo_id", "prs_count", "prs_open", "prs_merged", "prs_closed", "discussions_count", "discussions_open", "discussions_closed"]].sort_values("prs_count", ascending=False).reset_index(drop=True)
+ st.dataframe(filtered_data)
+
+ #with tab5:
+ if tab == "Libraries":
+ st.header("Library info")
+
+ no_library_count = data["library"].isna().sum()
+ col1, col2, col3 = st.columns(3)
+ with col1:
+ st.metric(label="# models that have any library", value=total_samples-no_library_count)
+ with col2:
+ st.metric(label="No library Specified", value=no_library_count)
+ with col3:
+ st.metric(label="Total Unique library", value=len(data["library"].unique()))
+
+
+ st.subheader("High-level metrics")
+ filtered_data = data[data['library'].notna()]
+
+ col1, col2 = st.columns(2)
+ with col1:
+ lib = st.selectbox(
+ 'What library do you want to see? ',
+ ["all", *filtered_data["library"].unique()]
+ )
+ with col2:
+ pip = st.selectbox(
+ 'What pipeline do you want to see? ',
+ ["all", *filtered_data["pipeline"].unique()]
+ )
+
+ if pip != "all":
+ filtered_data = filtered_data[filtered_data["pipeline"] == pip]
+ if lib != "all":
+ filtered_data = filtered_data[filtered_data["library"] == lib]
+
+
+ d = filtered_data["library"].value_counts().rename_axis("library").to_frame('counts').reset_index()
+ grouped_data = filtered_data.groupby("library").sum()[["downloads_30d", "likes"]]
+ final_data = pd.merge(
+ d, grouped_data, how="outer", on="library"
)
+ sums = grouped_data.sum()
+
+ col1, col2, col3 = st.columns(3)
+ with col1:
+ st.metric(label="Total models", value=filtered_data.shape[0])
+ with col2:
+ st.metric(label="Cumulative Downloads (30d)", value=sums["downloads_30d"])
+ with col3:
+ st.metric(label="Cumulative likes", value=sums["likes"])
+
+ st.subheader("Most common library types (Learn more in library tab)")
+ d = filtered_data["library"].value_counts().rename_axis("library").to_frame('counts').reset_index().head(15)
+ st.write(alt.Chart(d).mark_bar().encode(
+ x='counts',
+ y=alt.X('library', sort=None)
+ ))
+
+
+
+ st.subheader("Aggregated Data")
+ st.dataframe(final_data)
+
+ st.subheader("Raw Data")
+ columns_of_interest = ["repo_id", "author", "files_per_repo", "library", "downloads_30d", "likes"]
+ filtered_data = filtered_data[columns_of_interest]
+ st.dataframe(filtered_data)
+
+ #with tab6:
+ if tab == "Model Cards":
+ st.header("Model cards")
+
+ columns_of_interest = ["has_model_index", "has_metadata", "has_text", "text_length"]
+ rows = data.shape[0]
+
+ cond = data["has_model_index"] | data["has_text"]
+ with_model_card = data[cond]
+ c_model_card = with_model_card.shape[0]
+ st.subheader("High-level metrics")
+ col1, col2, col3 = st.columns(3)
+ with col1:
+ st.metric(label="# models with model card file", value=c_model_card)
+ with col2:
+ st.metric(label="# models without model card file", value=rows-c_model_card)
+
+ with_index = data["has_model_index"].sum()
+ with col1:
+ st.metric(label="# models with model index", value=with_index)
+ with col2:
+ st.metric(label="# models without model index", value=rows-with_index)
+
+ with_text = data["has_text"]
+ with col1:
+ st.metric(label="# models with model card text", value=with_text.sum())
+ with col2:
+ st.metric(label="# models without model card text", value=rows-with_text.sum())
+
+
+ st.subheader("Length (chars) of model card content")
+ fig, ax = plt.subplots()
+ ax = data["length_bins"].value_counts().plot.bar()
+ st.metric(label="# average length of model card (chars)", value=data[with_text]["text_length"].mean())
+ st.pyplot(fig)
+
+ st.subheader("Tags (Read more in Pipeline tab)")
+ tags = data["tags"].explode()
+ tags = tags[tags.notna()].value_counts().rename_axis("tag").to_frame('counts').reset_index()
+ st.write(alt.Chart(tags.head(30)).mark_bar().encode(
+ x='counts',
+ y=alt.X('tag', sort=None)
+ ))
+
+ #with tab7:
+ if tab == "Super Users":
+ st.header("Authors")
+ st.text("This info corresponds to the repos owned by the authors")
+ authors = data.groupby("author").sum().drop(["text_length", "Unnamed: 0"], axis=1).sort_values("downloads_30d", ascending=False)
+ d = data["author"].value_counts().rename_axis("author").to_frame('counts').reset_index()
+ final_data = pd.merge(
+ d, authors, how="outer", on="author"
)
+ st.dataframe(final_data)
+
+ #with tab2:
+ if tab == "Raw Data":
+ st.header("Raw Data")
+ d = data.astype(str)
+ st.dataframe(d)
+
+
+ if __name__ == '__main__':
+ main()
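
For reference, a minimal sketch (not part of the commit) of the structure models.py moves to here: the app body lives in a main() function, the expensive dataset load sits behind Streamlit's cache, and the module only runs the app when executed directly. The tiny DataFrame is a stand-in for the real load_dataset() call, and st.cache(allow_output_mutation=True) is the API of the Streamlit versions this Space targets (newer releases replace it with st.cache_data).

import pandas as pd
import streamlit as st

@st.cache(allow_output_mutation=True)  # memoize the load across reruns; newer Streamlit uses st.cache_data
def process_dataset(version):
    # Stand-in for load_dataset("open-source-metrics/model-repos-stats", revision=version)
    return pd.DataFrame({"pipeline": ["text-classification", None]})

def main():
    # Everything the script used to do at module level now runs inside main()
    data = process_dataset("27_09_22")
    st.metric(label="Total models", value=data.shape[0])

if __name__ == '__main__':
    main()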