Tom Aarsen committed on
Commit
6c6aac5
•
1 Parent(s): cfacdee

Add Sentence Transformers model type option

Browse files
Files changed (1) hide show
  1. app.py +114 -9
app.py CHANGED
@@ -1003,6 +1003,104 @@ MODELS_TO_SKIP = {
1003
  "Koat/gte-tiny",
1004
  }
1005
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1006
  def add_lang(examples):
1007
  if not(examples["eval_language"]):
1008
  examples["mteb_dataset_name_with_lang"] = examples["mteb_dataset_name"]
@@ -1170,6 +1268,8 @@ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_
1170
  except:
1171
  pass
1172
  df_list.append(out)
 
 
1173
  df = pd.DataFrame(df_list)
1174
  # If there are any models that are the same, merge them
1175
  # E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one
@@ -1863,22 +1963,21 @@ def update_url_language(event: gr.SelectData, current_task_language: dict, langu
1863
 
1864
  NUMERIC_INTERVALS = {
1865
  "<100M": pd.Interval(0, 100, closed="right"),
1866
- ">100M, <250M": pd.Interval(100, 250, closed="right"),
1867
- ">250M, <500M": pd.Interval(250, 500, closed="right"),
1868
- ">500M, <1B": pd.Interval(500, 1000, closed="right"),
1869
  ">1B": pd.Interval(1000, 1_000_000, closed="right"),
1870
  }
1871
 
1872
  MODEL_TYPES = [
1873
  "Open",
1874
  "Proprietary",
 
1875
  ]
1876
 
1877
  def filter_data(search_query, model_types, model_sizes, *full_dataframes):
1878
  output_dataframes = []
1879
  for df in full_dataframes:
1880
- # df = pd.DataFrame(data=dataframe.value["data"], columns=dataframe.value["headers"])
1881
-
1882
  # Apply the search query
1883
  if search_query:
1884
  names = df["Model"].map(lambda x: re.match("<a .+?>(.+)</a>", x).group(1))
@@ -1895,7 +1994,12 @@ def filter_data(search_query, model_types, model_sizes, *full_dataframes):
1895
  masks.append(df["Model Size (Million Parameters)"] != "")
1896
  elif model_type == "Proprietary":
1897
  masks.append(df["Model Size (Million Parameters)"] == "")
1898
- df = df[reduce(lambda a, b: a | b, masks)]
 
 
 
 
 
1899
 
1900
  # Apply the model size filtering
1901
  if model_sizes != list(NUMERIC_INTERVALS.keys()):
@@ -1920,8 +2024,8 @@ with gr.Blocks(css=css) as block:
1920
 
1921
  with gr.Row():
1922
  search_bar = gr.Textbox(
1923
- label="Search Bar",
1924
- placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press enter...",
1925
  )
1926
  filter_model_type = gr.CheckboxGroup(
1927
  label="Model types",
@@ -1935,7 +2039,8 @@ with gr.Blocks(css=css) as block:
1935
  choices=list(NUMERIC_INTERVALS.keys()),
1936
  value=list(NUMERIC_INTERVALS.keys()),
1937
  interactive=True,
1938
- elem_classes=["filter-checkbox-group"]
 
1939
  )
1940
 
1941
  with gr.Tabs() as outer_tabs:
 
1003
  "Koat/gte-tiny",
1004
  }
1005
 
1006
+ SENTENCE_TRANSFORMERS_COMPATIBLE_MODELS = {
1007
+ "allenai-specter",
1008
+ "allenai-specter",
1009
+ "all-MiniLM-L12-v2",
1010
+ "all-MiniLM-L6-v2",
1011
+ "all-mpnet-base-v2",
1012
+ "bert-base-10lang-cased",
1013
+ "bert-base-15lang-cased",
1014
+ "bert-base-25lang-cased",
1015
+ "bert-base-multilingual-cased",
1016
+ "bert-base-multilingual-uncased",
1017
+ "bert-base-swedish-cased",
1018
+ "bert-base-uncased",
1019
+ "bge-base-zh-v1.5",
1020
+ "bge-large-zh-v1.5",
1021
+ "bge-large-zh-noinstruct",
1022
+ "bge-small-zh-v1.5",
1023
+ "camembert-base",
1024
+ "camembert-large",
1025
+ "contriever-base-msmarco",
1026
+ "cross-en-de-roberta-sentence-transformer",
1027
+ "DanskBERT",
1028
+ "distilbert-base-25lang-cased",
1029
+ "distilbert-base-en-fr-cased",
1030
+ "distilbert-base-en-fr-es-pt-it-cased",
1031
+ "distilbert-base-fr-cased",
1032
+ "distilbert-base-uncased",
1033
+ "distiluse-base-multilingual-cased-v2",
1034
+ "dfm-encoder-large-v1",
1035
+ "dfm-sentence-encoder-large-1",
1036
+ "e5-base",
1037
+ "e5-large",
1038
+ "e5-mistral-7b-instruct",
1039
+ "e5-small",
1040
+ "electra-small-nordic",
1041
+ "electra-small-swedish-cased-discriminator",
1042
+ "flaubert_base_cased",
1043
+ "flaubert_base_uncased",
1044
+ "flaubert_large_cased",
1045
+ "gbert-base",
1046
+ "gbert-large",
1047
+ "gelectra-base",
1048
+ "gelectra-large",
1049
+ "glove.6B.300d",
1050
+ "gottbert-base",
1051
+ "gtr-t5-base",
1052
+ "gtr-t5-large",
1053
+ "gtr-t5-xl",
1054
+ "gtr-t5-xxl",
1055
+ "herbert-base-retrieval-v2",
1056
+ "komninos",
1057
+ "luotuo-bert-medium",
1058
+ "LaBSE",
1059
+ "m3e-base",
1060
+ "m3e-large",
1061
+ "msmarco-bert-co-condensor",
1062
+ "multi-qa-MiniLM-L6-cos-v1",
1063
+ "multilingual-e5-base",
1064
+ "multilingual-e5-large",
1065
+ "multilingual-e5-small",
1066
+ "nb-bert-base",
1067
+ "nb-bert-large",
1068
+ "nomic-embed-text-v1.5-64",
1069
+ "nomic-embed-text-v1.5-128",
1070
+ "nomic-embed-text-v1.5-256",
1071
+ "nomic-embed-text-v1.5-512",
1072
+ "norbert3-base",
1073
+ "norbert3-large",
1074
+ "paraphrase-multilingual-mpnet-base-v2",
1075
+ "paraphrase-multilingual-MiniLM-L12-v2",
1076
+ "sentence-camembert-base",
1077
+ "sentence-camembert-large",
1078
+ "sentence-croissant-llm-base",
1079
+ "sentence-bert-swedish-cased",
1080
+ "sentence-t5-base",
1081
+ "sentence-t5-large",
1082
+ "sentence-t5-xl",
1083
+ "sentence-t5-xxl",
1084
+ "silver-retriever-base-v1",
1085
+ "sup-simcse-bert-base-uncased",
1086
+ "st-polish-paraphrase-from-distilroberta",
1087
+ "st-polish-paraphrase-from-mpnet",
1088
+ "text2vec-base-chinese",
1089
+ "text2vec-large-chinese",
1090
+ "udever-bloom-1b1",
1091
+ "udever-bloom-560m",
1092
+ "universal-sentence-encoder-multilingual-3",
1093
+ "universal-sentence-encoder-multilingual-large-3",
1094
+ "unsup-simcse-bert-base-uncased",
1095
+ "use-cmlm-multilingual",
1096
+ "xlm-roberta-base",
1097
+ "xlm-roberta-large",
1098
+ }
1099
+ SENTENCE_TRANSFORMERS_COMPATIBLE_MODELS = {
1100
+ make_clickable_model(model, link=EXTERNAL_MODEL_TO_LINK.get(model, "https://huggingface.co/spaces/mteb/leaderboard"))
1101
+ for model in SENTENCE_TRANSFORMERS_COMPATIBLE_MODELS
1102
+ }
1103
+
1104
  def add_lang(examples):
1105
  if not(examples["eval_language"]):
1106
  examples["mteb_dataset_name_with_lang"] = examples["mteb_dataset_name"]
 
1268
  except:
1269
  pass
1270
  df_list.append(out)
1271
+ if model.library_name == "sentence-transformers" or "sentence-transformers" in model.tags or "modules.json" in {file.rfilename for file in model.siblings}:
1272
+ SENTENCE_TRANSFORMERS_COMPATIBLE_MODELS.add(out["Model"])
1273
  df = pd.DataFrame(df_list)
1274
  # If there are any models that are the same, merge them
1275
  # E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one
 
1963
 
1964
  NUMERIC_INTERVALS = {
1965
  "<100M": pd.Interval(0, 100, closed="right"),
1966
+ "100M to 250M": pd.Interval(100, 250, closed="right"),
1967
+ "250M to 500M": pd.Interval(250, 500, closed="right"),
1968
+ "500M to 1B": pd.Interval(500, 1000, closed="right"),
1969
  ">1B": pd.Interval(1000, 1_000_000, closed="right"),
1970
  }
1971
 
1972
  MODEL_TYPES = [
1973
  "Open",
1974
  "Proprietary",
1975
+ "Sentence Transformers",
1976
  ]
1977
 
1978
  def filter_data(search_query, model_types, model_sizes, *full_dataframes):
1979
  output_dataframes = []
1980
  for df in full_dataframes:
 
 
1981
  # Apply the search query
1982
  if search_query:
1983
  names = df["Model"].map(lambda x: re.match("<a .+?>(.+)</a>", x).group(1))
 
1994
  masks.append(df["Model Size (Million Parameters)"] != "")
1995
  elif model_type == "Proprietary":
1996
  masks.append(df["Model Size (Million Parameters)"] == "")
1997
+ elif model_type == "Sentence Transformers":
1998
+ masks.append(df["Model"].isin(SENTENCE_TRANSFORMERS_COMPATIBLE_MODELS))
1999
+ if masks:
2000
+ df = df[reduce(lambda a, b: a | b, masks)]
2001
+ else:
2002
+ df = pd.DataFrame(columns=df.columns)
2003
 
2004
  # Apply the model size filtering
2005
  if model_sizes != list(NUMERIC_INTERVALS.keys()):
 
2024
 
2025
  with gr.Row():
2026
  search_bar = gr.Textbox(
2027
+ label="Search Bar (separate multiple queries with `;`)",
2028
+ placeholder=" 🔍 Search for a model and press enter...",
2029
  )
2030
  filter_model_type = gr.CheckboxGroup(
2031
  label="Model types",
 
2039
  choices=list(NUMERIC_INTERVALS.keys()),
2040
  value=list(NUMERIC_INTERVALS.keys()),
2041
  interactive=True,
2042
+ elem_classes=["filter-checkbox-group"],
2043
+ scale=2,
2044
  )
2045
 
2046
  with gr.Tabs() as outer_tabs: