Add 2k checkpoint

Files changed (6) hide show

README.md CHANGED Viewed

@@ -1,3 +1,15 @@
 ---
 license: apache-2.0
 ---

 ---
 license: apache-2.0
+language:
+- en
+pipeline_tag: text-classification
 ---
+# Monarch Mixer-BERT
+The 80M checkpoint for M2-BERT-base from the paper [Monarch Mixer: A Simple Sub-Quadratic GEMM-Based Architecture](https://arxiv.org/abs/2310.12109).
+This model was pretrained with a sequence length of 2K (2048).
+This model was trained by Dan Fu, Jon Saad-Falcon, and Simran Arora.
+Check out our [GitHub](https://github.com/HazyResearch/m2/tree/main) for instructions on how to download and fine-tune it!

config.json ADDED Viewed

+{
+    "model_type": "m2_bert"
+}

config.yaml ADDED Viewed

+# Configuration shipped with the sequence-length-2048 checkpoint.
+#
+# Note that some of the fields in this template haven't been filled in yet.
+# Please resolve any `null` fields before launching!
+# NOTE(review): none of the fields listed here is `null`; the two lines above
+# look like leftover template text -- confirm before removing them.
+precision: amp_bf16
+# Sequence length; max_position_embeddings and long_conv_l_max below are set
+# to the same value (2048).
+max_seq_len: 2048
+# Tokenizer for dataset creation
+tokenizer_name: bert-base-uncased
+# Base model config
+model:
+  name: bert
+  # `${tokenizer_name}` is an interpolation pointing at the top-level
+  # `tokenizer_name` key (OmegaConf-style syntax; presumably resolved by the
+  # config loader -- plain YAML parsers keep it as a literal string).
+  pretrained_model_name: ${tokenizer_name}
+  tokenizer_name: ${tokenizer_name}
+  model_config:
+    num_attention_heads: 12
+    num_hidden_layers: 12
+    attention_probs_dropout_prob: 0.0
+    max_position_embeddings: 2048
+    # Booleans are written as lowercase `true`/`false` throughout so that
+    # every YAML loader agrees on their type.
+    monarch_mixer_sequence_mixing: true
+    long_conv_l_max: 2048
+    # Scientific notation carries an explicit mantissa dot: YAML 1.1 loaders
+    # (e.g. plain PyYAML) read `1e-3` as the string "1e-3", not a float.
+    long_conv_kernel_learning_rate: 1.0e-3
+    hyena_lr_pos_emb: 1.0e-5
+    # The hyena_* values below are consumed by the model code, which is not
+    # part of this file; see that code for their exact meaning.
+    hyena_w: 10
+    hyena_wd: 0.1
+    hyena_emb_dim: 5
+    hyena_filter_order: 128
+    hyena_training_additions: false
+    bidirectional: true
+    residual_long_conv: true
+    use_glu_mlp: true
+    use_monarch_mlp: true
+    monarch_mlp_nblocks: 4
+    use_positional_encodings: true

model.bin ADDED Viewed

+version https://git-lfs.github.com/spec/v1
+oid sha256:afd8cf8bdbc6727345f1f28a8791152368d1f34f3a9e5a96114cc155770a73e6
+size 328424417

model.pt ADDED Viewed

+version https://git-lfs.github.com/spec/v1
+oid sha256:8524693401aab571061c1b1b792aff2e8a6cc725aceed7a06ec205163abb4b9b
+size 984849524

version.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ 1