Upload 9 files

Browse files

Files changed (9) hide show

LICENSE +202 -0
README.md +56 -3
elm/infer_elm.py +132 -0
elm/model.py +418 -0
elm/positional_embeddings.py +86 -0
elm/utils.py +25 -0
models/.gitattributes +2 -0
requirements.txt +2 -0
run.py +24 -0

LICENSE ADDED Viewed

	@@ -0,0 +1,202 @@

+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright [yyyy] [name of copyright owner]
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

README.md CHANGED Viewed

@@ -1,3 +1,56 @@
----
-license: apache-2.0
----

+# SliceX AI™ ELM (Efficient Language Models)
+This repository contains code to run our ELM models.
+Models are located in the "models" folder. ELM models in this repository come in three sizes (elm-1.0, elm-0.75 and elm-0.25) and support the following use-cases.
+- news_classification
+- toxicity_detection
+- news_content_generation
+## Download ELM repo
+```bash
+git clone git@hf.co:slicexai/elm-0.25-v0.1
+sudo apt-get install git-lfs
+git lfs install
+```
+(Optional) Installing git-lfs without sudo:
+```bash
+wget https://github.com/git-lfs/git-lfs/releases/download/v3.2.0/git-lfs-linux-amd64-v3.2.0.tar.gz
+tar -xzf git-lfs-linux-amd64-v3.2.0.tar.gz
+PATH=$PATH:/<absolute-path>/git-lfs-3.2.0/
+git lfs install
+```
+## Download ELM task-specific model checkpoints
+```bash
+cd elm-0.25-v0.1
+git lfs pull -I models/elm-0.25_news_classification/ckpt.pt
+git lfs pull -I models/elm-0.25_toxicity_detection/ckpt.pt
+git lfs pull -I models/elm-0.25_news_content_generation/ckpt.pt
+```
+## Installation
+```bash
+pip install -r requirements.txt
+```
+## How to use - Run ELM on a sample task (e.g., news classification)
+```bash
+python run.py <elm-model-directory>
+E.g. python run.py models/elm-0.25_news_classification
+```
+Prompts for the specific tasks can be found in the corresponding checkpoint directory. See an example below in the form of `models/elm-0.25_news_classification/example_prompts.json`.
+```json
+{
+    "inputs": ["GM May Close Plant in Europe  DETROIT (Reuters) - General Motors Corp. &lt;A HREF=\"http://www.investor.reuters.com/FullQuote.aspx?ticker=GM.N target=/stocks/quickinfo/fullquote\"&gt;GM.N&lt;/A&gt; will likely  cut some jobs in Europe and may close a plant there as part of  a restructuring plan under development to try to return the  region to profitability, the U.S. automaker said on Wednesday."],
+    "template": "[INST]Below is a news article. Please classify it under one of the following classes (World, Business, Sports, Sci/Tech). Please format your response as a JSON payload.\n\n### Article: {input}\n\n### JSON Response:[/INST]"
+}
+```
+Running the above command returns the following response
+```json
+{
+    "prompt": "[INST]Below is a news article. Please classify it under one of the following classes (World, Business, Sports, Sci/Tech). Please format your response as a JSON payload.\n\n### Article: GM May Close Plant in Europe  DETROIT (Reuters) - General Motors Corp. &lt;A HREF=\"http://www.investor.reuters.com/FullQuote.aspx?ticker=GM.N target=/stocks/quickinfo/fullquote\"&gt;GM.N&lt;/A&gt; will likely  cut some jobs in Europe and may close a plant there as part of  a restructuring plan under development to try to return the  region to profitability, the U.S. automaker said on Wednesday.\n\n### JSON Response:[/INST]",
+    "response": "{'text_label': 'Business'}"
+}
+```

elm/infer_elm.py ADDED Viewed

	@@ -0,0 +1,132 @@

+# Copyright (c) 2024, SliceX AI, Inc. All Rights Reserved.
+from elm.model import *
+from elm.utils import batchify
+from transformers import AutoTokenizer
+import json
+def load_elm_model_and_tokenizer(local_path,
+                                 model_config_dict,
+                                 device="cuda",
+                                 load_partial=True,
+                                 get_num_layers_from_ckpt=True):
+    """Load ELM model and tokenizer from local checkpoint."""
+    model_args = ModelArgs(**model_config_dict)
+    model = load_elm_model_from_ckpt(local_path, device=device, model_args=model_args, load_partial=load_partial, get_num_layers_from_ckpt=get_num_layers_from_ckpt)
+    tokenizer = AutoTokenizer.from_pretrained(local_path)
+    tokenizer.padding_side = "left"
+    tokenizer.truncation_side = "left"
+    return model, tokenizer
+def generate_elm_response_given_model(prompts, model, tokenizer,
+                          device="cuda",
+                          max_ctx_word_len=1024,
+                          max_ctx_token_len=0,
+                          max_new_tokens=500,
+                          temperature=0.8, # set to 0 for greedy decoding
+                          top_k=200,
+                          return_tok_cnt=False,
+                          return_gen_only=False,
+                          early_stop_on_eos=False):
+    """Generate responses from ELM model given an input list of prompts ([str])."""
+    if max_ctx_token_len > 0:
+        inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=max_ctx_token_len).to(device)
+    else:
+        prompts = [" ".join(p.split(" ")[-max_ctx_word_len:]) for p in prompts]
+        inputs = tokenizer(prompts, return_tensors="pt", padding=True).to(device)
+    results = []
+    input_tok_cnt = torch.numel(inputs.input_ids)
+    model.eval()
+    out_tok_cnt = 0
+    with torch.no_grad():
+        temperature = temperature
+        top_k = top_k
+        outputs = model.generate(inputs.input_ids, max_new_tokens, temperature=temperature, top_k=top_k,
+                                 return_gen_only=return_gen_only)
+        if return_tok_cnt:
+            out_tok_cnt += torch.numel(outputs)
+        if early_stop_on_eos:
+            mod_outputs = []
+            for i in range(len(outputs)):
+                curr_out = outputs[i]
+                eos_loc_id = -1
+                for j in range(len(outputs[i])):
+                    tok_id = outputs[i][j]
+                    if tok_id == tokenizer.eos_token_id:
+                        eos_loc_id = j
+                        break
+                if eos_loc_id >= 0:
+                    curr_out = outputs[i][:eos_loc_id]
+                mod_outputs.append(curr_out)
+            outputs = mod_outputs
+        detokenized_output = tokenizer.batch_decode(outputs, skip_special_tokens=False)
+        results = detokenized_output
+    if return_tok_cnt:
+        return results, (input_tok_cnt, out_tok_cnt)
+    return results
+def generate_elm_responses(elm_model_path,
+                           prompts,
+                           device=None,
+                           elm_model_config={},
+                           eval_batch_size=1,
+                           verbose=True):
+    if not device:
+        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+    print(f"Setting device to {device}")
+    model_config_dict = {
+        "hidden_size": elm_model_config.get("hidden_size", 2048),
+        "max_inp_len": elm_model_config.get("max_inp_len", 2048),
+        "num_attention_heads": elm_model_config.get("num_attention_heads", 32),
+        "num_layers": elm_model_config.get("num_layers", 48),
+        "bits": elm_model_config.get("bits", 256),
+        "vocab_size": elm_model_config.get("vocab_size", 50304),
+        "dropout": elm_model_config.get("dropout", 0.1),
+        "use_rotary_embeddings": elm_model_config.get("use_rotary_embeddings", True)
+    }
+    model, tokenizer = load_elm_model_and_tokenizer(local_path=elm_model_path, model_config_dict=model_config_dict, device=device, load_partial=True)
+    #prompts = [prompt if "[INST]" in prompt else f"[INST]{prompt}[/INST]" for prompt in prompts]
+    max_new_tokens = 128
+    if "classification" in elm_model_path or "detection" in elm_model_path:
+        max_new_tokens = 12
+    result = []
+    for prompt_batch in batchify(prompts, eval_batch_size):
+        responses, _ = generate_elm_response_given_model(prompt_batch,
+                                                            model,
+                                                            tokenizer,
+                                                            device=device,
+                                                            max_ctx_word_len=1024,
+                                                            max_ctx_token_len=512,
+                                                            max_new_tokens=max_new_tokens,
+                                                            return_tok_cnt=True,
+                                                            return_gen_only=False,
+                                                            temperature=0.0,
+                                                            early_stop_on_eos=True)
+        for prompt, response in zip(prompt_batch, responses):
+            response = response.split("[/INST]")[-1].strip()
+            result.append(response)
+            if verbose:
+                print(json.dumps({"prompt": prompt, "response": response}, indent=4))
+                print("\n***\n")
+    return result

elm/model.py ADDED Viewed

	@@ -0,0 +1,418 @@

+# Copyright (c) 2024, SliceX AI, Inc. All Rights Reserved.
+import copy
+import inspect
+import math
+import numpy as np
+import os
+from dataclasses import dataclass, field
+from typing import List, Optional
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from elm.utils import *
+from elm.positional_embeddings import *
+def get_elm_model_map(model_name):
+    """Map the model type to corresponding class."""
+    elm_model_map = {
+        "rambutan": RambutanSlice,
+    }
+    return elm_model_map.get(model_name, RambutanSlice)
+@dataclass
+class ModelArgs:
+    """ELM Model Args"""
+    model_name_or_path: str = "ELM"
+    compile_model: bool = False
+    elm_model_class: Optional[str] = "rambutan"
+    hidden_size: Optional[int] = 2048
+    max_inp_len: Optional[int] = 2048
+    attn_window_size: Optional[int] = max_inp_len
+    num_attention_heads: Optional[int] = 32
+    layernorm_eps: float = 1e-5
+    attention_dropout: float = 0.1
+    hidden_dropout: float = 0.1
+    num_layers: Optional[int] = 16
+    bits: Optional[int] = 256
+    vocab_size: Optional[int] = 50304
+    dropout: Optional[int] = 0.1
+    use_rotary_embeddings: Optional[bool] = True
+    tokenizer: Optional[str] = None
+class ELM(torch.nn.Module):
+    """ELM (SliceX GPT) model."""
+    def __init__(self,
+                 model_args: ModelArgs):
+        """Initialize an ELM model instance."""
+        super().__init__()
+        self.model_args = model_args
+        elm_model_class = model_args.elm_model_class
+        hidden_size = model_args.hidden_size
+        max_inp_len = model_args.max_inp_len
+        num_attention_heads = model_args.num_attention_heads
+        layernorm_eps = model_args.layernorm_eps
+        attention_dropout = model_args.attention_dropout
+        hidden_dropout = model_args.hidden_dropout
+        num_layers = model_args.num_layers
+        bits = model_args.bits
+        vocab_size = model_args.vocab_size
+        use_rotary_embeddings = model_args.use_rotary_embeddings
+        layer_class = get_elm_model_map(elm_model_class)
+        self.slice_transformer = torch.nn.ModuleDict(dict(
+            temb = torch.nn.Embedding(vocab_size, hidden_size),
+            pemb = torch.nn.Embedding(max_inp_len, hidden_size) if not use_rotary_embeddings else None,
+            drop = torch.nn.Dropout(hidden_dropout),
+            h = torch.nn.ModuleList([ layer_class(model_args=model_args) for _ in range(num_layers) ]),
+            ln_f = torch.nn.LayerNorm(hidden_size, eps=layernorm_eps),
+        ))
+        self.lm_head = torch.nn.Linear(hidden_size, vocab_size, bias=False)
+        print("Number of model parameters: %.2fM" % (self.get_num_params(False)/1e6,))
+    def forward(self,
+                x: torch.Tensor,
+                attention_mask: Optional[torch.Tensor] = None,
+                targets: Optional[torch.Tensor] = None):
+        device = x.device
+        batch, seqlen = x.size()
+        tok_emb = self.slice_transformer.temb(x)
+        if not self.model_args.use_rotary_embeddings:
+            pos = torch.arange(0, seqlen, dtype=torch.long, device=device)
+            pos_emb = self.slice_transformer.pemb(pos)
+            x = self.slice_transformer.drop(tok_emb + pos_emb)
+        else:
+            x = self.slice_transformer.drop(tok_emb)
+        tlayer_id = 0
+        ignore_index_id = -100
+        loss = torch.zeros(1).to(device)
+        loss_denom = 0
+        for tlayer in self.slice_transformer.h:
+            x = tlayer(x, attention_mask=attention_mask)
+            tlayer_id += 1
+        x = self.slice_transformer.ln_f(x)
+        if targets is not None:
+            logits = self.lm_head(x)
+            shift_logits = logits[..., :-1, :].contiguous()
+            shift_targets = targets[..., 1:].contiguous()
+            curr_loss = F.cross_entropy(shift_logits.view(-1, shift_logits.size(-1)),
+                                        shift_targets.view(-1),
+                                        ignore_index=ignore_index_id)
+            loss += curr_loss.float()
+            loss_denom += 1
+        else:
+            logits = self.lm_head(x[:, [-1], :])
+        loss = loss / loss_denom
+        return logits, loss
+    def get_num_params(self, non_embedding=True):
+        """
+        Return the number of parameters in the model.
+        For non-embedding count (default), the position embeddings get subtracted.
+        This assumes parameter tying between input and final layer embeddings. Oherwise
+        If there is no parameter sharing , set the flag to False to include parameters for both layers.
+        """
+        n_params = sum(p.numel() for p in self.parameters())
+        if non_embedding and not self.model_args.use_rotary_embeddings:
+            n_params -= self.slice_transformer.pemb.weight.numel()
+        return n_params
+    @torch.no_grad()
+    def generate(self, x, max_new_tokens, temperature=0.8, top_k=200, top_p=0.9,
+                 return_gen_only=False):
+        max_inp_len = self.model_args.max_inp_len
+        for _ in range(max_new_tokens):
+            x_ctxt = x if x.size(1) <= max_inp_len else x[:, -max_inp_len:]
+            logits, _ = self(x_ctxt)
+            next_id = None
+            if temperature <= 0:
+                next_id = torch.argmax(logits, dim=-1)
+            else:
+                logits = logits[:, -1, :] / temperature
+                if top_k is not None:
+                    v, k = torch.topk(logits, min(top_k, logits.size(-1)))
+                    logits[logits < v[:, [-1]]] = -float('Inf')
+                probs = F.softmax(logits, dim=-1)
+                if top_p is None:
+                    next_id = torch.multinomial(probs, num_samples=1)
+                else:
+                    next_id = sample_top_p(probs, top_p)
+            x = torch.cat((x, next_id), dim=1)
+        if return_gen_only:
+            return x[:,-max_new_tokens:]
+        return x
+class RambutanMLP(torch.nn.Module):
+    """RambutanMLP version of MLP module used in the ELM (SliceX GPT) Transformer block."""
+    def __init__(self, dim=768, bits=32, dropout = 0.0):
+        super(RambutanMLP, self).__init__()
+        self.dim = dim
+        self.bits = bits
+        self.dropout = torch.nn.Dropout(dropout)
+        self.A1_c_w = torch.nn.Linear(self.dim, self.bits, bias=True)
+        self.Hexperts = 4
+        self.Hexpertemb = torch.nn.Embedding(self.bits, self.dim)
+        self.expert_aggr = torch.nn.Linear(self.Hexperts, 1)
+    def forward(self, x):
+        h_c = torch.nn.functional.softmax(self.A1_c_w(x), dim=-1)
+        v, i = torch.topk(h_c, self.Hexperts)
+        if len(x.size()) < 3:
+            p = v.unsqueeze(-1).expand(-1,-1,self.dim)
+        else:
+            p = v.unsqueeze(-1).expand(-1,-1,-1,self.dim)
+        h_emb = p * self.Hexpertemb(i)
+        if len(x.size()) < 3:
+            out = self.expert_aggr(h_emb.transpose(1,2)).reshape(h_emb.size(0), -1)
+        else:
+            out = self.expert_aggr(h_emb.transpose(-2,-1)).reshape(x.size())
+        out = x * out
+        out = self.dropout(out)
+        return out
+class RambutanSlice(torch.nn.Module):
+    """Rambutan version of ELM (SliceX GPT) Transformer block."""
+    def __init__(self,
+                 model_args: ModelArgs):
+        super().__init__()
+        self.model_args = model_args
+        self.num_attention_heads = model_args.num_attention_heads
+        self.kv_channels = model_args.hidden_size // model_args.num_attention_heads
+        self.ln1 = torch.nn.LayerNorm(model_args.hidden_size, eps=model_args.layernorm_eps)
+        self.ln2 = torch.nn.LayerNorm(model_args.hidden_size, eps=model_args.layernorm_eps)
+        self.mlp = RambutanMLP(dim=model_args.hidden_size, bits=model_args.bits)
+        self.cattn =  RambutanCausalSelfAttention(model_args=model_args)
+    def forward(self,
+                x: torch.Tensor,
+                attention_mask: torch.Tensor = None):
+        res = x
+        x = self.ln1(x)
+        x = self.cattn(x, attention_mask=attention_mask)
+        x = res + x
+        res = x
+        x = self.ln2(x)
+        x = self.mlp(x)
+        return x + res
+class RambutanCausalSelfAttention(torch.nn.Module):
+    """Rambutan version of self-attention module used in the ELM (SliceX GPT) transformer block."""
+    def __init__(self,
+                 model_args: ModelArgs):
+        super().__init__()
+        self.model_args = model_args
+        n_embd = model_args.hidden_size
+        n_head = model_args.num_attention_heads
+        bias = False
+        dropout = model_args.attention_dropout
+        assert n_embd % n_head == 0
+        self.c_attn = torch.nn.Linear(n_embd, 3 * n_embd, bias=bias)
+        self.c_proj = torch.nn.Linear(n_embd, n_embd, bias=bias)
+        self.attn_dropout = torch.nn.Dropout(dropout)
+        self.resid_dropout = torch.nn.Dropout(dropout)
+        self.n_head = n_head
+        self.n_embd = n_embd
+        self.dropout = dropout
+        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
+        if not self.flash:
+            print("WARNING: using slow attention. Flash Attention requires PyTorch >= 2.0")
+        self.rotary_embeddings = (
+            RotaryEmbedding(n_embd // n_head) if model_args.use_rotary_embeddings else None
+        )
+    def forward(self, x, attention_mask: torch.Tensor = None):
+        B, T, C = x.size()
+        device = x.device
+        q, k, v  = self.c_attn(x).split(self.n_embd, dim=2)
+        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
+        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
+        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
+        if self.rotary_embeddings:
+            q, k = self.rotary_embeddings(q=q, k=k)
+        is_causal = True
+        attn_mask = None
+        if attention_mask is not None:
+            att_mask_input = attention_mask
+            att_mask_input = att_mask_input.unsqueeze(-1).expand(B, T, T)
+            if is_causal:
+                att_mask_causal = torch.tril(torch.ones(T, T)).view(1,T,T).expand(B,T,T).to(device)
+                attn_mask = (att_mask_causal * att_mask_input)
+            else:
+                attn_mask = att_mask_input
+            attn_mask = attn_mask.unsqueeze(1).expand(B, self.n_head, T, T)
+            attn_mask.float().to(device)
+        if self.flash:
+            y = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask, dropout_p=self.dropout if self.training else 0, is_causal=True)
+        else:
+            att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
+            if is_causal and attn_mask is None:
+                attn_mask = torch.tril(torch.ones(T, T)).view(1,T,T).expand(B,T,T).to(device)
+                attn_mask = attn_mask.unsqueeze(1).expand(B, self.n_head, T, T)
+            if attn_mask is not None:
+                att = att.masked_fill(attn_mask == 0, torch.finfo(att.dtype).min)
+            att = F.softmax(att, dim=-1)
+            att = self.attn_dropout(att)
+            y = att @ v
+        y = y.transpose(1, 2).contiguous().view(B, T, C)
+        y = self.resid_dropout(self.c_proj(y))
+        return y
+def init_elm_model(model_args=ModelArgs(), device="cuda", model_config_dict=None):
+    """Initialize ELM model using default or model_config parameters."""
+    if model_config_dict:
+        model_args = ModelArgs(**model_config_dict)
+    dtype = torch.bfloat16 if device=="cuda" and torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float16
+    model = ELM(model_args=model_args).to(dtype=dtype)
+    return model
+def get_h_layers_in_ckpt(ckpt_state_dict,
+                         layer_name_template = 'slice_transformer.h.{layer_num}.'):
+    num_layers_in_ckpt = 0
+    from collections import defaultdict
+    layer_wise_dict = defaultdict(lambda: defaultdict(list))
+    layer_num_found = True
+    while layer_num_found:
+        layer_num_found = False
+        for layer_name in ckpt_state_dict.keys():
+            if layer_name_template.format(layer_num=num_layers_in_ckpt) in layer_name:
+                layer_wise_dict[num_layers_in_ckpt][layer_name] = ckpt_state_dict[layer_name]
+                layer_num_found = True
+        num_layers_in_ckpt += 1
+    return layer_wise_dict
+def load_elm_model_from_ckpt(ckpt_dir, device='cuda', load_partial=False, model_args=ModelArgs(), get_num_layers_from_ckpt=True):
+    """Load ELM model from local checkpoint."""
+    print(f"Loading ELM checkpoint from {ckpt_dir}")
+    ckpt_path = os.path.join(ckpt_dir, 'ckpt.pt')
+    checkpoint = torch.load(ckpt_path, map_location=device)
+    if get_num_layers_from_ckpt:
+        layer_name_template = 'slice_transformer.h.{layer_num}.'
+        ckpt_layer_wise_dict = get_h_layers_in_ckpt(checkpoint['model'],
+                                                    layer_name_template = layer_name_template)
+        model_args.num_layers = len(ckpt_layer_wise_dict)
+    model = init_elm_model(model_args=model_args, device=device)
+    ckpt_state_dict = checkpoint['model']
+    unwanted_prefix = '_orig_mod.'
+    for k,v in list(ckpt_state_dict.items()):
+        if k.startswith(unwanted_prefix):
+            ckpt_state_dict[k[len(unwanted_prefix):]] = ckpt_state_dict.pop(k)
+    if load_partial:
+        mod_state_dict = model.state_dict()
+        for k,v in list(ckpt_state_dict.items()):
+            if k in mod_state_dict:
+                v_size = v.size()
+                mod_size = mod_state_dict[k].size()
+                if v_size == mod_size:
+                    mod_state_dict[k] = v
+                else:
+                    if len(v_size) == 1:
+                        mod_state_dict[k][:v_size[-1]] = v
+                    elif len(v_size) == 2:
+                        mod_state_dict[k][:v_size[-2], :v_size[-1]] = v
+        ckpt_state_dict = mod_state_dict
+    load_status = model.load_state_dict(ckpt_state_dict)
+    print(load_status)
+    model.to(device)
+    return model
+def sample_top_p(probs, threshold):
+    """Perform top-p sampling on probability distribution using a threshold."""
+    probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True)
+    probs_sum = torch.cumsum(probs_sort, dim=-1)
+    mask = probs_sum - probs_sort > threshold
+    probs_sort[mask] = 0.0
+    probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True))
+    next_token = torch.multinomial(probs_sort, num_samples=1)
+    next_token = torch.gather(probs_idx, -1, next_token)
+    return next_token

elm/positional_embeddings.py ADDED Viewed

	@@ -0,0 +1,86 @@

+import torch
+from typing import Optional, Tuple
+def rotate_half(x):
+    x1, x2 = x.chunk(2, dim=-1)
+    return torch.cat((-x2, x1), dim=-1)
+@torch.jit.script
+def apply_rotary_pos_emb(x, cos, sin):
+    # NOTE: This could probably be moved to Triton
+    # Handle a possible sequence length mismatch in between q and k
+    cos = cos[:, :, : x.shape[-2], :]
+    sin = sin[:, :, : x.shape[-2], :]
+    return (x * cos) + (rotate_half(x) * sin)
+class RotaryEmbedding(torch.nn.Module):
+    """
+    Rotary position embeddings from RoFormer (Su et. al, 2021).
+    """
+    def __init__(self, dim_model: int, *_, **__):
+        super().__init__()
+        # Generate and save the inverse frequency buffer (non trainable)
+        inv_freq = 1.0 / (10000 ** (torch.arange(0, dim_model, 2).float() / dim_model))
+        self.register_buffer("inv_freq", inv_freq)
+        self._seq_len_cached = None
+        self._cos_cached = None
+        self._sin_cached = None
+    def update_cos_sin_tables(self, x, seq_dimension=1):
+        seq_len = x.shape[seq_dimension]
+        # Reset the tables if the sequence length has changed,
+        # or if we're on a new device (possibly due to tracing for instance)
+        if (
+            seq_len != self._seq_len_cached
+            or self._cos_cached.device != x.device
+            or self._cos_cached.dtype != x.dtype
+        ):
+            self._seq_len_cached = seq_len
+            t = torch.arange(
+                x.shape[seq_dimension], device=x.device, dtype=torch.float32
+            )
+            freqs = torch.einsum("i,j->ij", t, self.inv_freq.to(x.dtype))
+            emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
+            self._cos_cached = emb.cos()[None, None, :, :].to(x.dtype)
+            self._sin_cached = emb.sin()[None, None, :, :].to(x.dtype)
+        return self._cos_cached, self._sin_cached
+    def forward(
+        self, q: torch.Tensor, k: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        self._cos_cached, self._sin_cached = self.update_cos_sin_tables(
+            k, seq_dimension=-2
+        )
+        return (
+            apply_rotary_pos_emb(q, self._cos_cached, self._sin_cached),
+            apply_rotary_pos_emb(k, self._cos_cached, self._sin_cached),
+        )
+def __test_rope__():
+    dtype=torch.float16
+    batch=4
+    seqlen=2048
+    dim=4096
+    num_heads=32
+    dim_key_head=dim // num_heads
+    x=torch.randn(batch,seqlen,num_heads,dim_key_head).to(dtype=dtype).to('cuda')
+    rpe=RotaryEmbedding(dim_key_head).to(dtype=dtype).to('cuda')
+    q,k=rpe(q=x,k=x)
+#__test_rope__()

elm/utils.py ADDED Viewed

	@@ -0,0 +1,25 @@

+# Copyright (c) 2024, SliceX AI, Inc. All Rights Reserved.
+from prettytable import PrettyTable
+def count_parameters(model):
+    """Count the number of parameters in the model."""
+    table = PrettyTable(["Modules", "Parameters"])
+    total_params = 0
+    for name, parameter in model.named_parameters():
+        if not parameter.requires_grad: continue
+        params = parameter.numel()
+        table.add_row([name, params])
+        total_params+=params
+    print(table)
+    print(f"Total Trainable Params: {total_params}")
+    return total_params
+def batchify(lst, n):
+    """Divide a list into chunks of size n."""
+    return [lst[i:i + n] for i in range(0, len(lst), n)]

models/.gitattributes ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ “*.pt” filter=lfs diff=lfs merge=lfs -text
2	+ *.pt filter=lfs diff=lfs merge=lfs -text

requirements.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ torch
2	+ transformers

run.py ADDED Viewed

	@@ -0,0 +1,24 @@

+import os
+import argparse
+import json
+from elm.infer_elm import generate_elm_responses
+# Command-line interface: one positional argument, the model directory that
+# run() reads example prompts from and passes on to generate_elm_responses().
+parser = argparse.ArgumentParser(description='run prompts with elm model.')
+parser.add_argument('elm_model_path', help='Path to the elm_model_path')
+def get_prompt_config_file(elm_model_path):
+    return os.path.join(elm_model_path, "example_prompts.json")
+def run(elm_model_path: str):
+    prompt_config_file = get_prompt_config_file(elm_model_path)
+    with open(prompt_config_file, "r") as f:
+        prompt_info = json.load(f)
+    prompts = [prompt_info["template"].format(input=input) for input in prompt_info["inputs"]]
+    print(f"Loaded prompts from: {prompt_config_file}")
+    generate_elm_responses(elm_model_path, prompts, verbose=True)
+if __name__ == "__main__":
+    args = parser.parse_args()
+    run(args.elm_model_path)