diff --git a/.github/ISSUE_TEMPLATE/---bug-report.md b/.github/ISSUE_TEMPLATE/---bug-report.md new file mode 100644 index 0000000000000000000000000000000000000000..a0d6c303eacb737308aeb5e2d78e75722bda4b21 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/---bug-report.md @@ -0,0 +1,36 @@ +--- +name: "\U0001F41B Bug report" +about: Submit a bug report to improve our library! +title: '' +labels: bug +assignees: '' + +--- + + + +## Environment + + + +## To reproduce + +Steps to reproduce the behavior: + +1. +2. +3. + +## Expected behavior + + + +## Additional context + + diff --git a/.github/ISSUE_TEMPLATE/---feature-request.md b/.github/ISSUE_TEMPLATE/---feature-request.md new file mode 100644 index 0000000000000000000000000000000000000000..800be70e93ee6f2a24d1764a24d73adb0d96f352 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/---feature-request.md @@ -0,0 +1,25 @@ +--- +name: "\U0001F680 Feature request" +about: Suggest an idea for this project +title: '' +labels: enhancement +assignees: '' + +--- + + + +## 🚀 Feature Request + + +## Motivation + + + +## [Optional] Implementation + + + +## Additional context + + diff --git a/.github/ISSUE_TEMPLATE/---model-questions.md b/.github/ISSUE_TEMPLATE/---model-questions.md new file mode 100644 index 0000000000000000000000000000000000000000..b186ec4e817594933dbfc572db0c7aab1e9e1686 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/---model-questions.md @@ -0,0 +1,17 @@ +--- +name: "\U00002753 Model-related question" +about: Ask a question about using our released models +title: '' +labels: question +assignees: '' + +--- + + + +## ❓ Question + + +## Additional context + + diff --git a/.github/mcp/mcp_pytest.py b/.github/mcp/mcp_pytest.py new file mode 100644 index 0000000000000000000000000000000000000000..1dd5645bcae429a5e514e7eaf1294566293aadcf --- /dev/null +++ b/.github/mcp/mcp_pytest.py @@ -0,0 +1,139 @@ +# Copyright 2022 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +"""Run pytest using MCP.""" + +import argparse +import time + +from mcli.sdk import (RunConfig, RunStatus, create_run, follow_run_logs, + stop_run, wait_for_run_status) + +if __name__ == '__main__': + + parser = argparse.ArgumentParser() + parser.add_argument('--name', + type=str, + default='mcp-pytest', + help='Base name of run') + parser.add_argument('--cluster', + type=str, + default='r1z4', + help='Cluster to use') + parser.add_argument('--gpu_type', + type=str, + default='a100_40gb', + help='Type of GPU to use') + parser.add_argument('--gpu_num', + type=int, + default=2, + help='Number of the GPU to use') + parser.add_argument('--image', + type=str, + default='mosaicml/pytorch:latest', + help='Docker image to use') + parser.add_argument('--git_branch', + type=str, + help='Git branch to check out') + parser.add_argument( + '--git_commit', + type=str, + help='Git commit to check out. Overrides git_branch if specified') + parser.add_argument( + '--pr_number', + type=int, + help= + 'PR number to check out. 
Overrides git_branch/git_commit if specified') + parser.add_argument('--pytest_markers', + type=str, + help='Markers to pass to pytest') + parser.add_argument('--pytest_command', + type=str, + help='Command to run pytest') + parser.add_argument('--timeout', + type=int, + default=1800, + help='Timeout for run (in seconds)') + args = parser.parse_args() + + name = args.name + git_integration = { + 'integration_type': 'git_repo', + 'git_repo': 'mosaicml/llm-foundry', + 'ssh_clone': 'False', + } + if args.git_branch is not None and args.git_commit is None: + name += f'-branch-{args.git_branch}' + git_integration['git_branch'] = args.git_branch + if args.git_commit is not None: + name += f'-commit-{args.git_commit}' + git_integration['git_commit'] = args.git_commit + + command = 'cd llm-foundry' + + # Checkout a specific PR if specified + if args.pr_number is not None: + name += f'-pr-{args.pr_number}' + command += f''' + + git fetch origin pull/{args.pr_number}/head:pr_branch + + git checkout pr_branch + + ''' + + # Shorten name if too long + if len(name) > 56: + name = name[:56] + + command += f''' + + pip install --upgrade --user .[all] + + export COMMON_ARGS="-v --durations=20 -m '{args.pytest_markers}'" + + make test PYTEST='{args.pytest_command}' EXTRA_ARGS="$COMMON_ARGS --codeblocks" + + make test-dist PYTEST='{args.pytest_command}' EXTRA_ARGS="$COMMON_ARGS" WORLD_SIZE=2 + + python -m coverage combine + + python -m coverage report + ''' + + config = RunConfig( + name=name, + cluster=args.cluster, + gpu_type=args.gpu_type, + gpu_num=args.gpu_num, + image=args.image, + integrations=[git_integration], + command=command, + ) + + # Create run + run = create_run(config) + print(f'[GHA] Run created: {run.name}') + + # Wait until run starts before fetching logs + run = wait_for_run_status(run, status='running') + start_time = time.time() + print('[GHA] Run started. Following logs...') + + # Print logs + for line in follow_run_logs(run): + print(line, end='') + # Check if args.timeout seconds have elapsed + if time.time() - start_time > args.timeout: + print( + f'[GHA] Run timed out and did not complete in {args.timeout/60} minutes.' + ) + run = stop_run(run) + print('[GHA] Run stopped.') + break + + print('[GHA] Run completed. 
Waiting for run to finish...') + run = wait_for_run_status(run, status='completed') + + # Fail if command exited with non-zero exit code or timed out + assert run.status == RunStatus.COMPLETED diff --git a/.github/workflows/FUNDING.md b/.github/workflows/FUNDING.md new file mode 100644 index 0000000000000000000000000000000000000000..c626b001b20836d3548b5b72932611ebdfaf7d58 --- /dev/null +++ b/.github/workflows/FUNDING.md @@ -0,0 +1,13 @@ +# These are supported funding model platforms + +github: [kyegomez] +patreon: # Replace with a single Patreon username +open_collective: # Replace with a single Open Collective username +ko_fi: # Replace with a single Ko-fi username +tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel +community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry +liberapay: # Replace with a single Liberapay username +issuehunt: # Replace with a single IssueHunt username +otechie: # Replace with a single Otechie username +lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry +custom: #Nothing diff --git a/.github/workflows/code-quality.yaml b/.github/workflows/code-quality.yaml new file mode 100644 index 0000000000000000000000000000000000000000..69cd901726483c9be93dbc315e90b0c752bd5cc5 --- /dev/null +++ b/.github/workflows/code-quality.yaml @@ -0,0 +1,44 @@ +name: Code Quality Checks +on: + push: + branches: + - main + - release/** + pull_request: + branches: + - main + - release/** + workflow_call: + workflow_dispatch: +# Cancel old runs when a new commit is pushed to the same branch if not on main or dev +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} +defaults: + run: + working-directory: . +jobs: + code-quality: + runs-on: ubuntu-20.04 + timeout-minutes: 10 + strategy: + matrix: + python_version: + - '3.8' + - '3.9' + - '3.10' + pip_deps: + - '[dev]' + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python_version }} + - name: Setup + run: | + set -ex + python -m pip install --upgrade 'pip<23' wheel + python -m pip install --upgrade .${{ matrix.pip_deps }} + - name: Run checks + run: | + pre-commit run --all-files diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml new file mode 100644 index 0000000000000000000000000000000000000000..7fb270db975093bab956fe7bd5db69ce74aa3237 --- /dev/null +++ b/.github/workflows/codeql-analysis.yml @@ -0,0 +1,70 @@ +# For most projects, this workflow file will not need changing; you simply need +# to commit it to your repository. +# +# You may wish to alter this file to override the set of languages analyzed, +# or to provide custom queries or build logic. +# +# ******** NOTE ******** +# We have attempted to detect the languages in your repository. Please check +# the `language` matrix defined below to confirm you have the correct set of +# supported CodeQL languages. 
+# +name: 'CodeQL' + +on: + push: + branches: [main] + pull_request: + # The branches below must be a subset of the branches above + branches: [main] + schedule: + - cron: '0 9 * * 1' # Every Monday at 09:00 (9:00 AM) + +jobs: + analyze: + name: Analyze + runs-on: ubuntu-latest + permissions: + actions: read + contents: read + security-events: write + + strategy: + fail-fast: false + matrix: + language: ['python'] + # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] + # Learn more about CodeQL language support at https://git.io/codeql-language-support + + steps: + - name: Checkout repository + uses: actions/checkout@v2 + + # Initializes the CodeQL tools for scanning. + - name: Initialize CodeQL + uses: github/codeql-action/init@v2 + with: + languages: ${{ matrix.language }} + # If you wish to specify custom queries, you can do so here or in a config file. + # By default, queries listed here will override any specified in a config file. + # Prefix the list here with "+" to use these queries and those in the config file. + # queries: ./path/to/local/query, your-org/your-repo/queries@main + + # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). + # If this step fails, then you should remove it and run the build manually (see below) + - name: Autobuild + uses: github/codeql-action/autobuild@v2 + + # ℹī¸ Command-line programs to run using the OS shell. + # 📚 https://git.io/JvXDl + + # ✏ī¸ If the Autobuild fails above, remove it and uncomment the following three lines + # and modify them (or add more) to build your code if your project + # uses a compiled language + + # - run: | + # make bootstrap + # make release + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v2 diff --git a/.github/workflows/coverage.yaml b/.github/workflows/coverage.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f89d67ec39fd7c4e608ea8bdeda75fdb010d1aa0 --- /dev/null +++ b/.github/workflows/coverage.yaml @@ -0,0 +1,32 @@ +name: PyTest Coverage +on: + workflow_call: + inputs: + download-path: + required: true + type: string +jobs: + coverage: + timeout-minutes: 5 + runs-on: ubuntu-latest + steps: + - name: Checkout Repo + uses: actions/checkout@v3 + - name: Setup + run: | + set -ex + python -m pip install --upgrade 'pip<23' wheel + pip install coverage[toml]==6.5.0 + - name: Download artifacts + uses: actions/download-artifact@v3 + with: + path: ${{ inputs.download-path }} + - name: Generate coverage report + run: | + set -ex + + # Flatten the coverage files + ls ${{ inputs.download-path }} | while read x; do mv ${{ inputs.download-path }}/$x/.coverage .coverage.$x; done + + python -m coverage combine + python -m coverage report diff --git a/.github/workflows/docker.yaml b/.github/workflows/docker.yaml new file mode 100644 index 0000000000000000000000000000000000000000..28084b7fb48ec3ae2c629181e65f12dfc32dfd59 --- /dev/null +++ b/.github/workflows/docker.yaml @@ -0,0 +1,62 @@ +name: Docker +on: + push: + branches: + - main + workflow_dispatch: {} +jobs: + docker-build: + runs-on: ubuntu-latest + if: github.repository_owner == 'mosaicml' + strategy: + matrix: + include: + - name: '1.13.1_cu117' + base_image: mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04 + - name: '2.0.1_cu118' + base_image: mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04 + + steps: + - name: Maximize Build Space on Worker + uses: easimon/maximize-build-space@v4 + with: + overprovision-lvm: true + remove-dotnet: true + remove-android: true + 
remove-haskell: true + + - name: Checkout + uses: actions/checkout@v3 + + - name: Setup QEMU + uses: docker/setup-qemu-action@v2 + + - name: Setup Docker Buildx + uses: docker/setup-buildx-action@v2 + + - name: Login to DockerHub + uses: docker/login-action@v2 + with: + username: ${{ secrets.DOCKER_HUB_USERNAME }} + password: ${{ secrets.DOCKER_HUB_PASSWORD }} + + - name: Calculate Docker Image Variables + run: | + set -euxo pipefail + + ################### + # Calculate the tag + ################### + GIT_SHA=$(echo ${{ github.sha }} | cut -c1-7) + echo "IMAGE_TAG=${GIT_SHA}" >> ${GITHUB_ENV} + + - name: Build and Push the Docker Image + uses: docker/build-push-action@v3 + with: + context: . + tags: mosaicml/llm-foundry:${{ matrix.name }}-latest, + mosaicml/llm-foundry:${{ matrix.name }}-${{ env.IMAGE_TAG }} + push: true + cache-from: type=registry,ref=mosaicml/llm-foundry:${{ matrix.name }}-buildcache + cache-to: type=registry,ref=mosaicml/llm-foundry:${{ matrix.name }}-buildcache,mode=max + build-args: BASE_IMAGE=${{ matrix.base_image }} diff --git a/.github/workflows/pr-cpu.yaml b/.github/workflows/pr-cpu.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6af87346c8e5c0c46aaf8156483c01af64ee27a5 --- /dev/null +++ b/.github/workflows/pr-cpu.yaml @@ -0,0 +1,43 @@ +name: PR CPU tests +on: + push: + branches: + - main + - release/* + pull_request: + branches: + - main + - release/* + workflow_dispatch: +# Cancel old runs when a new commit is pushed to the same branch if not on main or dev +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} +jobs: + pytest-cpu: + uses: ./.github/workflows/pytest-cpu.yaml + strategy: + matrix: + include: + - name: 'cpu-latest' + container: mosaicml/pytorch:latest_cpu # mosaicml/pytorch:1.13.1_cpu-python3.10-ubuntu20.04 + markers: 'not gpu' + pytest_command: 'coverage run -m pytest' + - name: 'cpu-2.0.1' + container: mosaicml/pytorch:2.0.1_cpu-python3.10-ubuntu20.04 + markers: 'not gpu' + pytest_command: 'coverage run -m pytest' + name: ${{ matrix.name }} + if: github.repository_owner == 'mosaicml' + with: + container: ${{ matrix.container }} + name: ${{ matrix.name }} + pytest-command: ${{ matrix.pytest_command }} + pytest-markers: ${{ matrix.markers }} + coverage: + uses: ./.github/workflows/coverage.yaml + name: Coverage Results + if: github.repository_owner == 'mosaicml' + needs: [pytest-cpu] + with: + download-path: artifacts diff --git a/.github/workflows/pr-gpu.yaml b/.github/workflows/pr-gpu.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d228802ddc5a55f746e1e9ad9490323158f43453 --- /dev/null +++ b/.github/workflows/pr-gpu.yaml @@ -0,0 +1,40 @@ +name: PR GPU tests +on: + push: + branches: + - main + - release/* + pull_request_target: + branches: + - main + - release/** + workflow_dispatch: +# Cancel old runs when a new commit is pushed to the same branch if not on main or dev +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} +jobs: + pytest-gpu: + uses: ./.github/workflows/pytest-gpu.yaml + strategy: + matrix: + include: + - name: 'gpu-latest' + container: mosaicml/pytorch:latest # mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04 + markers: 'gpu' + pytest_command: 'coverage run -m pytest' + - name: 'gpu-2.0.1' + container: mosaicml/pytorch:2.0.1_cu117-python3.10-ubuntu20.04 + 
markers: 'gpu' + pytest_command: 'coverage run -m pytest' + name: ${{ matrix.name }} + if: github.repository_owner == 'mosaicml' + with: + container: ${{ matrix.container }} + mcloud-timeout: 1200 + name: ${{ matrix.name }} + pytest-command: ${{ matrix.pytest_command }} + pytest-markers: ${{ matrix.markers }} + python-version: 3.9 + secrets: + mcloud-api-key: ${{ secrets.MCLOUD_API_KEY }} diff --git a/.github/workflows/pytest-cpu.yaml b/.github/workflows/pytest-cpu.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c5fe309cf3e2cac0693fa101a751635fb584e6fd --- /dev/null +++ b/.github/workflows/pytest-cpu.yaml @@ -0,0 +1,48 @@ +name: Pytest CPU +on: + workflow_call: + inputs: + container: + required: true + type: string + name: + required: true + type: string + pytest-command: + required: true + type: string + pytest-markers: + required: true + type: string +jobs: + pytest-cpu: + timeout-minutes: 30 + runs-on: ubuntu-latest + container: ${{ inputs.container }} + steps: + - name: Checkout Repo + uses: actions/checkout@v3 + - name: Setup + run: | + set -ex + export PATH=/composer-python:$PATH + python -m pip install --upgrade 'pip<23' wheel + python -m pip install --upgrade .[dev] + - name: Run Tests + id: tests + run: | + set -ex + export PATH=/composer-python:$PATH + export COMMON_ARGS="-v --durations=20 -m '${{ inputs.pytest-markers }}'" + + # Necessary to run git diff for doctests + git config --global --add safe.directory /__w/llm-foundry/llm-foundry + + make test PYTEST='${{ inputs.pytest-command }}' EXTRA_ARGS="$COMMON_ARGS --codeblocks" + # make test-dist PYTEST='${{ inputs.pytest-command }}' EXTRA_ARGS="$COMMON_ARGS" WORLD_SIZE=2 + + python -m coverage combine + - uses: actions/upload-artifact@v3 + with: + name: coverage-${{ github.sha }}-${{ inputs.name }} + path: .coverage diff --git a/.github/workflows/pytest-gpu.yaml b/.github/workflows/pytest-gpu.yaml new file mode 100644 index 0000000000000000000000000000000000000000..45b49366c9210099479bc8ea6a274121897227cd --- /dev/null +++ b/.github/workflows/pytest-gpu.yaml @@ -0,0 +1,80 @@ +name: Pytest GPU +on: + workflow_call: + inputs: + container: + required: true + type: string + mcloud-timeout: + required: false + type: number + default: 1800 + name: + required: true + type: string + pytest-command: + required: true + type: string + pytest-markers: + required: true + type: string + python-version: + required: false + type: string + default: 3.9 + secrets: + mcloud-api-key: + required: true +jobs: + pytest-gpu: + timeout-minutes: 60 # ${{ inputs.gha-timeout }} for some reason not able to turn this into an input + runs-on: ubuntu-latest + env: + MOSAICML_API_KEY: ${{ secrets.mcloud-api-key }} + steps: + - name: Checkout Repo + uses: actions/checkout@v3 + - name: Setup Python + uses: actions/setup-python@v4 + with: + python-version: ${{ inputs.python-version }} + - name: Cache pip + uses: actions/cache@v3 + with: + # This path is specific to Ubuntu + path: ~/.cache/pip + # Look to see if there is a cache hit for the corresponding requirements file + key: ${{ runner.os }}-pip-${{ hashFiles('setup.py') }} + restore-keys: | + ${{ runner.os }}-pip- + ${{ runner.os }}- + - name: Setup MCLI + run: | + set -ex + python -m pip install mosaicml-cli + mcli init --mcloud + mcli version + - name: Submit Run + id: tests + run: | + set -ex + + PR_NUMBER="$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH")" + REF_ARGS="" + + # Use the PR number if it exists, commit SHA for protected branches and the branch name otherwise 
+ if [ -z "$PR_NUMBER" ] || [ "$PR_NUMBER" = "null" ]; then + if [[ "$GITHUB_REF" =~ "refs/heads/main" || "$GITHUB_REF" =~ "refs/heads/release" ]]; then + REF_ARGS="--git_commit $GITHUB_SHA" + else + REF_ARGS="--git_branch $GITHUB_REF_NAME" + fi + else + REF_ARGS="--pr_number $PR_NUMBER" + fi + + python .github/mcp/mcp_pytest.py \ + --image '${{ inputs.container }}' \ + --pytest_markers '${{ inputs.pytest-markers }}' \ + --pytest_command '${{ inputs.pytest-command }}' \ + --timeout ${{ inputs.mcloud-timeout }} ${REF_ARGS} diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml new file mode 100644 index 0000000000000000000000000000000000000000..dbaedd4f32c5fdcd84b0c250af1994b82ff4a671 --- /dev/null +++ b/.github/workflows/python-publish.yml @@ -0,0 +1,39 @@ +# This workflow will upload a Python Package using Twine when a release is created +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries + +# This workflow uses actions that are not certified by GitHub. +# They are provided by a third-party and are governed by +# separate terms of service, privacy policy, and support +# documentation. + +name: Upload Python Package + +on: + release: + types: [published] + +permissions: + contents: read + +jobs: + deploy: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + - name: Set up Python + uses: actions/setup-python@v3 + with: + python-version: '3.x' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install build + - name: Build package + run: python -m build + - name: Publish package + uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 + with: + user: __token__ + password: ${{ secrets.PYPI_API_TOKEN }} \ No newline at end of file diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4f241323b952bd489af7879dacef76de28a78329 --- /dev/null +++ b/.github/workflows/release.yaml @@ -0,0 +1,60 @@ +name: Release + +on: + push: + tags: + - 'v*' + workflow_dispatch: + +jobs: + code-quality: + uses: ./.github/workflows/code-quality.yaml + + pypi-packaging: + name: Build and Publish llm-foundry PyPI Package + needs: + - code-quality + runs-on: ubuntu-latest + steps: + - name: Checkout source + uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v3 + with: + python-version: '3.9' + + - name: Build source and wheel distributions + run: | + if [[ "${{ github.ref }}" =~ refs\/tags\/v ]]; then + PYPI_PACKAGE_NAME="llm-foundry" + else + PYPI_PACKAGE_NAME="llm-foundry-test-$(date +%Y%m%d%H%M%S)" + fi + + # Remove the peft, xentropy-cuda-lib and triton-pre-mlir dependencies as PyPI does not + # support direct installs. The error message for importing PEFT, FusedCrossEntropy, + # and flash_attn_triton gives instructions on how to install if a user tries to use it + # without this dependency. 
+ sed '/xentropy-cuda-lib@git+https:\/\/github.com\/HazyResearch\/flash-attention.git@.*/d' -i setup.py + sed '/triton-pre-mlir@git+https:\/\/github.com\/vchiley\/triton.git@.*/d' -i setup.py + sed '/peft@git+https:\/\/github.com\/huggingface\/peft.git.*/d' -i setup.py + + python -m pip install --upgrade build twine + python -m build + twine check --strict dist/* + + - name: Publish đŸ“Ļ to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + if: contains(github.ref, 'refs/tags/v') + with: + user: __token__ + password: ${{ secrets.PROD_PYPI_API_TOKEN }} + + - name: Publish distribution đŸ“Ļ to Test PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + if: contains(github.ref, 'refs/heads/') || contains(github.ref, 'refs/pull/') + with: + user: __token__ + password: ${{ secrets.TEST_PYPI_API_TOKEN }} + repository_url: https://test.pypi.org/legacy/ diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..ad9f1788adaf39fffc3c8d4990a56673426514d1 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +.DS_Store +dist \ No newline at end of file diff --git a/Andromeda/README.md b/Andromeda/README.md index 8b0c987d2f04a886b84925d5afc8faa9248dad42..885beeff042bce399f738844450c59920a37d2f8 100644 --- a/Andromeda/README.md +++ b/Andromeda/README.md @@ -1,216 +1,121 @@ -[![Multi-Modality](agorabanner.png)](https://discord.gg/qUtxnK2NMf) - - -# Andromeda: Ultra-Fast and Ultra-Intelligent SOTA Language Model 🚀🌌 - -![Andromeda Next Generation Open Source Language Model](images/andromeda-banner.png) - -
- -[![Open Bounties](https://img.shields.io/endpoint?url=https%3A%2F%2Fconsole.algora.io%2Fapi%2Fshields%2Fkyegomez%2Fbounties%3Fstatus%3Dopen)](https://console.algora.io/org/kyegomez/bounties?status=open) -[![Rewarded Bounties](https://img.shields.io/endpoint?url=https%3A%2F%2Fconsole.algora.io%2Fapi%2Fshields%2Fkyegomez%2Fbounties%3Fstatus%3Dcompleted)](https://console.algora.io/org/kyegomez/bounties?status=completed) -[![GitHub issues](https://img.shields.io/github/issues/kyegomez/Andromeda)](https://github.com/kyegomez/Andromeda/issues) -[![GitHub forks](https://img.shields.io/github/forks/kyegomez/Andromeda)](https://github.com/kyegomez/Andromeda/network) -[![GitHub stars](https://img.shields.io/github/stars/kyegomez/Andromeda)](https://github.com/kyegomez/Andromeda/stargazers) -[![GitHub license](https://img.shields.io/github/license/kyegomez/Andromeda)](https://github.com/kyegomez/Andromeda/blob/main/LICENSE) -[![Share on Twitter](https://img.shields.io/twitter/url/https/twitter.com/cloudposse.svg?style=social&label=Share%20%40kyegomez/Andromeda)](https://twitter.com/intent/tweet?text=Check%20out%20this%20amazing%20AI%20project:%20Andromeda&url=https%3A%2F%2Fgithub.com%2Fkyegomez%2FAndromeda) -[![Share on Facebook](https://img.shields.io/badge/Share-%20facebook-blue)](https://www.facebook.com/sharer/sharer.php?u=https%3A%2F%2Fgithub.com%2Fkyegomez%2FAndromeda) -[![Share on LinkedIn](https://img.shields.io/badge/Share-%20linkedin-blue)](https://www.linkedin.com/shareArticle?mini=true&url=https%3A%2F%2Fgithub.com%2Fkyegomez%2FAndromeda&title=&summary=&source=) -![Discord](https://img.shields.io/discord/999382051935506503) -[![Share on Reddit](https://img.shields.io/badge/-Share%20on%20Reddit-orange)](https://www.reddit.com/submit?url=https%3A%2F%2Fgithub.com%2Fkyegomez%2FAndromeda&title=Andromeda%20-%20the%20next%20generation%20AI%20shields) -[![Share on Hacker News](https://img.shields.io/badge/-Share%20on%20Hacker%20News-orange)](https://news.ycombinator.com/submitlink?u=https%3A%2F%2Fgithub.com%2Fkyegomez%2FAndromeda&t=Andromeda%20-%20the%20next%20generation%20AI%20shields) -[![Share on Pinterest](https://img.shields.io/badge/-Share%20on%20Pinterest-red)](https://pinterest.com/pin/create/button/?url=https%3A%2F%2Fgithub.com%2Fkyegomez%2FAndromeda&media=https%3A%2F%2Fexample.com%2Fimage.jpg&description=Andromeda%20-%20the%20next%20generation%20AI%20shields) -[![Share on WhatsApp](https://img.shields.io/badge/-Share%20on%20WhatsApp-green)](https://api.whatsapp.com/send?text=Check%20out%20Andromeda%20-%20the%20next%20generation%20AI%20shields%20%23Andromeda%20%23AI%0A%0Ahttps%3A%2F%2Fgithub.com%2Fkyegomez%2FAndromeda) - -
- - - -Welcome to Andromeda, The Fastest, Most Creative, and Reliable Language Model Ever Built, train your own verison, conduct inference, and finetune your own verison with simple plug in and play scripts get started in 10 seconds, and: - - -- đŸ’ŧ Handle Ultra Long Sequences (32,000-200,000+ context lengths) -- ⚡ Ultra Fast Processing (32,000+ tokens in under 100ms) -- 🎓 Superior Reasoning Capabilities - ---- - -## 🔄 Updates - -- [READY FOR TRAINING, help us with the strategy!](https://www.figma.com/file/pfaU8Nhyw0EdXuT6z4Hutw/Andromeda-Strategy?type=whiteboard&node-id=0%3A1&t=Tub1wIzaPAXt2i86-1) -- [And, here is the WANDB link to watch Andromeda train live!](https://wandb.ai/apacai/Andromeda/overview?) - ---- - -## Appreciation -* All the creators in Agora, [Join Agora](https://discord.gg/qUtxnK2NMf) the community of AI engineers changing the world with their creations. -* LucidRains for inspiring me to devote myself to open source AI - - ------ -## Hiring -We're hiring: Engineers, Researchers, Interns, And, Customer Success Professionals to work on democratizing Andromeda, email me at with your story `kye@apac.ai` - ----------- - -## đŸ’ģ Usage - -There are two methods to use Andromeda - -1. `pip install TheBestLLMEver` - -2. `git clone https://github.com/kyegomez/Andromeda.git` - -For detailed instructions, refer to the [Training SOP](DOCs/TRAINING.md) and [Documentation](https://github.com/kyegomez/Andromeda/blob/master/DOCs/DOCUMENTATION.md). - -### Method 1 - -To get started: - -1. Clone the repository and install the required packages: - -```bash -git clone https://github.com/kyegomez/Andromeda -cd Andromeda -pip3 install -r requirements.txt -cd Andromeda -python3 train.py +# Transformer Model Technical Research Analysis + +This document provides an analysis of the hyperparameters and configurations of the given Transformer model, focusing on dimensions, depth, and heads, as well as an architectural overview of their meanings and use cases. + +## Model Configuration + +```python +model = Transformer( + num_tokens=20000, + max_seq_len=8192, + use_abs_pos_emb = False, + attn_layers = Decoder( + dim=512, + depth=6, + heads=8, + alibi_pos_bias=True, + alibi_num_heads=4, + rotary_xpos=True, + attn_flash = True, + deepnorm=True, + shift_tokens=1, + attn_one_kv_head = True, + ) +) ``` -For further instructions, refer to the [Training SOP](DOCs/TRAINING.md). - ---- - -## 📚 Training +### Hyperparameters -1. Set the environment variables: - - `ENTITY_NAME`: Your wandb project name - - `OUTPUT_DIR`: Directory to save the weights (e.g., `./weights`) - - `MASTER_ADDR`: For distributed training - - `MASTER_PORT` For master port distributed training - - `RANK`- Number of nodes services - - `WORLD_SIZE` Number of gpus - -2. Configure the training: - - Accelerate Config - - Enable Deepspeed 3 - - Accelerate launch train_distributed_accelerate.py - -For more information, refer to the [Training SOP](DOCs/TRAINING.md). - ---- - -## 🗃ī¸ Dataset Building - -To preprocess a - - different dataset similar to the C4 or Falcon dataset used during training, use the `build_dataset.py` script. This script pre-tokenizes the data, chunks it into blocks of a specified sequence length, and uploads it to the Huggingface hub. - -Example command: - -```bash -python3 Andromeda/build_dataset.py --seed 42 --seq_len 8192 --hf_account "HUGGINGFACE APIKEY" --tokenizer "EleutherAI/gpt-neox-20b" --dataset_name "EleutherAI/the_pile_deduplicated" -``` +1. **num_tokens**: The number of unique tokens in the input vocabulary. 
In this case, the model is configured to handle 20,000 unique tokens. ---- +2. **max_seq_len**: The maximum sequence length that the model can handle. The current configuration supports sequences of up to 8,192 tokens. -## 🚀 Why Andromeda? +3. **use_abs_pos_emb**: A boolean flag indicating whether to use absolute positional embeddings. The model is configured not to use absolute positional embeddings (`False`). -Andromeda offers several advantages: -- Andromeda offers reliable processing of 100,000+ sequence lengths extremely fast under 300ms -- Andromeda's dataset strategy was crafted with atomic precision and attention to detail for creativity and quantitative reasoning. -- Andromeda is extremely intelligent with the ability to think like a poet or make API Calls to your favorite apps. +4. **dim**: The dimensionality of the input embeddings and the internal representations within the Transformer layers. The model uses a dimensionality of 512. -For detailed information about the model architecture and methods, refer to the [Model Architecture](DOCs/MODEL_ARCHITECTURE.md) documentation. +5. **depth**: The number of Transformer layers (or blocks) in the model. This model has a depth of 6, meaning it has 6 layers. ---- +6. **heads**: The number of attention heads in the multi-head self-attention mechanism. This model uses 8 attention heads. -# đŸŽ¯ Andromeda Principles +### Additional Configurations -- **Efficiency**: Optimize with techniques like attention flashing, rotary position encodings, and deep normalization. -- **Flexibility**: Adapt to various tasks and domains for wide applications. -- **Scalability**: Designed to scale with resources and data sizes. -- **Community-Driven**: Thrives on contributions from the open-source community. +- **alibi_pos_bias**: A boolean flag indicating whether to use the Alibi position bias mechanism. The model is configured to use Alibi position bias (`True`). ---- +- **alibi_num_heads**: The number of Alibi attention heads to use. The model is configured to use 4 Alibi attention heads. -## 🚀 Get Involved +- **rotary_xpos**: A boolean flag indicating whether to use the rotary positional encoding mechanism. The model is configured to use rotary positional encoding (`True`). -We're just at the beginning of our journey. As we continue to develop and refine Andromeda, we invite you to join us. Whether you're a developer, researcher, or simply an enthusiast, your insights and contributions can help shape the future of Andromeda. +- **attn_flash**: A boolean flag indicating whether to use the Flash attention mechanism. The model is configured to use Flash attention (`True`). ---- +- **deepnorm**: A boolean flag indicating whether to use deep normalization. The model is configured to use deep normalization (`True`). -# 🤝 Contributing to Andromeda +- **shift_tokens**: The number of tokens to shift during training to form the target sequence. The model is configured to shift by 1 token (`1`). -We are thrilled to invite you to be a part of the Andromeda project. This is not just an open-source project but a community initiative, and we value your expertise and creativity. To show our appreciation, we have instituted a unique rewards system that directly compensates contributors from the revenue generated by the Andromeda API. +- **attn_one_kv_head**: A boolean flag indicating whether to use one key-value head for attention instead of multiple heads. The model is configured to use one key-value head (`True`). 
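+
+To make the cost of these settings concrete, here is a back-of-the-envelope parameter estimate derived from `num_tokens`, `dim`, and `depth`. It is an illustrative sketch using the standard ~12·dim² weights-per-layer approximation for a decoder-only Transformer (about 4·dim² for the attention projections plus 8·dim² for a 4x feed-forward block); it is not the exact count of the model above, and it ignores options such as `attn_one_kv_head` that shrink the key/value projections.
+
+```python
+def approx_param_count(num_tokens: int, dim: int, depth: int) -> int:
+    """Rough parameter estimate for a decoder-only Transformer."""
+    embedding = num_tokens * dim   # token embedding table
+    per_layer = 12 * dim * dim     # attention (~4*dim^2) + feed-forward (~8*dim^2)
+    return embedding + depth * per_layer
+
+
+# The configuration above: 20,000-token vocabulary, dim=512, depth=6
+print(f"~{approx_param_count(20_000, 512, 6) / 1e6:.1f}M parameters")  # ~29.1M
+```
+
+Doubling `dim` roughly quadruples the per-layer term, while doubling `depth` only doubles it, which is why the width/depth trade-off discussed below matters for both memory and compute.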
-## 🌟 Why Contribute +## Architectural Overview -Contributing to Andromeda not only enhances your skills and profile but also comes with financial rewards. When you contribute code, documentation, or any form of improvement to the Andromeda project, you are adding value. As such, we believe it's only fair that you share in the rewards. +### Dimensions -## 💰 Rewards Program +- **Input Embedding Dimension (dim)**: This hyperparameter defines the size of the input embeddings and the internal representations within the Transformer layers. A larger dimensionality can capture more complex relationships between tokens but may require more computational resources. -Here's how the Andromeda Rewards Program works: +### Depth -1. **Submit a Pull Request:** This can be a code enhancement, bug fix, documentation update, new feature, or any improvement to the project. +- **Number of Transformer Layers (depth)**: This hyperparameter defines the number of Transformer layers (or blocks) in the model. Each layer consists of a multi-head self-attention mechanism followed by a position-wise feed-forward network. Increasing the depth allows the model to capture more complex and hierarchical relationships between tokens but may also increase the risk of overfitting and require more computational resources. -2. **Review and Approval:** Our team will review your contribution. If it gets approved and merged, you become eligible for the rewards program. +### Heads -3. **Revenue Share:** Once your pull request is merged, you will receive a percentage of the revenue generated by the Andromeda API. The percentage will be determined based on the significance and impact of your contribution. +- **Number of Attention Heads (heads)**: This hyperparameter defines the number of attention heads in the multi-head self-attention mechanism. Each head processes the input sequence independently and captures different aspects of the relationships between tokens. The outputs of all heads are then concatenated and transformed to produce the final output. Increasing the number of attention heads can help the model capture more diverse and fine-grained relationships between tokens but may also increase computational complexity and memory requirements. -This means you're not just contributing to an open-source project; you're becoming a part of the Andromeda ecosystem. Your efforts can yield ongoing benefits as the Andromeda API grows and evolves. +## Benefits and Consequences of Increasing Hyperparameters -## 🚀 Becoming a Paid API +### Dimensions -As part of our growth strategy, we will be deploying Andromeda as a Paid API. The revenue generated from this API will not only sustain and further the project but also fund the rewards program. If you contribute anything to make Andromeda, you will receive recurring revenue from paid API requests! +**Benefits:** -## 🚀 How to Start Contributing +- Better representation: Increasing the dimensionality of the input embeddings and internal representations allows the model to capture more complex relationships between tokens. -If you're ready to become a part of Andromeda and contribute to the future of multimodal embeddings, here's what you need to do: +- Improved model expressiveness: A higher dimensionality may enable the model to learn more expressive features, leading to better performance on complex tasks. -1. Fork the repository. +**Consequences:** -2. Make your improvements or additions in your forked repository. 
+- Computational complexity: Increasing the dimensionality will increase the computational complexity of the model, which may lead to longer training and inference times. -3. Submit a pull request detailing the changes you've made. +- Memory requirements: A higher dimensionality will increase the memory requirements of the model, potentially limiting its applicability on resource-constrained hardware. -4. Our team will review your submission. If it's approved, it will be merged into the main repository, and you will become part of the Andromeda Rewards Program. +- Risk of overfitting: Models with a higher dimensionality may be more prone to overfitting, especially if the size of the training dataset is small. -Thank you for considering contributing to Andromeda. Your expertise and commitment to this project are what make it thrive. Let's build the future of multimodal embeddings together. +### Depth ---- +**Benefits:** -## đŸ—ēī¸ Roadmap +- Hierarchical representation: Increasing the depth of the model allows it to capture more complex and hierarchical relationships between tokens, which can lead to improved performance on tasks that require understanding long-range dependencies. -1. **Training phase**: Train Andromeda on a large-scale dataset to achieve SOTA performance in various natural language processing tasks. +- Enhanced feature extraction: Deeper models can extract features at different levels of abstraction, potentially improving their ability to generalize to new data. -2. **World-class inference infrastructure**: Establish a robust and efficient infrastructure that leverages techniques such as: +**Consequences:** - - Model quantization: Reduce memory and computational requirements without significant loss in performance. - - Distillation: Train smaller, faster models that retain the knowledge of the larger model. - - Optimized serving frameworks: Deploy Andromeda using efficient serving frameworks, such as NVIDIA Triton or TensorFlow Serving, for rapid inference. +- Computational complexity: Increasing the depth will increase the computational complexity of the model, leading to longer training and inference times. -3. **Continuous improvement**: Continuously fine-tune Andromeda on diverse data sources and adapt it to new tasks and domains. +- Memory requirements: A deeper model will require more memory, potentially limiting its applicability on resource-constrained hardware. -4. **Community-driven development**: Encourage open-source contributions, including pre-processing improvements, advanced training techniques, and novel use cases. +- Risk of overfitting: Deeper models may be more prone to overfitting, especially if the size of the training dataset is small. ---- +- Vanishing/exploding gradients: Deeper models may suffer from vanishing or exploding gradients during training, making it harder to optimize the model. Techniques such as layer normalization or skip connections can help mitigate this issue. -## 📈 Benchmarks +### Heads -### Speed -- Andromeda utilizes one of the most reliable Attentions ever, flash attention 2.0 Triton. It consumes 50x less memory than GPT-3 and 10x less than LLAMA. +**Benefits:** -![AndromedaBanner](images/andromeda_performance.png) +- Diverse attention: Increasing the number of attention heads allows the model to capture more diverse and fine-grained relationships between tokens, which can improve its ability to understand the input data. -- We can speed this up even more with dynamic sparse flash attention 2.0. 
+- Robustness: Multi-head attention can make the model more robust, as each head can focus on different aspects of the input data. ---- +**Consequences:** -# 🔮 Join the Journey +- Computational complexity: Increasing the number of attention heads will increase the computational complexity of the model, leading to longer training and inference times. -We're just getting started, and we invite you to join the journey. Let's revolutionize the NLP landscape together! 🚀🌟 +- Memory requirements: A model with more attention heads will require more memory, potentially limiting its applicability on resource-constrained hardware. -- Join Agora and work with 2,000+ AI Engineers to implement all new features. -- Provide compute and help train Andromeda. -- Share the message on how we're liberating this superintelligent AI and seizing the power from the corrupt, providing it back to you. +- Diminishing returns: There may be diminishing returns when increasing the number of attention heads beyond a certain point, as the model may already be capturing most of the relevant information with fewer heads. \ No newline at end of file diff --git a/Andromeda/__init__.py b/Andromeda/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ab18a31b670c694547ed9732c3fe61263df8baf3 --- /dev/null +++ b/Andromeda/__init__.py @@ -0,0 +1,3 @@ +# from Andromeda.train import Train +from Andromeda.model import AndromedaTokenizer, Andromeda +from Andromeda.train import Train, train \ No newline at end of file diff --git a/Andromeda/configs.py b/Andromeda/configs.py new file mode 100644 index 0000000000000000000000000000000000000000..4da16ec57cc8a288ec9951838f47d0cf6aade0c0 --- /dev/null +++ b/Andromeda/configs.py @@ -0,0 +1,128 @@ +from Andromeda.model import AndromedaEmbedding, Andromeda + + +Andromeda1Billion = Andromeda( + num_tokens=25000, + max_seq_len=4192, + dim=2048, + depth=16, + dim_head=128, + heads=8, + use_abs_pos_emb=False, + alibi_pos_bias=True, + alibi_num_heads=4, + rotary_xpos=True, + attn_flash=True, + # shift_tokens=1, + attn_one_kv_head=True, + qk_norm=True, + attn_qk_norm=True, + attn_qk_norm_dim_scale=True, + embedding_provider=AndromedaEmbedding() +) + + + +Andromeda3Billion = Andromeda( + num_tokens=50432, + max_seq_len=8192, + dim=3072, + depth=24, + dim_head=128, + heads=12, + use_abs_pos_emb=False, + alibi_pos_bias=True, + alibi_num_heads=6, + rotary_xpos=True, + attn_flash=True, + shift_tokens=1, + attn_one_kv_head=True, + qk_norm=True, + attn_qk_norm=True, + attn_qk_norm_dim_scale=True, + embedding_provider=AndromedaEmbedding() +) + + + +Andromeda7Billion = Andromeda( + num_tokens=50432, + max_seq_len=8192, + dim=4096, + depth=32, + dim_head=128, + heads=16, + use_abs_pos_emb=False, + alibi_pos_bias=True, + alibi_num_heads=8, + rotary_xpos=True, + attn_flash=True, + shift_tokens=1, + attn_one_kv_head=True, + qk_norm=True, + attn_qk_norm=True, + attn_qk_norm_dim_scale=True, + embedding_provider=AndromedaEmbedding() +) + +Andromeda10Billion = Andromeda( + num_tokens=50432, + max_seq_len=8192, + dim=5120, + depth=32, + dim_head=128, + heads=20, + use_abs_pos_emb=False, + alibi_pos_bias=True, + alibi_num_heads=4, + rotary_xpos=True, + attn_flash=True, + shift_tokens=1, + attn_one_kv_head=True, + qk_norm=True, + attn_qk_norm=True, + attn_qk_norm_dim_scale=True, + embedding_provider=AndromedaEmbedding() +) + +Andromeda15Billion = Andromeda( + num_tokens=50432, + max_seq_len=8192, + dim=6144, + depth=40, + dim_head=128, + heads=24, + use_abs_pos_emb=False, + 
alibi_pos_bias=True, + alibi_num_heads=4, + rotary_xpos=True, + attn_flash=True, + shift_tokens=1, + attn_one_kv_head=True, + qk_norm=True, + attn_qk_norm=True, + attn_qk_norm_dim_scale=True, + embedding_provider=AndromedaEmbedding() +) + +Andromeda20Billion = Andromeda( + num_tokens=50432, + max_seq_len=8192, + dim=7168, + depth=48, + dim_head=128, + heads=28, + use_abs_pos_emb=False, + alibi_pos_bias=True, + alibi_num_heads=4, + rotary_xpos=True, + attn_flash=True, + shift_tokens=1, + attn_one_kv_head=True, + qk_norm=True, + attn_qk_norm=True, + attn_qk_norm_dim_scale=True, + embedding_provider=AndromedaEmbedding() +) + +#to GPT like 176Billion Parameters 122888 dimension, 96 depth, 96 heads, attn dim head 128 \ No newline at end of file diff --git a/Andromeda/core/__init__.py b/Andromeda/core/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..bbbe7b70bee5f9ca083099216de514a2d860a399 --- /dev/null +++ b/Andromeda/core/__init__.py @@ -0,0 +1,8 @@ +import torch +from packaging import version + +if version.parse(torch.__version__) >= version.parse('2.0.0'): + from einops._torch_specific import allow_ops_in_compiled_graph + allow_ops_in_compiled_graph() + + diff --git a/Andromeda/core/attend.py b/Andromeda/core/attend.py new file mode 100644 index 0000000000000000000000000000000000000000..ec679c054258a12c215b120e99923c15b6bc03b3 --- /dev/null +++ b/Andromeda/core/attend.py @@ -0,0 +1,252 @@ +from functools import partial + +import torch +from torch import nn, einsum, Tensor +import torch.nn.functional as F + +from collections import namedtuple +from functools import wraps +from packaging import version +from dataclasses import dataclass +from einops import rearrange + +from Andromeda.core.flash import attention + +# from flash import FlashAttention + +# constants + +EfficientAttentionConfig = namedtuple('EfficientAttentionConfig', ['enable_flash', 'enable_math', 'enable_mem_efficient']) + +@dataclass +class Intermediates: + qk_similarities: Tensor = None + pre_softmax_attn: Tensor = None + post_softmax_attn: Tensor = None + +# helpers + +def exists(val): + return val is not None + +def default(val, d): + return val if exists(val) else d + +def once(fn): + called = False + @wraps(fn) + def inner(x): + nonlocal called + if called: + return + called = True + return fn(x) + return inner + +print_once = once(print) + +# main class + +class Attend(nn.Module): + def __init__( + self, + *, + dropout = 0., + causal = False, + heads = None, + talking_heads = False, + scale = None, + qk_norm = False, + flash = False, + triton = False, + ): + super().__init__() + self.scale = scale + self.qk_norm = qk_norm + self.causal = causal + self.attn_fn = partial(F.softmax, dtype = torch.float32) if not qk_norm else F.softmax + + self.dropout = dropout + self.attn_dropout = nn.Dropout(dropout) + + # talking heads + + assert not (flash and talking_heads), 'talking heads not compatible with flash attention' + + self.talking_heads = talking_heads + if talking_heads: + self.pre_softmax_talking_heads = nn.Conv2d(heads, heads, 1, bias = False) + self.post_softmax_talking_heads = nn.Conv2d(heads, heads, 1, bias = False) + + # flash attention + self.flash = flash + assert not (flash and version.parse(torch.__version__) < version.parse('2.0.0')), 'in order to use flash attention, you must be using pytorch 2.0 or above' + + # determine efficient attention configs for cuda and cpu + self.cpu_config = EfficientAttentionConfig(True, True, True) + self.cuda_config = None + + if not 
torch.cuda.is_available() or not flash: + return + + device_properties = torch.cuda.get_device_properties(torch.device('cuda')) + + if device_properties.major == 8 and device_properties.minor == 0: + print_once('A100 GPU detected, using flash attention if input tensor is on cuda') + self.cuda_config = EfficientAttentionConfig(True, False, False) + else: + print_once('Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda') + self.cuda_config = EfficientAttentionConfig(False, True, True) + + def flash_attn( + self, + q, k, v, + mask = None, + attn_bias = None + ): + batch, heads, q_len, _, k_len, is_cuda, device = *q.shape, k.shape[-2], q.is_cuda, q.device + + # Recommended for multi-query single-key-value attention by Tri Dao + # kv shape torch.Size([1, 512, 64]) -> torch.Size([1, 8, 512, 64]) + + if k.ndim == 3: + k = rearrange(k, 'b ... -> b 1 ...').expand_as(q) + + if v.ndim == 3: + v = rearrange(v, 'b ... -> b 1 ...').expand_as(q) + + # handle scale - by default they scale by dim_head ** -0.5, but need to take care if using cosine sim attention + + if self.qk_norm: + default_scale = q.shape[-1] ** -0.5 + q = q * (default_scale / self.scale) + + # Check if mask exists and expand to compatible shape + # The mask is B L, so it would have to be expanded to B H N L + + causal = self.causal + + if exists(mask): + assert mask.ndim == 4 + mask = mask.expand(batch, heads, q_len, k_len) + + # manually handle causal mask, if another mask was given + + if causal: + causal_mask = torch.ones((q_len, k_len), dtype = torch.bool, device = device).triu(k_len - q_len + 1) + mask = mask | causal_mask + causal = False + + # handle alibi positional bias + # convert from bool to float + + if exists(attn_bias): + attn_bias = rearrange(attn_bias, 'h i j -> 1 h i j').expand(batch, -1, -1, -1) + + # if mask given, the mask would already contain the causal mask from above logic + # otherwise, if no mask given but still causal, mask out alibi positional bias to a large negative number + + mask_value = -torch.finfo(q.dtype).max + + if exists(mask): + attn_bias = attn_bias.masked_fill(mask, mask_value // 2) + elif causal: + causal_mask = torch.ones((q_len, k_len), dtype = torch.bool, device = device).triu(k_len - q_len + 1) + attn_bias = attn_bias.masked_fill(causal_mask, mask_value // 2) + causal = False + + # scaled_dot_product_attention handles attn_mask either as bool or additive bias + # make it an additive bias here + + mask = attn_bias + + # Check if there is a compatible device for flash attention + + config = self.cuda_config if is_cuda else self.cpu_config + + # pytorch 2.0 flash attn: q, k, v, mask, dropout, causal, softmax_scale + + with torch.backends.cuda.sdp_kernel(**config._asdict()): + out = F.scaled_dot_product_attention( + q, k, v, + attn_mask = mask, + dropout_p = self.dropout if self.training else 0., + is_causal = causal + ) + + return out, Intermediates() + + def forward( + self, + q, k, v, + mask = None, + attn_bias = None, + prev_attn = None + ): + """ + einstein notation + b - batch + h - heads + n, i, j - sequence length (base sequence length, source, target) + d - feature dimension + """ + + n, device = q.shape[-2], q.device + + scale = default(self.scale, q.shape[-1] ** -0.5) + + if self.flash: + assert not exists(prev_attn), 'residual attention not compatible with flash attention' + return self.flash_attn(q, k, v, mask = mask, attn_bias = attn_bias) + # return FlashAttention(q, k, v, mask=mask, attn_bias=attn_bias ) + + if self.triton: + return 
attention(q, k, v, self.casual, scale) + + kv_einsum_eq = 'b j d' if k.ndim == 3 else 'b h j d' + + dots = einsum(f'b h i d, {kv_einsum_eq} -> b h i j', q, k) * scale + + if exists(prev_attn): + dots = dots + prev_attn + + qk_similarities = dots.clone() + + if self.talking_heads: + dots = self.pre_softmax_talking_heads(dots) + + if exists(attn_bias): + dots = dots + attn_bias + + dtype = dots.dtype + pre_softmax_attn = dots.clone() + + mask_value = -torch.finfo(dots.dtype).max + + if exists(mask): + dots = dots.masked_fill(mask, mask_value) + + if self.causal: + i, j = dots.shape[-2:] + causal_mask = torch.ones((i, j), dtype = torch.bool, device = device).triu(j - i + 1) + dots = dots.masked_fill(causal_mask, mask_value) + + attn = self.attn_fn(dots, dim = -1) + attn = attn.type(dtype) + + post_softmax_attn = attn.clone() + + attn = self.attn_dropout(attn) + + if self.talking_heads: + attn = self.post_softmax_talking_heads(attn) + + out = einsum(f'b h i j, {kv_einsum_eq} -> b h i d', attn, v) + + intermediates = Intermediates( + qk_similarities = qk_similarities, + pre_softmax_attn = pre_softmax_attn, + post_softmax_attn = post_softmax_attn + ) + + return out, intermediates diff --git a/Andromeda/core/autoregressive_wrapper.py b/Andromeda/core/autoregressive_wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..7202c894561c51a35c828e64a9215905ca2843f3 --- /dev/null +++ b/Andromeda/core/autoregressive_wrapper.py @@ -0,0 +1,150 @@ +from math import ceil +import torch +from torch import nn +import torch.nn.functional as F + +from einops import rearrange, pack, unpack + +def exists(val): + return val is not None + +def eval_decorator(fn): + def inner(self, *args, **kwargs): + was_training = self.training + self.eval() + out = fn(self, *args, **kwargs) + self.train(was_training) + return out + return inner + +# nucleus + +def top_p(logits, thres = 0.9): + sorted_logits, sorted_indices = torch.sort(logits, descending=True) + cum_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1) + + sorted_indices_to_remove = cum_probs > (1 - thres) + sorted_indices_to_remove[:, 1:] = sorted_indices_to_remove[:, :-1].clone() + sorted_indices_to_remove[:, 0] = 0 + + sorted_logits[sorted_indices_to_remove] = float('-inf') + return sorted_logits.scatter(1, sorted_indices, sorted_logits) + +# topk + +def top_k(logits, thres = 0.9): + k = ceil((1 - thres) * logits.shape[-1]) + val, ind = torch.topk(logits, k) + probs = torch.full_like(logits, float('-inf')) + probs.scatter_(1, ind, val) + return probs + +# top_a + +def top_a(logits, min_p_pow=2.0, min_p_ratio=0.02): + probs = F.softmax(logits, dim=-1) + limit = torch.pow(torch.max(probs), min_p_pow) * min_p_ratio + logits[probs < limit] = float('-inf') + logits[probs >= limit] = 1 + return logits + +# autoregressive wrapper class + +class AutoregressiveWrapper(nn.Module): + def __init__( + self, + net, + ignore_index = -100, + pad_value = 0, + mask_prob = 0. + ): + super().__init__() + self.pad_value = pad_value + self.ignore_index = ignore_index + + self.net = net + self.max_seq_len = net.max_seq_len + + # paper shows masking (MLM) in conjunction with autoregressive decoder-only training leads to big improvements https://arxiv.org/abs/2210.13432 + assert mask_prob < 1. 
+ self.mask_prob = mask_prob + + @torch.no_grad() + @eval_decorator + def generate( + self, + start_tokens, + seq_len, + eos_token = None, + temperature = 1., + filter_logits_fn = top_k, + filter_thres = 0.9, + min_p_pow = 2.0, + min_p_ratio = 0.02, + **kwargs + ): + + start_tokens, ps = pack([start_tokens], '* n') + + b, t = start_tokens.shape + + out = start_tokens + + for _ in range(seq_len): + x = out[:, -self.max_seq_len:] + + logits = self.net(x, **kwargs)[:, -1] + + if filter_logits_fn in {top_k, top_p}: + filtered_logits = filter_logits_fn(logits, thres = filter_thres) + probs = F.softmax(filtered_logits / temperature, dim=-1) + + elif filter_logits_fn is top_a: + filtered_logits = filter_logits_fn(logits, min_p_pow = min_p_pow, min_p_ratio= min_p_ratio) + probs = F.softmax(filtered_logits / temperature, dim=-1) + + sample = torch.multinomial(probs, 1) + + out = torch.cat((out, sample), dim=-1) + + if exists(eos_token): + is_eos_tokens = (out == eos_token) + + if is_eos_tokens.any(dim = -1).all(): + # mask out everything after the eos tokens + shifted_is_eos_tokens = F.pad(is_eos_tokens, (1, -1)) + mask = shifted_is_eos_tokens.float().cumsum(dim = -1) >= 1 + out = out.masked_fill(mask, self.pad_value) + break + + out = out[:, t:] + + out, = unpack(out, ps, '* n') + + return out + + def forward(self, x, return_loss=True, **kwargs): + seq, ignore_index = x.shape[1], self.ignore_index + + inp, target = x[:, :-1], x[:, 1:] + + if self.mask_prob > 0.: + rand = torch.randn(inp.shape, device = x.device) + rand[:, 0] = -torch.finfo(rand.dtype).max # first token should not be masked out + num_mask = min(int(seq * self.mask_prob), seq - 1) + indices = rand.topk(num_mask, dim = -1).indices + mask = ~torch.zeros_like(inp).scatter(1, indices, 1.).bool() + kwargs.update(self_attn_context_mask = mask) + + logits = self.net(inp, **kwargs) + + loss = F.cross_entropy( + rearrange(logits, 'b n c -> b c n'), + target, + ignore_index = ignore_index + ) + + if return_loss: + return logits, loss + + return logits diff --git a/Andromeda/core/flash.py b/Andromeda/core/flash.py new file mode 100644 index 0000000000000000000000000000000000000000..e5582a6c84f38b668ab4607420182da63cd77602 --- /dev/null +++ b/Andromeda/core/flash.py @@ -0,0 +1,289 @@ +import torch + +import triton +import triton.language as tl + + +@triton.jit +def max_fn(x, y): + return tl.math.max(x, y) + + +@triton.jit +def _fwd_kernel( + Q, K, V, sm_scale, + L, + Out, + stride_qz, stride_qh, stride_qm, stride_qk, + stride_kz, stride_kh, stride_kn, stride_kk, + stride_vz, stride_vh, stride_vk, stride_vn, + stride_oz, stride_oh, stride_om, stride_on, + Z, H, N_CTX, + BLOCK_M: tl.constexpr, BLOCK_DMODEL: tl.constexpr, + BLOCK_N: tl.constexpr, + IS_CAUSAL: tl.constexpr, +): + start_m = tl.program_id(0) + off_hz = tl.program_id(1) + qvk_offset = off_hz * stride_qh + Q_block_ptr = tl.make_block_ptr( + base=Q + qvk_offset, + shape=(N_CTX, BLOCK_DMODEL), + strides=(stride_qm, stride_qk), + offsets=(start_m * BLOCK_M, 0), + block_shape=(BLOCK_M, BLOCK_DMODEL), + order=(1, 0) + ) + K_block_ptr = tl.make_block_ptr( + base=K + qvk_offset, + shape=(BLOCK_DMODEL, N_CTX), + strides=(stride_kk, stride_kn), + offsets=(0, 0), + block_shape=(BLOCK_DMODEL, BLOCK_N), + order=(0, 1) + ) + V_block_ptr = tl.make_block_ptr( + base=V + qvk_offset, + shape=(N_CTX, BLOCK_DMODEL), + strides=(stride_vk, stride_vn), + offsets=(0, 0), + block_shape=(BLOCK_N, BLOCK_DMODEL), + order=(1, 0) + ) + # initialize offsets + offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) + 
offs_n = tl.arange(0, BLOCK_N) + # initialize pointer to m and l + m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") + l_i = tl.zeros([BLOCK_M], dtype=tl.float32) + acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) + # scale sm_scale by log_2(e) and use + # 2^x instead of exp in the loop because CSE and LICM + # don't work as expected with `exp` in the loop + qk_scale = sm_scale * 1.44269504 + # load q: it will stay in SRAM throughout + q = tl.load(Q_block_ptr) + q = (q * qk_scale).to(tl.float16) + # loop over k, v and update accumulator + lo = 0 + hi = (start_m + 1) * BLOCK_M if IS_CAUSAL else N_CTX + for start_n in range(lo, hi, BLOCK_N): + # -- load k, v -- + k = tl.load(K_block_ptr) + v = tl.load(V_block_ptr) + # -- compute qk --- + qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) + if IS_CAUSAL: + qk = tl.where(offs_m[:, None] >= (start_n + offs_n[None, :]), qk, float("-inf")) + qk += tl.dot(q, k) + # -- compute scaling constant --- + m_i_new = tl.maximum(m_i, tl.max(qk, 1)) + alpha = tl.math.exp2(m_i - m_i_new) + p = tl.math.exp2(qk - m_i_new[:, None]) + # -- scale and update acc -- + acc_scale = l_i * 0 + alpha # workaround some compiler bug + acc *= acc_scale[:, None] + acc += tl.dot(p.to(tl.float16), v) + # -- update m_i and l_i -- + l_i = l_i * alpha + tl.sum(p, 1) + m_i = m_i_new + # update pointers + K_block_ptr = tl.advance(K_block_ptr, (0, BLOCK_N)) + V_block_ptr = tl.advance(V_block_ptr, (BLOCK_N, 0)) + # write back l and m + acc = acc / l_i[:, None] + l_ptrs = L + off_hz * N_CTX + offs_m + tl.store(l_ptrs, m_i + tl.math.log2(l_i)) + # write back O + O_block_ptr = tl.make_block_ptr( + base=Out + qvk_offset, + shape=(N_CTX, BLOCK_DMODEL), + strides=(stride_om, stride_on), + offsets=(start_m * BLOCK_M, 0), + block_shape=(BLOCK_M, BLOCK_DMODEL), + order=(1, 0) + ) + tl.store(O_block_ptr, acc.to(tl.float16)) + + +@triton.jit +def _bwd_preprocess( + Out, DO, + Delta, + BLOCK_M: tl.constexpr, D_HEAD: tl.constexpr, +): + off_m = tl.program_id(0) * BLOCK_M + tl.arange(0, BLOCK_M) + off_n = tl.arange(0, D_HEAD) + # load + o = tl.load(Out + off_m[:, None] * D_HEAD + off_n[None, :]).to(tl.float32) + do = tl.load(DO + off_m[:, None] * D_HEAD + off_n[None, :]).to(tl.float32) + # compute + delta = tl.sum(o * do, axis=1) + # write-back + tl.store(Delta + off_m, delta) + + +@triton.jit +def _bwd_kernel( + Q, K, V, sm_scale, Out, DO, + DQ, DK, DV, + L, + D, + stride_qz, stride_qh, stride_qm, stride_qk, + stride_kz, stride_kh, stride_kn, stride_kk, + stride_vz, stride_vh, stride_vk, stride_vn, + Z, H, N_CTX, + num_block, + BLOCK_M: tl.constexpr, BLOCK_DMODEL: tl.constexpr, + BLOCK_N: tl.constexpr, + CAUSAL: tl.constexpr, +): + off_hz = tl.program_id(0) + off_z = off_hz // H + off_h = off_hz % H + qk_scale = sm_scale * 1.44269504 + # offset pointers for batch/head + Q += off_z * stride_qz + off_h * stride_qh + K += off_z * stride_qz + off_h * stride_qh + V += off_z * stride_qz + off_h * stride_qh + DO += off_z * stride_qz + off_h * stride_qh + DQ += off_z * stride_qz + off_h * stride_qh + DK += off_z * stride_qz + off_h * stride_qh + DV += off_z * stride_qz + off_h * stride_qh + for start_n in range(0, num_block): + if CAUSAL: + lo = start_n * BLOCK_M + else: + lo = 0 + # initialize row/col offsets + offs_qm = lo + tl.arange(0, BLOCK_M) + offs_n = start_n * BLOCK_M + tl.arange(0, BLOCK_M) + offs_m = tl.arange(0, BLOCK_N) + offs_k = tl.arange(0, BLOCK_DMODEL) + # initialize pointers to value-like data + q_ptrs = Q + (offs_qm[:, None] * stride_qm + offs_k[None, :] * 
stride_qk) + k_ptrs = K + (offs_n[:, None] * stride_kn + offs_k[None, :] * stride_kk) + v_ptrs = V + (offs_n[:, None] * stride_qm + offs_k[None, :] * stride_qk) + do_ptrs = DO + (offs_qm[:, None] * stride_qm + offs_k[None, :] * stride_qk) + dq_ptrs = DQ + (offs_qm[:, None] * stride_qm + offs_k[None, :] * stride_qk) + # pointer to row-wise quantities in value-like data + D_ptrs = D + off_hz * N_CTX + l_ptrs = L + off_hz * N_CTX + # initialize dv amd dk + dv = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) + dk = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) + # k and v stay in SRAM throughout + k = tl.load(k_ptrs) + v = tl.load(v_ptrs) + # loop over rows + for start_m in range(lo, num_block * BLOCK_M, BLOCK_M): + offs_m_curr = start_m + offs_m + # load q, k, v, do on-chip + q = tl.load(q_ptrs) + # recompute p = softmax(qk, dim=-1).T + if CAUSAL: + qk = tl.where(offs_m_curr[:, None] >= (offs_n[None, :]), float(0.), float("-inf")) + else: + qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) + qk += tl.dot(q, tl.trans(k)) + qk *= qk_scale + l_i = tl.load(l_ptrs + offs_m_curr) + p = tl.math.exp2(qk - l_i[:, None]) + # compute dv + do = tl.load(do_ptrs) + dv += tl.dot(tl.trans(p.to(Q.dtype.element_ty)), do) + # compute dp = dot(v, do) + Di = tl.load(D_ptrs + offs_m_curr) + dp = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) - Di[:, None] + dp += tl.dot(do, tl.trans(v)) + # compute ds = p * (dp - delta[:, None]) + ds = p * dp * sm_scale + # compute dk = dot(ds.T, q) + dk += tl.dot(tl.trans(ds.to(Q.dtype.element_ty)), q) + # compute dq + dq = tl.load(dq_ptrs) + dq += tl.dot(ds.to(Q.dtype.element_ty), k) + tl.store(dq_ptrs, dq) + # increment pointers + dq_ptrs += BLOCK_M * stride_qm + q_ptrs += BLOCK_M * stride_qm + do_ptrs += BLOCK_M * stride_qm + # write-back + dv_ptrs = DV + (offs_n[:, None] * stride_qm + offs_k[None, :] * stride_qk) + dk_ptrs = DK + (offs_n[:, None] * stride_kn + offs_k[None, :] * stride_kk) + tl.store(dv_ptrs, dv) + tl.store(dk_ptrs, dk) + + +empty = torch.empty(128, device="cuda") + + +class _attention(torch.autograd.Function): + + @staticmethod + def forward(ctx, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, causal, sm_scale): + # shape constraints + Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1] + assert Lq == Lk and Lk == Lv + assert Lk in {16, 32, 64, 128} + o = torch.empty_like(q) + BLOCK_M = 128 + BLOCK_N = 64 + grid = (triton.cdiv(q.shape[2], BLOCK_M), q.shape[0] * q.shape[1], 1) + L = torch.empty((q.shape[0] * q.shape[1], q.shape[2]), device=q.device, dtype=torch.float32) + + num_warps = 4 if Lk <= 64 else 8 + _fwd_kernel[grid]( + q, k, v, sm_scale, + L, + o, + q.stride(0), q.stride(1), q.stride(2), q.stride(3), + k.stride(0), k.stride(1), k.stride(2), k.stride(3), + v.stride(0), v.stride(1), v.stride(2), v.stride(3), + o.stride(0), o.stride(1), o.stride(2), o.stride(3), + q.shape[0], q.shape[1], q.shape[2], + BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N, BLOCK_DMODEL=Lk, + IS_CAUSAL=causal, + num_warps=num_warps, + num_stages=4) + + ctx.save_for_backward(q, k, v, o, L) + ctx.grid = grid + ctx.sm_scale = sm_scale + ctx.BLOCK_DMODEL = Lk + ctx.causal = causal + return o + + @staticmethod + def backward(ctx, do): + BLOCK = 128 + q, k, v, o, L = ctx.saved_tensors + do = do.contiguous() + dq = torch.zeros_like(q, dtype=torch.float32) + dk = torch.empty_like(k) + dv = torch.empty_like(v) + delta = torch.empty_like(L) + _bwd_preprocess[(ctx.grid[0] * ctx.grid[1], )]( + o, do, + delta, + BLOCK_M=BLOCK, D_HEAD=ctx.BLOCK_DMODEL, + ) + _bwd_kernel[(ctx.grid[1],)]( + q, 
k, v, ctx.sm_scale, + o, do, + dq, dk, dv, + L, delta, + q.stride(0), q.stride(1), q.stride(2), q.stride(3), + k.stride(0), k.stride(1), k.stride(2), k.stride(3), + v.stride(0), v.stride(1), v.stride(2), v.stride(3), + q.shape[0], q.shape[1], q.shape[2], + ctx.grid[0], + BLOCK_M=BLOCK, BLOCK_N=BLOCK, + BLOCK_DMODEL=ctx.BLOCK_DMODEL, num_warps=8, + CAUSAL=ctx.causal, + num_stages=1, + ) + return dq, dk, dv, None, None + + +attention = _attention.apply diff --git a/Andromeda/core/transformer.py b/Andromeda/core/transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..697240780a3a7f4af2eee286edf6b37e4d1e7f17 --- /dev/null +++ b/Andromeda/core/transformer.py @@ -0,0 +1,1376 @@ +import math +from random import random + +import torch +from torch import nn, einsum, Tensor +import torch.nn.functional as F + +from functools import partial, wraps +from inspect import isfunction +from dataclasses import dataclass +from typing import List + +from einops import rearrange, repeat + +from Andromeda.core.attend import Attend, Intermediates +from Andromeda.core.autoregressive_wrapper import AutoregressiveWrapper + +from abc import ABC, abstractmethod +# import bitsandbytes as bnb + +# constants + +DEFAULT_DIM_HEAD = 64 + +@dataclass +class LayerIntermediates: + hiddens: List[Tensor] = None + attn_intermediates: List[Intermediates] = None + +# helpers + +def exists(val): + return val is not None + +def default(val, d): + if exists(val): + return val + return d() if isfunction(d) else d + +def cast_tuple(val, depth): + return val if isinstance(val, tuple) else (val,) * depth + +def maybe(fn): + @wraps(fn) + def inner(x, *args, **kwargs): + if not exists(x): + return x + return fn(x, *args, **kwargs) + return inner + +class always(): + def __init__(self, val): + self.val = val + def __call__(self, *args, **kwargs): + return self.val + +class not_equals(): + def __init__(self, val): + self.val = val + def __call__(self, x, *args, **kwargs): + return x != self.val + +class equals(): + def __init__(self, val): + self.val = val + def __call__(self, x, *args, **kwargs): + return x == self.val + +# tensor helpers + +def max_neg_value(tensor): + return -torch.finfo(tensor.dtype).max + +def l2norm(t, groups = 1): + t = rearrange(t, '... (g d) -> ... g d', g = groups) + t = F.normalize(t, p = 2, dim = -1) + return rearrange(t, '... g d -> ... (g d)') + +def pad_at_dim(t, pad, dim = -1, value = 0.): + dims_from_right = (- dim - 1) if dim < 0 else (t.ndim - dim - 1) + zeros = ((0, 0) * dims_from_right) + return F.pad(t, (*zeros, *pad), value = value) + +def or_reduce(masks): + head, *body = masks + for rest in body: + head = head | rest + return head + +# init helpers + +def init_zero_(layer): + nn.init.constant_(layer.weight, 0.) + if exists(layer.bias): + nn.init.constant_(layer.bias, 0.) 
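+
+# illustrative sketch (not part of the original file): pad_at_dim counts negative
+# dims from the right, so padding dim -2 of a (2, 3, 4) tensor by (1, 1) yields
+# shape (2, 5, 4):
+#   t = torch.zeros(2, 3, 4)
+#   pad_at_dim(t, (1, 1), dim = -2).shape   # torch.Size([2, 5, 4])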
+ +# keyword argument helpers + +def pick_and_pop(keys, d): + values = list(map(lambda key: d.pop(key), keys)) + return dict(zip(keys, values)) + +def group_dict_by_key(cond, d): + return_val = [dict(),dict()] + for key in d.keys(): + match = bool(cond(key)) + ind = int(not match) + return_val[ind][key] = d[key] + return (*return_val,) + +def string_begins_with(prefix, str): + return str.startswith(prefix) + +def group_by_key_prefix(prefix, d): + return group_dict_by_key(partial(string_begins_with, prefix), d) + +def groupby_prefix_and_trim(prefix, d): + kwargs_with_prefix, kwargs = group_dict_by_key(partial(string_begins_with, prefix), d) + kwargs_without_prefix = dict(map(lambda x: (x[0][len(prefix):], x[1]), tuple(kwargs_with_prefix.items()))) + return kwargs_without_prefix, kwargs + +# initializations + +def deepnorm_init( + transformer, + beta, + module_name_match_list = ['.ff.', '.to_v', '.to_out'] +): + for name, module in transformer.named_modules(): + if type(module) != nn.Linear: + continue + + needs_beta_gain = any(map(lambda substr: substr in name, module_name_match_list)) + gain = beta if needs_beta_gain else 1 + nn.init.xavier_normal_(module.weight.data, gain = gain) + + if exists(module.bias): + nn.init.constant_(module.bias.data, 0) + +# structured dropout, more effective than traditional attention dropouts + +def dropout_seq(seq, mask, dropout): + b, n, *_, device = *seq.shape, seq.device + logits = torch.randn(b, n, device = device) + + if exists(mask): + mask_value = max_neg_value(logits) + logits = logits.masked_fill(~mask, mask_value) + + keep_prob = 1. - dropout + num_keep = max(1, int(keep_prob * n)) + keep_indices = logits.topk(num_keep, dim = 1).indices + + batch_indices = torch.arange(b, device = device) + batch_indices = rearrange(batch_indices, 'b -> b 1') + + seq = seq[batch_indices, keep_indices] + + if exists(mask): + seq_counts = mask.sum(dim = -1) + seq_keep_counts = torch.ceil(seq_counts * keep_prob).int() + keep_mask = torch.arange(num_keep, device = device) < rearrange(seq_keep_counts, 'b -> b 1') + + mask = mask[batch_indices, keep_indices] & keep_mask + + return seq, mask + +# activations + +class ReluSquared(nn.Module): + def forward(self, x): + return F.relu(x) ** 2 + + +#tokenization +class BaseTokenizer(ABC): + @abstractmethod + def tokenize(self, text: str) -> List[int]: + pass + +class CustomTokenizer(BaseTokenizer): + def tokenize(self, text: str) -> List[int]: + # Your custom tokenization algorithm + tokens = ... + return tokens + +# embedding + +class BaseEmbedding(ABC): + @abstractmethod + def get_embedding(self, num_tokens: int, dim: int) -> nn.Module: + # Custom embedding function or model + embedding = ... 
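+        # concrete providers (e.g. AndromedaEmbedding below) build and return an
+        # nn.Module mapping token ids to dim-sized vectors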
+ + return embedding + +class AndromedaEmbedding(BaseEmbedding): + def get_embedding(self, num_tokens: int, dim: int) -> nn.Module: + embedding = nn.Embedding(num_tokens, dim) + + return embedding + +# class AndromedaBnBEmbedding(BaseEmbedding): +# def get_embedding(self, num_tokens: int, dim: int, padding_idx: int = 0) -> bnb.nn.modules: +# embedding = bnb.nn.modules.Embedding(num_tokens, dim, padding_idx) + +# return embedding + +class TokenEmbedding(nn.Module): + def __init__(self, dim, num_tokens, embedding_provider: BaseEmbedding, l2norm_embed = False): + super().__init__() + self.l2norm_embed = l2norm_embed + self.emb = embedding_provider.get_embedding(num_tokens, dim) + # nn.Embedding(num_tokens, dim) + + def forward(self, x): + token_emb = self.emb(x) + return l2norm(token_emb) if self.l2norm_embed else token_emb + +# positional embeddings + +class AbsolutePositionalEmbedding(nn.Module): + def __init__(self, dim, max_seq_len, l2norm_embed = False): + super().__init__() + self.scale = dim ** -0.5 if not l2norm_embed else 1. + self.max_seq_len = max_seq_len + self.l2norm_embed = l2norm_embed + self.emb = nn.Embedding(max_seq_len, dim) + + def forward(self, x, pos = None): + seq_len, device = x.shape[1], x.device + assert seq_len <= self.max_seq_len, f'you are passing in a sequence length of {seq_len} but your absolute positional embedding has a max sequence length of {self.max_seq_len}' + + if not exists(pos): + pos = torch.arange(seq_len, device = device) + + pos_emb = self.emb(pos) + pos_emb = pos_emb * self.scale + return l2norm(pos_emb) if self.l2norm_embed else pos_emb + +class ScaledSinusoidalEmbedding(nn.Module): + def __init__(self, dim, theta = 10000): + super().__init__() + assert (dim % 2) == 0 + self.scale = nn.Parameter(torch.ones(1) * dim ** -0.5) + + half_dim = dim // 2 + freq_seq = torch.arange(half_dim).float() / half_dim + inv_freq = theta ** -freq_seq + self.register_buffer('inv_freq', inv_freq, persistent = False) + + def forward(self, x, pos = None): + seq_len, device = x.shape[1], x.device + + if not exists(pos): + pos = torch.arange(seq_len, device = device) + + emb = einsum('i, j -> i j', pos, self.inv_freq) + emb = torch.cat((emb.sin(), emb.cos()), dim = -1) + return emb * self.scale + +class RelativePositionBias(nn.Module): + def __init__(self, scale, causal = False, num_buckets = 32, max_distance = 128, heads = 8): + super().__init__() + self.scale = scale + self.causal = causal + self.num_buckets = num_buckets + self.max_distance = max_distance + self.relative_attention_bias = nn.Embedding(num_buckets, heads) + + @staticmethod + def _relative_position_bucket(relative_position, causal = True, num_buckets = 32, max_distance = 128): + ret = 0 + n = -relative_position + if not causal: + num_buckets //= 2 + ret += (n < 0).long() * num_buckets + n = torch.abs(n) + else: + n = torch.max(n, torch.zeros_like(n)) + + max_exact = num_buckets // 2 + is_small = n < max_exact + + val_if_large = max_exact + ( + torch.log(n.float() / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact) + ).long() + val_if_large = torch.min(val_if_large, torch.full_like(val_if_large, num_buckets - 1)) + + ret += torch.where(is_small, n, val_if_large) + return ret + + @property + def device(self): + return next(self.parameters()).device + + def forward(self, i, j): + device = self.device + q_pos = torch.arange(j - i, j, dtype = torch.long, device = device) + k_pos = torch.arange(j, dtype = torch.long, device = device) + rel_pos = k_pos[None, :] - q_pos[:, None] + 
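+        # rel_pos[q, k] = key position - query position; _relative_position_bucket maps
+        # these signed offsets into num_buckets bins (exact for small distances, log-spaced
+        # up to max_distance), which then index the learned per-head bias embedding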
rp_bucket = self._relative_position_bucket(rel_pos, causal = self.causal, num_buckets = self.num_buckets, max_distance = self.max_distance) + values = self.relative_attention_bias(rp_bucket) + bias = rearrange(values, 'i j h -> h i j') + return bias * self.scale + +class DynamicPositionBias(nn.Module): + def __init__(self, dim, *, heads, depth, log_distance = False, norm = False): + super().__init__() + assert depth >= 1, 'depth for dynamic position bias MLP must be greater or equal to 1' + self.log_distance = log_distance + + self.mlp = nn.ModuleList([]) + + self.mlp.append(nn.Sequential( + nn.Linear(1, dim), + nn.LayerNorm(dim) if norm else nn.Identity(), + nn.SiLU() + )) + + for _ in range(depth - 1): + self.mlp.append(nn.Sequential( + nn.Linear(dim, dim), + nn.LayerNorm(dim) if norm else nn.Identity(), + nn.SiLU() + )) + + self.mlp.append(nn.Linear(dim, heads)) + + @property + def device(self): + return next(self.parameters()).device + + def forward(self, i, j): + assert i == j + n, device = j, self.device + + # get the (n x n) matrix of distances + seq_arange = torch.arange(n, device = device) + context_arange = torch.arange(n, device = device) + indices = rearrange(seq_arange, 'i -> i 1') - rearrange(context_arange, 'j -> 1 j') + indices += (n - 1) + + # input to continuous positions MLP + pos = torch.arange(-n + 1, n, device = device).float() + pos = rearrange(pos, '... -> ... 1') + + if self.log_distance: + pos = torch.sign(pos) * torch.log(pos.abs() + 1) # log of distance is sign(rel_pos) * log(abs(rel_pos) + 1) + + for layer in self.mlp: + pos = layer(pos) + + # get position biases + bias = pos[indices] + bias = rearrange(bias, 'i j h -> h i j') + return bias + +class AlibiPositionalBias(nn.Module): + def __init__(self, heads, total_heads, **kwargs): + super().__init__() + self.heads = heads + self.total_heads = total_heads + + slopes = Tensor(self._get_slopes(heads)) + slopes = rearrange(slopes, 'h -> h 1 1') + self.register_buffer('slopes', slopes, persistent = False) + self.register_buffer('bias', None, persistent = False) + + def get_bias(self, i, j, device): + i_arange = torch.arange(j - i, j, device = device) + j_arange = torch.arange(j, device = device) + bias = -torch.abs(rearrange(j_arange, 'j -> 1 1 j') - rearrange(i_arange, 'i -> 1 i 1')) + return bias + + @staticmethod + def _get_slopes(heads): + def get_slopes_power_of_2(n): + start = (2**(-2**-(math.log2(n)-3))) + ratio = start + return [start*ratio**i for i in range(n)] + + if math.log2(heads).is_integer(): + return get_slopes_power_of_2(heads) + + closest_power_of_2 = 2 ** math.floor(math.log2(heads)) + return get_slopes_power_of_2(closest_power_of_2) + get_slopes_power_of_2(2 * closest_power_of_2)[0::2][:heads-closest_power_of_2] + + @property + def device(self): + return next(self.buffers()).device + + def forward(self, i, j): + h, device = self.total_heads, self.device + + if exists(self.bias) and self.bias.shape[-1] >= j: + return self.bias[..., :i, :j] + + bias = self.get_bias(i, j, device) + bias = bias * self.slopes + + num_heads_unalibied = h - bias.shape[0] + bias = pad_at_dim(bias, (0, num_heads_unalibied), dim = 0) + self.register_buffer('bias', bias, persistent = False) + + return self.bias + +class LearnedAlibiPositionalBias(AlibiPositionalBias): + def __init__(self, heads, total_heads): + super().__init__(heads, total_heads) + log_slopes = torch.log(self.slopes) + self.learned_logslopes = nn.Parameter(log_slopes) + + def forward(self, i, j): + h, i, j, device = self.heads, self.device + + def 
get_slopes(param): + return pad_at_dim(param.exp(), (0, h - param.shape[0]), dim = -2) + + if exists(self.bias) and self.bias.shape[-1] >= j: + bias = self.bias[..., :i, :j] + else: + bias = self.get_bias(i, j, device) + self.register_buffer('bias', bias, persistent = False) + + slopes = get_slopes(self.learned_logslopes) + bias = bias * slopes + + return bias + +class RotaryEmbedding(nn.Module): + def __init__( + self, + dim, + use_xpos = False, + scale_base = 512, + interpolation_factor=1., + base=10000, + base_rescale_factor=1. + ): + super().__init__() + base *= base_rescale_factor ** (dim / (dim - 2)) + + inv_freq = 1. / (base ** (torch.arange(0, dim, 2).float() / dim)) + + self.register_buffer('inv_freq', inv_freq) + + if not use_xpos: + self.register_buffer('scale', None) + return + + scale = (torch.arange(0, dim, 2) + 0.4 * dim) / (1.4 * dim) + + self.scale_base = scale_base + self.register_buffer('scale', scale) + + def forward(self, seq_len, device): + t = torch.arange(seq_len, device = device).type_as(self.inv_freq) + freqs = torch.einsum('i , j -> i j', t, self.inv_freq) + freqs = torch.cat((freqs, freqs), dim = -1) + + if not exists(self.scale): + return freqs, 1. + + power = (torch.arange(seq_len, device = device) - (seq_len // 2)) / self.scale_base + scale = self.scale ** rearrange(power, 'n -> n 1') + scale = torch.cat((scale, scale), dim = -1) + + return freqs, scale + + +def rotate_half(x): + x = rearrange(x, '... (j d) -> ... j d', j = 2) + x1, x2 = x.unbind(dim = -2) + return torch.cat((-x2, x1), dim = -1) + +def apply_rotary_pos_emb(t, freqs, scale = 1): + seq_len = t.shape[-2] + freqs = freqs[-seq_len:, :] + return (t * freqs.cos() * scale) + (rotate_half(t) * freqs.sin() * scale) + +# norms + +class Scale(nn.Module): + def __init__(self, value, fn): + super().__init__() + self.value = value + self.fn = fn + + def forward(self, x, **kwargs): + out = self.fn(x, **kwargs) + def scale_fn(t): + return t * self.value + + if not isinstance(out, tuple): + return scale_fn(out) + + return (scale_fn(out[0]), *out[1:]) + +class ScaleNorm(nn.Module): + def __init__(self, dim, eps = 1e-5): + super().__init__() + self.eps = eps + self.g = nn.Parameter(torch.ones(1) * (dim ** -0.5)) + + def forward(self, x): + norm = torch.norm(x, dim = -1, keepdim = True) + return x / norm.clamp(min = self.eps) * self.g + +class RMSNorm(nn.Module): + def __init__(self, dim, eps = 1e-8): + super().__init__() + self.scale = dim ** -0.5 + self.eps = eps + self.g = nn.Parameter(torch.ones(dim)) + + def forward(self, x): + norm = torch.norm(x, dim = -1, keepdim = True) * self.scale + return x / norm.clamp(min = self.eps) * self.g + +# residual and residual gates + +class Residual(nn.Module): + def __init__(self, dim, scale_residual = False, scale_residual_constant = 1.): + super().__init__() + self.residual_scale = nn.Parameter(torch.ones(dim)) if scale_residual else None + self.scale_residual_constant = scale_residual_constant + + def forward(self, x, residual): + if exists(self.residual_scale): + residual = residual * self.residual_scale + + if self.scale_residual_constant != 1: + residual = residual * self.scale_residual_constant + + return x + residual + +class GRUGating(nn.Module): + def __init__(self, dim, scale_residual = False, **kwargs): + super().__init__() + self.gru = nn.GRUCell(dim, dim) + self.residual_scale = nn.Parameter(torch.ones(dim)) if scale_residual else None + + def forward(self, x, residual): + if exists(self.residual_scale): + residual = residual * self.residual_scale + + 
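+        # fuse the branch output with the residual through a GRU cell: x acts as the
+        # input and the residual as the previous hidden state, both flattened to
+        # (b * n, d) because nn.GRUCell expects 2-D inputs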
gated_output = self.gru( + rearrange(x, 'b n d -> (b n) d'), + rearrange(residual, 'b n d -> (b n) d') + ) + + return gated_output.reshape_as(x) + +# token shifting + +def shift(t, amount, mask = None): + if amount == 0: + return t + else: + amount = min(amount, t.shape[1]) + + if exists(mask): + t = t.masked_fill(~mask[..., None], 0.) + + return pad_at_dim(t, (amount, -amount), dim = - 2, value = 0.) + +class ShiftTokens(nn.Module): + def __init__(self, shifts, fn): + super().__init__() + self.fn = fn + self.shifts = tuple(shifts) + + def forward(self, x, **kwargs): + mask = kwargs.get('mask', None) + shifts = self.shifts + segments = len(shifts) + feats_per_shift = x.shape[-1] // segments + splitted = x.split(feats_per_shift, dim = -1) + segments_to_shift, rest = splitted[:segments], splitted[segments:] + segments_to_shift = list(map(lambda args: shift(*args, mask = mask), zip(segments_to_shift, shifts))) + x = torch.cat((*segments_to_shift, *rest), dim = -1) + return self.fn(x, **kwargs) + +# feedforward + +class GLU(nn.Module): + def __init__(self, dim_in, dim_out, activation): + super().__init__() + self.act = activation + self.proj = nn.Linear(dim_in, dim_out * 2) + + def forward(self, x): + x, gate = self.proj(x).chunk(2, dim = -1) + return x * self.act(gate) + +class FeedForward(nn.Module): + def __init__( + self, + dim, + dim_out = None, + mult = 4, + glu = False, + swish = False, + relu_squared = False, + post_act_ln = False, + dropout = 0., + no_bias = False, + zero_init_output = False + ): + super().__init__() + inner_dim = int(dim * mult) + dim_out = default(dim_out, dim) + + if relu_squared: + activation = ReluSquared() + elif swish: + activation = nn.SiLU() + else: + activation = nn.GELU() + + project_in = nn.Sequential( + nn.Linear(dim, inner_dim, bias = not no_bias), + activation + ) if not glu else GLU(dim, inner_dim, activation) + + self.ff = nn.Sequential( + project_in, + nn.LayerNorm(inner_dim) if post_act_ln else nn.Identity(), + nn.Dropout(dropout), + nn.Linear(inner_dim, dim_out, bias = not no_bias) + ) + + # init last linear layer to 0 + if zero_init_output: + init_zero_(self.ff[-1]) + + def forward(self, x): + return self.ff(x) + +# attention. 
it is all we need + +class Attention(nn.Module): + def __init__( + self, + dim, + dim_head = DEFAULT_DIM_HEAD, + heads = 8, + causal = False, + flash = False, + talking_heads = False, + head_scale = False, + sparse_topk = None, + num_mem_kv = 0, + dropout = 0., + on_attn = False, + gate_values = False, + zero_init_output = False, + max_attend_past = None, + qk_norm = False, + qk_norm_groups = 1, + qk_norm_scale = 10, + qk_norm_dim_scale = False, + one_kv_head = False, + shared_kv = False, + value_dim_head = None, + tensor_product = False # https://arxiv.org/abs/2208.06061 + ): + super().__init__() + self.scale = dim_head ** -0.5 + + self.heads = heads + self.causal = causal + self.max_attend_past = max_attend_past + + value_dim_head = default(value_dim_head, dim_head) + q_dim = k_dim = dim_head * heads + v_dim = out_dim = value_dim_head * heads + + self.one_kv_head = one_kv_head + if one_kv_head: + k_dim = dim_head + v_dim = value_dim_head + out_dim = v_dim * heads + + self.to_q = nn.Linear(dim, q_dim, bias = False) + self.to_k = nn.Linear(dim, k_dim, bias = False) + + # shared key / values, for further memory savings during inference + assert not (shared_kv and value_dim_head != dim_head), 'key and value head dimensions must be equal for shared key / values' + self.to_v = nn.Linear(dim, v_dim, bias = False) if not shared_kv else None + + # relations projection from tp-attention + self.to_r = nn.Linear(dim, v_dim, bias = False) if tensor_product else None + + # add GLU gating for aggregated values, from alphafold2 + self.to_v_gate = None + if gate_values: + self.to_v_gate = nn.Linear(dim, out_dim) + nn.init.constant_(self.to_v_gate.weight, 0) + nn.init.constant_(self.to_v_gate.bias, 1) + + # cosine sim attention + self.qk_norm = qk_norm + self.qk_norm_groups = qk_norm_groups + self.qk_norm_scale = qk_norm_scale + + # whether to use the rmsnorm (equivalent to cosine sim attention when scale is equal to 1) - https://arxiv.org/abs/2302.05442 + self.qk_norm_dim_scale = qk_norm_dim_scale + + self.qk_norm_q_scale = self.qk_norm_k_scale = 1 + if qk_norm and qk_norm_dim_scale: + self.qk_norm_q_scale = nn.Parameter(torch.ones(dim_head)) + self.qk_norm_k_scale = nn.Parameter(torch.ones(dim_head)) + + assert (not qk_norm) or (dim_head % qk_norm_groups) == 0, 'dimension per attention head must be divisible by the qk norm groups' + assert not (qk_norm and (dim_head // qk_norm_groups) <= 2), 'the group dimension may be too small (2 was too small in my tests, but 4 still works, surprisingly)' + + # attend class - includes core attention algorithm + talking heads + + self.attend = Attend( + heads = heads, + causal = causal, + talking_heads = talking_heads, + dropout = dropout, + qk_norm = qk_norm, + scale = qk_norm_scale if qk_norm else self.scale, + flash = flash + ) + + # head scaling + self.head_scale = head_scale + if head_scale: + self.head_scale_params = nn.Parameter(torch.ones(1, heads, 1, 1)) + + # explicit topk sparse attention + self.sparse_topk = sparse_topk + + # add memory key / values + self.num_mem_kv = num_mem_kv + if num_mem_kv > 0: + self.mem_k = nn.Parameter(torch.randn(heads, num_mem_kv, dim_head)) + self.mem_v = nn.Parameter(torch.randn(heads, num_mem_kv, dim_head)) + + # attention on attention + self.attn_on_attn = on_attn + self.to_out = nn.Sequential(nn.Linear(out_dim, dim * 2, bias = False), nn.GLU()) if on_attn else nn.Linear(out_dim, dim, bias = False) + + # init output projection 0 + if zero_init_output: + init_zero_(self.to_out) + + def forward( + self, + x, + context = None, 
+ mask = None, + context_mask = None, + attn_mask = None, + rel_pos = None, + rotary_pos_emb = None, + prev_attn = None, + mem = None + ): + b, n, _, h, head_scale, device, has_context = *x.shape, self.heads, self.head_scale, x.device, exists(context) + kv_input = default(context, x) + + q_input = x + k_input = kv_input + v_input = kv_input + r_input = x + + if exists(mem): + k_input = torch.cat((mem, k_input), dim = -2) + v_input = torch.cat((mem, v_input), dim = -2) + + q = self.to_q(q_input) + k = self.to_k(k_input) + v = self.to_v(v_input) if exists(self.to_v) else k + r = self.to_r(r_input) if exists(self.to_r) else None + + q = rearrange(q, 'b n (h d) -> b h n d', h = h) + + if not self.one_kv_head: + k, v, r = map(lambda t: maybe(rearrange)(t, 'b n (h d) -> b h n d', h = h), (k, v, r)) + + if self.qk_norm: + qk_l2norm = partial(l2norm, groups = self.qk_norm_groups) + q, k = map(qk_l2norm, (q, k)) + + q = q * self.qk_norm_q_scale + k = k * self.qk_norm_k_scale + + if exists(rotary_pos_emb) and not has_context: + freqs, xpos_scale = rotary_pos_emb + l = freqs.shape[-1] + + q_xpos_scale, k_xpos_scale = (xpos_scale, xpos_scale ** -1.) if exists(xpos_scale) else (1., 1.) + (ql, qr), (kl, kr), (vl, vr) = map(lambda t: (t[..., :l], t[..., l:]), (q, k, v)) + + ql, kl, vl = map(lambda arg: apply_rotary_pos_emb(arg[0], freqs, arg[1]), ((ql, q_xpos_scale), (kl, k_xpos_scale), (vl, k_xpos_scale))) + q, k, v = map(lambda t: torch.cat(t, dim = -1), ((ql, qr), (kl, kr), (vl, vr))) + + input_mask = default(context_mask, mask) + + if self.num_mem_kv > 0: + mem_k, mem_v = map(lambda t: repeat(t, 'h n d -> b h n d', b = b), (self.mem_k, self.mem_v)) + + if self.qk_norm: + mem_k = l2norm(mem_k) + mem_k = mem_k * self.qk_norm_k_scale + + k = torch.cat((mem_k, k), dim = -2) + v = torch.cat((mem_v, v), dim = -2) + + if exists(input_mask): + input_mask = pad_at_dim(input_mask, (self.num_mem_kv, 0), dim = -1, value = True) + + + i, j = map(lambda t: t.shape[-2], (q, k)) + + # determine masking + + max_neg_value(q) + masks = [] + final_attn_mask = None + + if exists(input_mask): + input_mask = rearrange(input_mask, 'b j -> b 1 1 j') + masks.append(~input_mask) + + if exists(attn_mask): + assert 2 <= attn_mask.ndim <= 4, 'attention mask must have greater than 2 dimensions but less than or equal to 4' + if attn_mask.ndim == 2: + attn_mask = rearrange(attn_mask, 'i j -> 1 1 i j') + elif attn_mask.ndim == 3: + attn_mask = rearrange(attn_mask, 'h i j -> 1 h i j') + masks.append(~attn_mask) + + if exists(self.max_attend_past): + range_q = torch.arange(j - i, j, device = device) + range_k = torch.arange(j, device = device) + dist = rearrange(range_q, 'i -> 1 1 i 1') - rearrange(range_k, 'j -> 1 1 1 j') + max_attend_past_mask = dist > self.max_attend_past + masks.append(max_attend_past_mask) + + if exists(self.sparse_topk) and self.sparse_topk < dots.shape[-1]: + top, _ = dots.topk(self.sparse_topk, dim = -1) + vk = rearrange(top[..., -1], '... -> ... 
1') + sparse_topk_mask = dots < vk + masks.append(sparse_topk_mask) + + if len(masks) > 0: + final_attn_mask = or_reduce(masks) + + # prepare relative positional bias, if needed + + attn_bias = None + if exists(rel_pos): + attn_bias = rel_pos(i, j) + + # attention is all we need + + out, intermediates = self.attend( + q, k, v, + mask = final_attn_mask, + attn_bias = attn_bias, + prev_attn = prev_attn + ) + + # https://arxiv.org/abs/2208.06061 proposes to add a residual for better gradients + + if exists(r): + out = out * r + out + + # normformer scaling of heads + + if head_scale: + out = out * self.head_scale_params + + # merge heads + + out = rearrange(out, 'b h n d -> b n (h d)') + + # alphafold2 styled gating of the values + + if exists(self.to_v_gate): + gates = self.to_v_gate(x) + out = out * gates.sigmoid() + + # combine the heads + + out = self.to_out(out) + + if exists(mask): + mask = rearrange(mask, 'b n -> b n 1') + out = out.masked_fill(~mask, 0.) + + return out, intermediates + +class AttentionLayers(nn.Module): + def __init__( + self, + dim, + depth, + heads = None, + causal = False, + cross_attend = False, + only_cross = False, + use_scalenorm = False, + use_rmsnorm = False, + alibi_pos_bias = False, + alibi_num_heads = None, + alibi_learned = False, + rel_pos_bias = False, + rel_pos_num_buckets = 32, + rel_pos_max_distance = 128, + dynamic_pos_bias = False, + dynamic_pos_bias_log_distance = False, + dynamic_pos_bias_mlp_depth = 2, + dynamic_pos_bias_norm = False, + rotary_pos_emb = False, + rotary_emb_dim = None, + rotary_xpos = False, + rotary_interpolation_factor=1., + rotary_xpos_scale_base = 512, + rotary_base_rescale_factor=1., + custom_layers = None, + sandwich_coef = None, + par_ratio = None, + residual_attn = False, + cross_residual_attn = False, + macaron = False, + pre_norm = True, + gate_residual = False, + scale_residual = False, + scale_residual_constant = 1., + deepnorm = False, + shift_tokens = 0, + sandwich_norm = False, + resi_dual = False, + zero_init_branch_output = False, + layer_dropout = 0., + cross_attn_tokens_dropout = 0., + **kwargs + ): + super().__init__() + rotary_pos_emb = rotary_pos_emb or rotary_xpos + + ff_kwargs, kwargs = groupby_prefix_and_trim('ff_', kwargs) + attn_kwargs, kwargs = groupby_prefix_and_trim('attn_', kwargs) + + dim_head = attn_kwargs.get('dim_head', DEFAULT_DIM_HEAD) + + self.dim = dim + self.depth = depth + self.layers = nn.ModuleList([]) + + self.has_pos_emb = rel_pos_bias or rotary_pos_emb + + rotary_emb_dim = max(default(rotary_emb_dim, dim_head // 2), 32) + + assert not (rotary_xpos and not causal), 'rotary xpos is not compatible with bidirectional attention' + self.rotary_pos_emb = RotaryEmbedding(rotary_emb_dim, use_xpos = rotary_xpos, scale_base = rotary_xpos_scale_base, interpolation_factor=rotary_interpolation_factor, base_rescale_factor=rotary_base_rescale_factor) if rotary_pos_emb else None + + assert not (alibi_pos_bias and rel_pos_bias), 'you can only choose Alibi positional bias or T5 relative positional bias, not both' + assert rel_pos_num_buckets <= rel_pos_max_distance, 'number of relative position buckets must be less than the relative position max distance' + + # relative positional bias + + flash_attn = attn_kwargs.get('flash', False) + assert (int(rel_pos_bias) + int(dynamic_pos_bias) + int(alibi_pos_bias)) <= 1, 'you can only choose up to one of t5, alibi, or dynamic positional bias' + + self.rel_pos = None + if rel_pos_bias: + assert not flash_attn, 'flash attention not compatible with t5 relative 
positional bias' + self.rel_pos = RelativePositionBias(scale = dim_head ** 0.5, causal = causal, heads = heads, num_buckets = rel_pos_num_buckets, max_distance = rel_pos_max_distance) + elif dynamic_pos_bias: + assert not flash_attn, 'flash attention not compatible with dynamic positional bias' + self.rel_pos = DynamicPositionBias(dim = dim // 4, heads = heads, log_distance = dynamic_pos_bias_log_distance, depth = dynamic_pos_bias_mlp_depth, norm = dynamic_pos_bias_norm) + elif alibi_pos_bias: + alibi_num_heads = default(alibi_num_heads, heads) + assert alibi_num_heads <= heads, 'number of ALiBi heads must be less than the total number of heads' + alibi_pos_klass = LearnedAlibiPositionalBias if alibi_learned else AlibiPositionalBias + self.rel_pos = alibi_pos_klass(heads = alibi_num_heads, total_heads = heads) + + # determine deepnorm and residual scale + + if deepnorm: + assert scale_residual_constant == 1, 'scale residual constant is being overridden by deep norm settings' + pre_norm = sandwich_norm = resi_dual = False + scale_residual = True + scale_residual_constant = (2 * depth) ** 0.25 + + assert (int(sandwich_norm) + int(resi_dual)) <= 1, 'either sandwich norm or resiDual is selected, but not both' + assert not (not pre_norm and sandwich_norm), 'sandwich norm cannot be used when not using prenorm' + assert not (not pre_norm and resi_dual), 'resiDualcannot be used when not using prenorm' + + self.pre_norm = pre_norm + self.sandwich_norm = sandwich_norm + self.resi_dual = resi_dual + + self.residual_attn = residual_attn + self.cross_residual_attn = cross_residual_attn + self.cross_attend = cross_attend + + norm_class = ScaleNorm if use_scalenorm else nn.LayerNorm + norm_class = RMSNorm if use_rmsnorm else norm_class + norm_fn = partial(norm_class, dim) + + if cross_attend and not only_cross: + default_block = ('a', 'c', 'f') + elif cross_attend and only_cross: + default_block = ('c', 'f') + else: + default_block = ('a', 'f') + + if macaron: + default_block = ('f',) + default_block + + # zero init + + if zero_init_branch_output: + attn_kwargs = {**attn_kwargs, 'zero_init_output': True} + ff_kwargs = {**ff_kwargs, 'zero_init_output': True} + + # calculate layer block order + + if exists(custom_layers): + layer_types = custom_layers + elif exists(par_ratio): + par_depth = depth * len(default_block) + assert 1 < par_ratio <= par_depth, 'par ratio out of range' + default_block = tuple(filter(not_equals('f'), default_block)) + par_attn = par_depth // par_ratio + depth_cut = par_depth * 2 // 3 # 2 / 3 attention layer cutoff suggested by PAR paper + par_width = (depth_cut + depth_cut // par_attn) // par_attn + assert len(default_block) <= par_width, 'default block is too large for par_ratio' + par_block = default_block + ('f',) * (par_width - len(default_block)) + par_head = par_block * par_attn + layer_types = par_head + ('f',) * (par_depth - len(par_head)) + elif exists(sandwich_coef): + assert sandwich_coef > 0 and sandwich_coef <= depth, 'sandwich coefficient should be less than the depth' + layer_types = ('a',) * sandwich_coef + default_block * (depth - sandwich_coef) + ('f',) * sandwich_coef + else: + layer_types = default_block * depth + + self.layer_types = layer_types + self.num_attn_layers = len(list(filter(equals('a'), layer_types))) + + # stochastic depth + + self.layer_dropouts = cast_tuple(layer_dropout, len(layer_types)) + + # structured dropout for cross attending + + self.cross_attn_tokens_dropout = cross_attn_tokens_dropout + + # calculate token shifting + + shift_tokens = 
cast_tuple(shift_tokens, len(layer_types)) + + # iterate and construct layers + + for ind, (layer_type, layer_shift_tokens) in enumerate(zip(self.layer_types, shift_tokens)): + is_last_layer = ind == (len(self.layer_types) - 1) + + if layer_type == 'a': + layer = Attention(dim, heads = heads, causal = causal, **attn_kwargs) + elif layer_type == 'c': + layer = Attention(dim, heads = heads, **attn_kwargs) + elif layer_type == 'f': + layer = FeedForward(dim, **ff_kwargs) + layer = layer if not macaron else Scale(0.5, layer) + else: + raise Exception(f'invalid layer type {layer_type}') + + if layer_shift_tokens > 0: + shift_range_upper = layer_shift_tokens + 1 + shift_range_lower = -layer_shift_tokens if not causal else 0 + layer = ShiftTokens(range(shift_range_lower, shift_range_upper), layer) + + residual_fn = GRUGating if gate_residual else Residual + residual = residual_fn(dim, scale_residual = scale_residual, scale_residual_constant = scale_residual_constant) + + pre_branch_norm = norm_fn() if pre_norm else None + post_branch_norm = norm_fn() if sandwich_norm else None + post_main_norm = norm_fn() if (resi_dual or not pre_norm) and not is_last_layer else None + + norms = nn.ModuleList([ + pre_branch_norm, + post_branch_norm, + post_main_norm + ]) + + self.layers.append(nn.ModuleList([ + norms, + layer, + residual + ])) + + self.layers_length = len(self.layers) # It doesn't work if called after + + if deepnorm: + init_gain = (8 * depth) ** -0.25 + deepnorm_init(self, init_gain) + + def forward( + self, + x, + context = None, + mask = None, + context_mask = None, + attn_mask = None, + self_attn_context_mask = None, + mems = None, + return_hiddens = False + ): + assert not (self.cross_attend ^ exists(context)), 'context must be passed in if cross_attend is set to True' + + hiddens = [] + intermediates = [] + prev_attn = None + prev_cross_attn = None + + mems = mems.copy() if exists(mems) else [None] * self.num_attn_layers + + rotary_pos_emb = None + if exists(self.rotary_pos_emb): + max_rotary_emb_length = max(list(map(lambda m: (m.shape[1] if exists(m) else 0) + x.shape[1], mems))) + rotary_pos_emb = self.rotary_pos_emb(max_rotary_emb_length, x.device) + + outer_residual = x + + for ind, (layer_type, (norm, block, residual_fn), layer_dropout) in enumerate(zip(self.layer_types, self.layers, self.layer_dropouts)): + ind == (self.layers_length - 1) + + if self.training and layer_dropout > 0. 
and random() < layer_dropout: + continue + + if layer_type == 'a': + if return_hiddens: + hiddens.append(x) + layer_mem = mems.pop(0) if mems else None + + if layer_type == 'c': + if self.training and self.cross_attn_tokens_dropout > 0.: + context, context_mask = dropout_seq(context, context_mask, self.cross_attn_tokens_dropout) + + inner_residual = x + + pre_norm, post_branch_norm, post_main_norm = norm + + if exists(pre_norm) and not self.resi_dual: + x = pre_norm(x) + + if layer_type == 'a': + out, inter = block(x, mask = mask, context_mask = self_attn_context_mask, attn_mask = attn_mask, rel_pos = self.rel_pos, rotary_pos_emb = rotary_pos_emb, prev_attn = prev_attn, mem = layer_mem) + elif layer_type == 'c': + out, inter = block(x, context = context, mask = mask, context_mask = context_mask, prev_attn = prev_cross_attn) + elif layer_type == 'f': + out = block(x) + + if self.resi_dual: + outer_residual = residual_fn(out, outer_residual) + + if exists(post_branch_norm): + out = post_branch_norm(out) + + x = residual_fn(out, inner_residual) + + if layer_type in ('a', 'c') and return_hiddens: + intermediates.append(inter) + + if layer_type == 'a' and self.residual_attn: + prev_attn = inter.pre_softmax_attn + elif layer_type == 'c' and self.cross_residual_attn: + prev_cross_attn = inter.pre_softmax_attn + + if exists(post_main_norm): + x = post_main_norm(x) + + if self.resi_dual: + x = x + pre_norm(outer_residual) + + if return_hiddens: + intermediates = LayerIntermediates( + hiddens = hiddens, + attn_intermediates = intermediates + ) + + return x, intermediates + + return x + + +class Decoder(AttentionLayers): + def __init__(self, **kwargs): + assert 'causal' not in kwargs, 'cannot set causality on decoder' + super().__init__(causal = True, **kwargs) + + + +class Transformer(nn.Module): + def __init__( + self, + *, + num_tokens, + max_seq_len, + attn_layers, + # tokenizer: BaseTokenizer, + embedding_provider: BaseEmbedding, + emb_dim = None, + max_mem_len = 0., + shift_mem_down = 0, + emb_dropout = 0., + post_emb_norm = False, + num_memory_tokens = None, + tie_embedding = False, + logits_dim = None, + use_abs_pos_emb = True, + scaled_sinu_pos_emb = False, + l2norm_embed = False, + emb_frac_gradient = 1. 
# GLM-130B and Cogview successfully used this, set at 0.1 + ): + super().__init__() + + assert isinstance(attn_layers, AttentionLayers), 'attention layers must be one of Encoder or Decoder' + + dim = attn_layers.dim + emb_dim = default(emb_dim, dim) + + self.emb_dim = emb_dim + self.num_tokens = num_tokens + self.max_seq_len = max_seq_len + self.max_mem_len = max_mem_len + self.shift_mem_down = shift_mem_down + + self.l2norm_embed = l2norm_embed + self.token_emb = TokenEmbedding(emb_dim, num_tokens, embedding_provider, l2norm_embed=l2norm_embed) + + if not (use_abs_pos_emb and not attn_layers.has_pos_emb): + self.pos_emb = always(0) + elif scaled_sinu_pos_emb: + self.pos_emb = ScaledSinusoidalEmbedding(emb_dim) + else: + self.pos_emb = AbsolutePositionalEmbedding(emb_dim, max_seq_len, l2norm_embed = l2norm_embed) + + self.emb_frac_gradient = emb_frac_gradient # fraction of the gradient that should go to the embedding, https://arxiv.org/abs/2105.13290 + + self.post_emb_norm = nn.LayerNorm(emb_dim) if post_emb_norm else nn.Identity() + self.emb_dropout = nn.Dropout(emb_dropout) + + self.project_emb = nn.Linear(emb_dim, dim) if emb_dim != dim else nn.Identity() + self.attn_layers = attn_layers + self.norm = nn.LayerNorm(dim) + + self.init_() + + logits_dim = default(logits_dim, num_tokens) + self.to_logits = nn.Linear(dim, logits_dim) if not tie_embedding else lambda t: t @ self.token_emb.weight.t() + + # memory tokens (like [cls]) from Memory Transformers paper + num_memory_tokens = default(num_memory_tokens, 0) + self.num_memory_tokens = num_memory_tokens + if num_memory_tokens > 0: + self.memory_tokens = nn.Parameter(torch.randn(num_memory_tokens, dim)) + + def init_(self): + if self.l2norm_embed: + nn.init.normal_(self.token_emb.emb.weight, std = 1e-5) + + if not isinstance(self.pos_emb, always): + nn.init.normal_(self.pos_emb.emb.weight, std = 1e-5) + + return + + nn.init.kaiming_normal_(self.token_emb.emb.weight) + + def forward( + self, + x, + return_embeddings = False, + return_logits_and_embeddings = False, + return_intermediates = False, + mask = None, + return_mems = False, + return_attn = False, + mems = None, + pos = None, + prepend_embeds = None, + sum_embeds = None, + **kwargs + ): + b, n, device, num_mem, emb_frac_gradient = *x.shape, x.device, self.num_memory_tokens, self.emb_frac_gradient + return_hiddens = return_mems | return_attn + + # absolute positional embedding + + external_pos_emb = exists(pos) and pos.dtype != torch.long + pos_emb = self.pos_emb(x, pos = pos) if not external_pos_emb else pos + x = self.token_emb(x) + pos_emb + + # for summing embeddings passed externally - needs this for self-conditioning in non-autoregressive training + + if exists(sum_embeds): + x = x + sum_embeds + + # post embedding norm, purportedly leads to greater stabilization + + x = self.post_emb_norm(x) + + # whether to append embeds, as in PaLI, for image embeddings + + if exists(prepend_embeds): + prepend_seq, prepend_dim = prepend_embeds.shape[1:] + + assert prepend_dim == x.shape[-1], 'prepended embeddings need to have same dimensions as text model dimensions' + + x = torch.cat((prepend_embeds, x), dim = -2) + + # whether to reduce the gradient going to the embedding, from cogview paper, corroborated by GLM-130B model + + if emb_frac_gradient < 1: + assert emb_frac_gradient > 0 + + x = x * emb_frac_gradient + x.detach() * (1 - emb_frac_gradient) + + # embedding dropout + + x = self.emb_dropout(x) + + x = self.project_emb(x) + + if num_mem > 0: + mem = repeat(self.memory_tokens, 'n d 
-> b n d', b = b) + x = torch.cat((mem, x), dim = 1) + + # auto-handle masking after appending memory tokens + if exists(mask): + mask = pad_at_dim(mask, (num_mem, 0), dim = -1, value = True) + + if self.shift_mem_down and exists(mems): + mems_l, mems_r = mems[:self.shift_mem_down], mems[self.shift_mem_down:] + mems = [*mems_r, *mems_l] + + if return_hiddens: + x, intermediates = self.attn_layers(x, mask = mask, mems = mems, return_hiddens = True, **kwargs) + else: + x = self.attn_layers(x, mask = mask, mems = mems, **kwargs) + + x = self.norm(x) + + mem, x = x[:, :num_mem], x[:, num_mem:] + + if return_logits_and_embeddings: + out = (self.to_logits(x), x) + elif return_embeddings: + out = x + else: + out = self.to_logits(x) + + if return_intermediates: + return out, intermediates + + if return_mems: + hiddens = intermediates.hiddens + new_mems = list(map(lambda pair: torch.cat(pair, dim = -2), zip(mems, hiddens))) if exists(mems) else hiddens + new_mems = list(map(lambda t: t[..., -self.max_mem_len:, :].detach(), new_mems)) + return out, new_mems + + if return_attn: + attn_maps = list(map(lambda t: t.post_softmax_attn, intermediates.attn_intermediates)) + return out, attn_maps + + return out diff --git a/Andromeda/dataset_prep/__init__.py b/Andromeda/dataset_prep/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Andromeda/dataset_prep/books.py b/Andromeda/dataset_prep/books.py new file mode 100644 index 0000000000000000000000000000000000000000..92bd3e340a5cb6a1954661d4aa6519f2c4978dbd --- /dev/null +++ b/Andromeda/dataset_prep/books.py @@ -0,0 +1,12 @@ +# from Andromeda.dataset_builder import DatasetBuilder +from build_dataset import DatasetBuilder + +builder = DatasetBuilder( + dataset_name="the_pile_books3", + seq_len=8192, + num_cpu=4, + hf_account_repo="kye/the_pile_books3_GPTNeox-8192", + tokenizer="EleutherAI/gpt-neox-20b", +) + +dataset = builder.build_dataset() diff --git a/Andromeda/inference.py b/Andromeda/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..e28581fafd023b80223b7c0590ec4019b0e7e01a --- /dev/null +++ b/Andromeda/inference.py @@ -0,0 +1,198 @@ +import torch +from transformers import AutoTokenizer +from einops._torch_specific import allow_ops_in_compiled_graph + +import argparse + +# class AndromedaEval: +# def __init__(self, path, seed=42, device=None): +# self.path = path +# self.seed = seed + +# self.device = device + +# if self.device is None: +# self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + +# set_seed(self.seed) + +# #tokenizer +# self.tokenizer = AndromedaTokenizer + +# #model +# self.model = Andromeda + +# #checkpoint +# self.model.load_state_dict(torch.load(self.path)) +# self.model.eval() + +# #device +# self.model = self.model.to(self.device) + +# #metrics +# self.metrics = {} +# self.reset_metrics() + +# def reset_metrics(self): +# self.metrics = { +# "generation_steps": None, +# "time_forward": [], +# "time_forward_average": None, + +# "memory_usages": [], +# "memory_usage_average": None, +# "time_end_to_end": None, + +# "throughput": None +# } + +# def get_num_params(self): +# num_params = sum(param.numel() for param in self.model.parameters() if param.requires_grad) + +# return num_params + +# def generate(self, prompt, generation_steps=32): +# #make sure all of the metrics reset at every generation +# self.reset_metrics() + +# self.metrics["generation_steps"] = generation_steps + +# tokens = 
self.tokenizer.encode(prompt) +# tokens_new = [] + +# time_end_to_end = time.time() + +# #generation loop +# for _ in range(generation_steps): +# tokens_tensor = torch.tensor([tokens], device=self.device) + +# #forward pass +# tracemalloc.start() + +# time_forward_0 = time.time() + +# logits = self.model(tokens_tensor, return_loss=False)[:, -1] # no loss takes the output of the last tokens + +# time_forward_1 = time.time() + +# _, memory_usage = tracemalloc.get_traced_memory() +# tracemalloc.stop() + +# self.metrics["memory_usages"].append(memory_usage) + +# time_forward = time_forward_1 - time_forward_0 +# self.metrics["times_forward"].append(time_forward) + +# next_token = torch.armax(logits).item() + +# #save the newly generated token +# tokens.append(next_token) +# tokens_new.append(next_token) + +# time_end_to_end_1 = time.time() + +# time_end_to_end = time_end_to_end_1 - time_end_to_end_0 +# self.metrics["time_end_to_end"] = time_end_to_end + +# decoded = self.tokenizer.decode(tokens) + +# self.metrics["time_forward_average"] = np.mean(self.metrics["times_forward"]) +# self.metrics["memory_usage_average"] = np.mean(self.metrics["memory_usage"]) + +# self.metrics['throughput'] = generation_steps / np.sum(self.metrics["times_forward"]) + +# return tokens_new, decoded + + + + +# def main(): +# prompt = 'My name is' + +# andromeda = EvalAndromeda(path='checkpoints/step_44927_6656/pytorch_model.bin') + +# num_params = Andromeda.get_num_params() +# print(f'The model has {num_params} parameters') + +# _, output = Andromeda.generate(prompt) + +# for metric, value in Andromeda.metrics.items(): +# print(f'{metric}: {value}\n') + +# print('\n') + +# print(output) + + + + + + +def main(): + allow_ops_in_compiled_graph() + + torch.hub._validate_not_a_forked_repo = lambda a, b, c: True + + parser = argparse.ArgumentParser(description="Generate text using Andromeda model") + parser.add_argument("prompt", type=str, help="Text prompt to generate text") + parser.add_argument( + "--seq_len", type=int, default=256, help="Sequence length for generated text" + ) + parser.add_argument( + "--temperature", type=float, default=0.8, help="Sampling temperature" + ) + parser.add_argument( + "--filter_thres", type=float, default=0.9, help="Filter threshold for sampling" + ) + parser.add_argument( + "--model", + type=str, + default="andromeda-e-1", + help="Model to use for generation", + ) + + parser.add_argument( + "--dtype", + type=str, + default="fp32", + help="Data type for the model: 'bf16', or 'fp32'", + ) + + args = parser.parse_args() + + + dtype = torch.float32 + if args.dtype == 'bf16': + dtype = torch.bfloat16 + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + #need to submit to torch hub + model = torch.hub.load("apacai/andromeda", args.model).to(device).to(dtype) + + opt_model = torch.compile(model, backend="hidet") + + tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b") + + encoded_text = tokenizer(args.prompt, return_tensors="pt") + + output_tensor = opt_model.generate( + seq_len=args.seq_len, + prompt=encoded_text["input_ids"].to(device), + temperature=args.temperature, + filter_thres=args.filter_thres, + pad_value=0.0, + eos_token=tokenizer.eos_token_id, + return_seq_without_prompt=False, + use_tqdm=True, + ) + + decoded_output = tokenizer.batch_decode(output_tensor, skip_special_tokens=True) + + return decoded_output + + +if __name__ == "__main__": + generated_text = main() + for text in generated_text: + print(f"{text}") \ No newline at end of file 
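+# Example invocation (illustrative only; assumes the "apacai/andromeda" torch.hub entry
+# and the hidet compile backend are actually available in your environment):
+#   python Andromeda/inference.py "My name is" --seq_len 128 --temperature 0.7 --dtype bf16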
diff --git a/Andromeda/model.py b/Andromeda/model.py new file mode 100644 index 0000000000000000000000000000000000000000..b8bcda424e06113e6f7ef853cc713cace36f3926 --- /dev/null +++ b/Andromeda/model.py @@ -0,0 +1,118 @@ +from torch.nn import Module +from Andromeda.core.transformer import Transformer, AutoregressiveWrapper, AndromedaEmbedding, Decoder +from transformers import AutoTokenizer + +class AndromedaTokenizer: + def __init__(self): + self.tokenizer= AutoTokenizer.from_pretrained( + "EleutherAI/gpt-neox-20b", + eos_token="", + pad_token="", + extra_ids=0, + model_max_length=8192 + ) + + def tokenize_texts(self, texts): + return self.tokenizer(texts, return_tensors='pt', padding=True, truncation=True).input_ids + + def decode(self, texts): + return self.tokenizer.decode(texts) + + def __len__(self): + num_tokens = len(self.tokenizer) + return num_tokens + + + +class Andromeda(Module): + """ + Andromeda is a transformer-based model architecture. It initializes with + a Transformer and AutoregressiveWrapper with default or user-specified parameters. + """ + def __init__(self, + num_tokens=50432, + max_seq_len=8192, + dim=2560, + depth=32, + dim_head=128, + heads=24, + use_abs_pos_emb=False, + alibi_pos_bias=True, + alibi_num_heads=12, + rotary_xpos=True, + attn_flash=True, + # shift_tokens=1, + attn_one_kv_head=True, # multiquery attention + qk_norm=True, + attn_qk_norm=True, + attn_qk_norm_dim_scale=True, + embedding_provider=AndromedaEmbedding()): + """ + Initialize the model with specified or default parameters. + Args: + - num_tokens: Number of tokens in the vocabulary + - max_seq_len: Maximum sequence length + - dim: Dimension of the model + - depth: Depth of the model + - dim_head: Dimension of the model head + - heads: Number of heads + - use_abs_pos_emb: Whether to use absolute position embedding + - alibi_pos_bias: Alibi position bias + - alibi_num_heads: Number of alibi heads + - rotary_xpos: Rotary position + - attn_flash: Attention flash + - deepnorm: Deep normalization + - shift_tokens: Number of tokens to shift + - attn_one_kv_head: Attention one key/value head + - qk_norm: Query-key normalization + - attn_qk_norm: Attention query-key normalization + - attn_qk_norm_dim_scale: Attention query-key normalization dimension scale + - embedding_provider: Embedding provider module + """ + super().__init__() + + try: + self.Andromeda = Transformer( + num_tokens=num_tokens, + max_seq_len=max_seq_len, + use_abs_pos_emb=use_abs_pos_emb, + embedding_provider=embedding_provider, + attn_layers=Decoder( + dim=dim, + depth=depth, + dim_head=dim_head, + heads=heads, + alibi_pos_bias=alibi_pos_bias, + alibi_num_heads=alibi_num_heads, + rotary_xpos=rotary_xpos, + attn_flash=attn_flash, + # deepnorm=deepnorm, + # shift_tokens=shift_tokens, + attn_one_kv_head=attn_one_kv_head, + qk_norm=qk_norm, + attn_qk_norm=attn_qk_norm, + attn_qk_norm_dim_scale=attn_qk_norm_dim_scale + ) + ) + + self.decoder = AutoregressiveWrapper(self.Andromeda) + + except Exception as e: + print("Failed to initialize Andromeda: ", e) + raise + + def forward(self, text_tokens, **kwargs): + """ + Forward pass through the model. It expects the input text_tokens. 
+ Args: + - text_tokens: Input tokens + - kwargs: Other arguments + Returns: + - output from the decoder + """ + try: + model_input = self.decoder.forward(text_tokens)[0] + return self.decoder(model_input, padded_x=model_input[0]) + except Exception as e: + print("Failed in forward method: ", e) + raise diff --git a/Andromeda/old/__init__.py b/Andromeda/old/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Andromeda/old/sophia.py b/Andromeda/old/sophia.py new file mode 100644 index 0000000000000000000000000000000000000000..35a59c99b4b9212035479711fd16aef12d0769d6 --- /dev/null +++ b/Andromeda/old/sophia.py @@ -0,0 +1,200 @@ +import torch +from torch import Tensor +from torch.optim.optimizer import Optimizer +from typing import List + + +class SophiaG(Optimizer): + def __init__(self, params, lr=1e-4, betas=(0.965, 0.99), rho = 0.04, + weight_decay=1e-1, *, maximize: bool = False, + capturable: bool = False): + if not 0.0 <= lr: + raise ValueError("Invalid learning rate: {}".format(lr)) + if not 0.0 <= betas[0] < 1.0: + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + if not 0.0 <= rho: + raise ValueError("Invalid rho parameter at index 1: {}".format(rho)) + if not 0.0 <= weight_decay: + raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) + defaults = dict(lr=lr, betas=betas, rho=rho, + weight_decay=weight_decay, + maximize=maximize, capturable=capturable) + super(SophiaG, self).__init__(params, defaults) + + def __setstate__(self, state): + super().__setstate__(state) + for group in self.param_groups: + group.setdefault('maximize', False) + group.setdefault('capturable', False) + state_values = list(self.state.values()) + step_is_tensor = (len(state_values) != 0) and torch.is_tensor(state_values[0]['step']) + if not step_is_tensor: + for s in state_values: + s['step'] = torch.tensor(float(s['step'])) + + @torch.no_grad() + def update_hessian(self): + for group in self.param_groups: + beta1, beta2 = group['betas'] + for p in group['params']: + if p.grad is None: + continue + state = self.state[p] + + if len(state) == 0: + state['step'] = torch.zeros((1,), dtype=torch.float, device=p.device) \ + if self.defaults['capturable'] else torch.tensor(0.) + state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format) + state['hessian'] = torch.zeros_like(p, memory_format=torch.preserve_format) + + if 'hessian' not in state.keys(): + state['hessian'] = torch.zeros_like(p, memory_format=torch.preserve_format) + + state['hessian'].mul_(beta2).addcmul_(p.grad, p.grad, value=1 - beta2) + + + @torch.no_grad() + def step(self, closure=None, bs=5120): + loss = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + + for group in self.param_groups: + params_with_grad = [] + grads = [] + exp_avgs = [] + state_steps = [] + hessian = [] + beta1, beta2 = group['betas'] + + for p in group['params']: + if p.grad is None: + continue + params_with_grad.append(p) + + if p.grad.is_sparse: + raise RuntimeError('Hero does not support sparse gradients') + grads.append(p.grad) + state = self.state[p] + # State initialization + if len(state) == 0: + state['step'] = torch.zeros((1,), dtype=torch.float, device=p.device) \ + if self.defaults['capturable'] else torch.tensor(0.) 
+ state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format) + state['hessian'] = torch.zeros_like(p, memory_format=torch.preserve_format) + + if 'hessian' not in state.keys(): + state['hessian'] = torch.zeros_like(p, memory_format=torch.preserve_format) + + exp_avgs.append(state['exp_avg']) + state_steps.append(state['step']) + hessian.append(state['hessian']) + + if self.defaults['capturable']: + bs = torch.ones((1,), dtype=torch.float, device=p.device) * bs + + sophiag(params_with_grad, + grads, + exp_avgs, + hessian, + state_steps, + bs=bs, + beta1=beta1, + beta2=beta2, + rho=group['rho'], + lr=group['lr'], + weight_decay=group['weight_decay'], + maximize=group['maximize'], + capturable=group['capturable']) + + return loss + +def sophiag(params: List[Tensor], + grads: List[Tensor], + exp_avgs: List[Tensor], + hessian: List[Tensor], + state_steps: List[Tensor], + capturable: bool = False, + *, + bs: int, + beta1: float, + beta2: float, + rho: float, + lr: float, + weight_decay: float, + maximize: bool): + + if not all(isinstance(t, torch.Tensor) for t in state_steps): + raise RuntimeError("API has changed, `state_steps` argument must contain a list of singleton tensors") + + + func = _single_tensor_sophiag + + func(params, + grads, + exp_avgs, + hessian, + state_steps, + bs=bs, + beta1=beta1, + beta2=beta2, + rho=rho, + lr=lr, + weight_decay=weight_decay, + maximize=maximize, + capturable=capturable) + +def _single_tensor_sophiag(params: List[Tensor], + grads: List[Tensor], + exp_avgs: List[Tensor], + hessian: List[Tensor], + state_steps: List[Tensor], + *, + bs: int, + beta1: float, + beta2: float, + rho: float, + lr: float, + weight_decay: float, + maximize: bool, + capturable: bool): + + for i, param in enumerate(params): + grad = grads[i] if not maximize else -grads[i] + exp_avg = exp_avgs[i] + hess = hessian[i] + step_t = state_steps[i] + + if capturable: + assert param.is_cuda and step_t.is_cuda and bs.is_cuda + + if torch.is_complex(param): + grad = torch.view_as_real(grad) + exp_avg = torch.view_as_real(exp_avg) + hess = torch.view_as_real(hess) + param = torch.view_as_real(param) + + # update step + step_t += 1 + + # Perform stepweight decay + param.mul_(1 - lr * weight_decay) + + # Decay the first and second moment running average coefficient + exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1) + + if capturable: + step_size = lr + step_size_neg = step_size.neg() + + ratio = (exp_avg.abs() / (rho * bs * hess + 1e-15)).clamp(None,1) + param.addcmul_(exp_avg.sign(), ratio, value=step_size_neg) + else: + step_t.item() + step_size_neg = - lr + + ratio = (exp_avg.abs() / (rho * bs * hess + 1e-15)).clamp(None,1) + param.addcmul_(exp_avg.sign(), ratio, value=step_size_neg) \ No newline at end of file diff --git a/Andromeda/old/training.py b/Andromeda/old/training.py new file mode 100644 index 0000000000000000000000000000000000000000..568d6b40ff3ec7074605e4c9a6b7e41aa410f782 --- /dev/null +++ b/Andromeda/old/training.py @@ -0,0 +1,294 @@ +#quantization + paralleism +import time + +import torch +from accelerate.utils import set_seed +from datasets import load_dataset +from torch.nn import CrossEntropyLoss +from torch.utils.data import DataLoader +from transformers import default_data_collator, get_linear_schedule_with_warmup +from accelerate import Accelerator + +from rich.progress import Progress + + +from lion_pytorch import Lion +# from x_transformers import Transformer, Decoder, AutoregressiveWrapper +from optimus_prim import Transformer, Decoder, 
AutoregressiveWrapper + +from torch.nn.parallel import DataParallel, DistributedDataParallel +import torch.distributed as dist + +from torch.distributed.fsdp import ( + FullyShardedDataParallel, + CPUOffload, +) + +from torch.distributed.fsdp.wrap import ( + default_auto_wrap_policy, +) + +from transformers import AutoTokenizer + +#logging +import boto3 + + +#training +import wandb + +from torch.utils.tensorboard import SummaryWriter + +class CustomGPTNeoXTokenizer: + def __init__(self): + self.tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b") + + def tokenize(self, text): + return self.tokenizer(text, return_tensors="pt", truncation=True, padding=True) + +custom_tokenizer = CustomGPTNeoXTokenizer() + +Andromeda = Transformer( + num_tokens=64007, + max_seq_len=8192, + use_abs_pos_emb = False, + tokenizer=custom_tokenizer, + attn_layers = Decoder( + dim=2048, + depth=6, + heads=16, + alibi_pos_bias=True, + alibi_num_heads=8, + rotary_xpos=True, + attn_flash = True, + deepnorm=True, + shift_tokens=1, + attn_one_kv_head = True, + qk_norm=True + ) +) + +Andromeda = AutoregressiveWrapper(Andromeda) + + + +AWS_ACCESS_KEY_ID="" +AWS_SECRET_ACCESS_KEY="d" + + +def save_model_to_s3(model, bucket_name, key_prefix, step): + s3 = boto3.client('s3', aws_access_key_id=AWS_ACCESS_KEY_ID, aws_secret_access_key=AWS_SECRET_ACCESS_KEY) + model_path = f"checkpoint_at_step_{step}.pt" + torch.save(model.state_dict(), model_path) + s3.upload_file(model_path, bucket_name, f"{key_prefix}/{model_path}") + + + +def count_number_of_parameters(model, only_trainable: bool = True) -> int: + if only_trainable: + num_params: int = sum(p.numel() + for p in model.parameters() if p.requires_grad) + else: + num_params: int = sum(p.numel() for p in model.parameters() if p) + return int(num_params) + + + +def prep_sample(sample): + title = sample["title"] + text = sample["text"] + return { + "title": title, + "text": text + } + + +def train(args): + + if args.use_ddp: + dist.init_process_group(backend="nccl") + + + accelerator = Accelerator( + mixed_precision="fp16", + gradient_accumulation_steps=1, + ) + + # If passed along, set the training seed now. 
+ if args.seed is not None: + set_seed(args.seed) + + #v1 + model = Andromeda() + if args.use_ddp: + model = DistributedDataParallel(model) + else: + model = DataParallel(model) + + fsdp_model = FullyShardedDataParallel( + model(), + fsdp_auto_wrap_policy=default_auto_wrap_policy, + cpu_offload=CPUOffload(offload_params=True), + ) + + fsdp_model = fsdp_model.to(accelerator.device) + + #device count + if torch.cuda.device_count() > 1: + print(f"Let's use ${torch.cuda.device_count()} GPUS") + + + + + optimizer = Lion(model.parameters(), lr=args.learning_rate / 3, weight_decay=args.weight_decay * 3) + + lr_scheduler = get_linear_schedule_with_warmup( + optimizer=optimizer, + num_warmup_steps=args.warmup_steps, + num_training_steps=args.max_steps, + ) + + # tokenizer = KosmosTokenizer() + + #====================> load data #====================> load data #====================> load data + + + dataset = load_dataset("the_pile_books3") + + # dataset = dataset.map(prep_sample, num_proc=8) + dataset = dataset.map(prep_sample, num_proc=8) + + + #new removed columns + remove_columns = ['title'] + + + dataset = dataset.map(Andromeda.decoder.tokenizer, batched=True, + batch_size=128, remove_columns=remove_columns) + + train_dataloader = DataLoader( + dataset, collate_fn=default_data_collator, batch_size=args.batch_size, pin_memory=True + ) + + + + #====================> load data #====================> load data #====================> load data #====================> load data + + fsdp_model, train_dataloader, optimizer, lr_scheduler = accelerator.prepare(fsdp_model, train_dataloader, optimizer, + lr_scheduler) + fsdp_model.train() + accelerator.register_for_checkpointing(lr_scheduler) + + accelerator.print( + f"Number of parameters: {count_number_of_parameters(model):,}") + accelerator.print( + f"Number of trainable parameters: {count_number_of_parameters(model, only_trainable=True):,}") + + # Log model and optimizer parameters to wandb + accelerator.init_trackers(project_name="Andromeda") + + #wandb + wandb.init(project="Andromeda", config=args) + + #init tensorboard writer + tb_writer = SummaryWriter() + + + train_loader = iter(train_dataloader) + epoch_loss = 0 + total_loss = 0 + start_time = time.time() + + with Progress() as progress: + task = progress.add_task("[red]Training...", total=args.max_steps) + for step in range(0, args.max_steps): + batch_start = time.time() + batch = next(train_loader) + outputs = fsdp_model(**batch, self_attn_padding_mask=batch["attention_mask"]) + # Shift so that tokens < n predict n + outputs = torch.cat([outputs[:, :1], outputs[:, 67:]], dim=1).contiguous() + # shift_logits = outputs[..., :-1, :].contiguous() + # shift_labels = batch["labels"][..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + one_hot_labels = torch.nn.functional.one_hot(batch["labels"][:, 1:], num_classes=32002).float() + loss = loss_fct(outputs[:,:-1], one_hot_labels) + + epoch_loss += loss.detach().float() + + accelerator.backward(loss) + optimizer.step() + optimizer.zero_grad() + + batch_end = time.time() + logs = { + "loss": loss.item(), + "perplexity": torch.exp(loss).item(), + "lr": lr_scheduler.get_last_lr()[0], + "examples": args.batch_size * (step + 1), + "examples_per_second": args.batch_size / (batch_end - batch_start), + } + if step % args.log_every == args.log_every - 1: + #log metrics to wandb + wandb.log(logs, step=step) + + #log metrics to tensorboard + # Log metrics to TensorBoard + tb_writer.add_scalar("loss", logs["loss"], step) + 
tb_writer.add_scalar("perplexity", logs["perplexity"], step) + tb_writer.add_scalar("lr", logs["lr"], step) + tb_writer.add_scalar("examples", logs["examples"], step) + tb_writer.add_scalar("examples_per_second", logs["examples_per_second"], step) + + #accelerator + accelerator.log(logs, step=step) + progress.update(task, advance=1, description=f"Step Loss: {loss.item():.5f} " + f"| Mean Loss: {(total_loss + epoch_loss) / step:.5f} " + f"| Mean PPL: {torch.exp((total_loss + epoch_loss) / step):.2f} " + f"| Examples: {args.batch_size * (step + 1)} " + f"| Examples/s: {args.batch_size / (batch_end - batch_start):.2f} " + f"| Elapsed: {time.strftime('%H:%M:%S', time.gmtime(time.time() - start_time))}") + + if step % args.save_every == args.save_every - 1: + train_epoch_loss = epoch_loss / args.save_every + total_loss += epoch_loss + epoch_loss = 0 + + accelerator.log({ + "train_ppl": torch.exp(train_epoch_loss), + "train_epoch_loss": train_epoch_loss, + }, step=step) + + progress.print(f"Saving checkpoint at step {step}...") + accelerator.save_state( + f"{args.checkpoint_dir}/checkpoint_at_step_{step}/") + + #save the model weights to s3 + save_model_to_s3(model, "kosmostraining", "kosmosv1/checkpoints", step) + print(f"Saved to s3: {save_model_to_s3} ") + + #finish tensorboard writer + tb_writer.close() + + #finish wnabd run + wandb.finish() + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument("--checkpoint_dir", type=str, default="checkpoints") + parser.add_argument("--learning_rate", type=float, default=1e-5) + parser.add_argument("--weight_decay", type=float, default=0.01) + parser.add_argument("--warmup_steps", type=int, default=0) + parser.add_argument("--max_steps", type=int, default=100000) + parser.add_argument("--batch_size", type=int, default=4) + parser.add_argument("--log_every", type=int, default=1) + parser.add_argument("--save_every", type=int, default=100) + parser.add_argument("--seed", type=int, default=None) + parser.add_argument("--use_ddp", action="store_true", help="Use DistributedDataParallel") + + args = parser.parse_args() + + train(args) \ No newline at end of file diff --git a/Andromeda/old/training_1.py b/Andromeda/old/training_1.py new file mode 100644 index 0000000000000000000000000000000000000000..0ebac60e7e0ffeb9d11a140652a914540a687fc5 --- /dev/null +++ b/Andromeda/old/training_1.py @@ -0,0 +1,350 @@ +import math +import multiprocessing +import os + +from datetime import timedelta +from functools import partial +from itertools import chain + + +from accelerate import Accelerator +from accelerate.utils import InitProcessGroupKwargs + +from datasets import concatenate_datasets, load_dataset + +from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import ( + CheckpointImpl, apply_activation_checkpointing, checkpoint_wrapper) + +from torch.utils.data import DataLoader + +from tqdm import tqdm + +from transformers import (AutoTokenizer, default_data_collator, + get_cosine_schedule_with_warmup, + get_linear_schedule_with_warmup, set_seed) + + +# from stable_adamw import StableAdamWUnfused +# sd + +from optimus_prime import Transformer, Decoder, AutoregressiveWrapper +from optimus_prime import AndromedaEmbedding + +from lion_pytorch import Lion + + +# constants + +class CFG: + BATCH_SIZE: int = 3 # 3 + GRADIENT_ACCUMULATE_EVERY: int = 1 + SEED: int = 42 + LEARNING_RATE: float = 1e-4 + WEIGHT_DECAY: float = 1e-2 + SEQ_LEN: int = 8192 # 8192 + NUM_CPU: int = multiprocessing.cpu_count() + 
USE_PRETOKENIZED: bool = True + USE_ACTIVATION_CHECKPOINTING: bool = True + RESUME_FROM_CHECKPOINT: str = None + CHECKPOINTING_STEPS: int = 1000 + OUTPUT_DIR: str = "output" + ENTITY_NAME: str = "wanb" # Put your wandb username here + +# deepspeed_plugin = DeepSpeedPlugin(zero_stage=2, gradient_accumulation_steps=CFG.GRADIENT_ACCUMULATE_EVERY) + +# helpers + +def print_num_params(model, accelerator: Accelerator): + n_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + accelerator.print(f"Number of parameters in model: {n_params}") + +def fsdp_activation_checkpointing( + model, accelerator: Accelerator, offload_to_cpu=False +): + + accelerator.print("Using FSDP activation checkpointing") + + # check_fn = lambda submodule: isinstance(submodule, ParallelTransformerBlock) + + non_reentrant_wrapper = partial( + checkpoint_wrapper, + offload_to_cpu=offload_to_cpu, + checkpoint_impl=CheckpointImpl.NO_REENTRANT, + ) + + apply_activation_checkpointing( + model, checkpoint_wrapper_fn=non_reentrant_wrapper) + + +def get_lr_scheduler_with_warmup( + optimizer, scheduler_type, num_warmup_steps, max_train_steps, grad_accumulate_every +): + NUM_WARMUP_STEPS = num_warmup_steps + GRADIENT_ACCUMULATE_EVERY = grad_accumulate_every + + if scheduler_type == "linear": + return get_linear_schedule_with_warmup( + optimizer=optimizer, + num_warmup_steps=NUM_WARMUP_STEPS * GRADIENT_ACCUMULATE_EVERY, + num_training_steps=max_train_steps * GRADIENT_ACCUMULATE_EVERY + ) + elif scheduler_type == "cosine": + return get_cosine_schedule_with_warmup( + optimizer=optimizer, + num_warmup_steps=NUM_WARMUP_STEPS * GRADIENT_ACCUMULATE_EVERY, + num_training_steps=max_train_steps * GRADIENT_ACCUMULATE_EVERY + ) + else: + raise ValueError( + "Invalid scheduler_type. Expected 'linear' or 'cosine', got: {}".format( + scheduler_type + ) + ) + + +def build_dataloaders(): + tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b") + dataset = load_dataset("openwebtext", split="train") + + tokenized_dataset = dataset.map( + lambda example: tokenizer([t + tokenizer.eos_token for t in example["text"]]), + batched=True, + num_proc=CFG.NUM_CPU, + remove_columns=["text"], + ) + + block_size = CFG.SEQ_LEN + + # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size. + def group_texts(examples): + # Concatenate all texts. + concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()} + total_length = len(concatenated_examples[list(examples.keys())[0]]) + # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can + # customize this part to your needs. + if total_length >= block_size: + total_length = (total_length // block_size) * block_size + # Split by chunks of max_len. 
+ result = { + k: [t[i : i + block_size] for i in range(0, total_length, block_size)] + for k, t in concatenated_examples.items() + } + return result + + train_dataset = tokenized_dataset.map( + group_texts, batched=True, num_proc=CFG.NUM_CPU, + ) + + return train_dataset + +# main + +def TrainAndromeda(): + # accelerator + + timeout = InitProcessGroupKwargs(timeout=timedelta(seconds=1_000_000)) + + accelerator = Accelerator( + gradient_accumulation_steps=CFG.GRADIENT_ACCUMULATE_EVERY, + mixed_precision="fp16", + log_with="wandb", + kwargs_handlers=[timeout], + deepspeed_plugin=deepspeed_plugin + ) + + accelerator.init_trackers( + project_name="andromeda", + config={ + "batch_size": CFG.BATCH_SIZE, + "gradient_accumulate_every": CFG.GRADIENT_ACCUMULATE_EVERY, + "learning_rate": CFG.LEARNING_RATE, + "seq_len": CFG.SEQ_LEN, + }, + init_kwargs={"wandb": {"entity": CFG.ENTITY_NAME}} + ) + + accelerator.print(f"Total GPUS: {accelerator.num_processes}") + + # set seed + + set_seed(CFG.SEED) + + # Create the tokenizer + + tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b") + + # instantiate andromeda + + model = Transformer( + num_tokens=64007, + max_seq_len=8192, + use_abs_pos_emb=False, + tokenizer=tokenizer, # ! + embedding_provider=AndromedaEmbedding(), + attn_layers = Decoder( + dim=128, # 2048 + depth=8, # 16 + dim_head=128, + heads=8, + alibi_pos_bias=True, + alibi_num_heads=4, + rotary_xpos=True, + attn_flash = True, + deepnorm=True, + shift_tokens=1, + attn_one_kv_head = True, + qk_norm=True, + attn_qk_norm=True, + attn_qk_norm_dim_scale=True # set this to True, in addition to `attn_qk_norm = True` + ) + ).to(accelerator.device) + + model = AutoregressiveWrapper(model).to(accelerator.device) + + optim = Lion(model.parameters(), lr=1e-4, weight_decay=1e-2, use_triton=True) + + print_num_params(model, accelerator) + + if CFG.USE_ACTIVATION_CHECKPOINTING: + fsdp_activation_checkpointing(model, accelerator) + + # dataloaders + + if CFG.USE_PRETOKENIZED: + d0 = load_dataset("conceptofmind/c4_0-to-20_neox_with_eos_8k", split="train") + d1 = load_dataset("conceptofmind/c4_21-to-40_neox_with_eos_8k", split="train") + d2 = load_dataset("conceptofmind/c4_41-to-60_neox_with_eos_8k", split="train") + d3 = load_dataset("conceptofmind/c4_61-to-80_neox_with_eos_8k", split="train") + d4 = load_dataset("conceptofmind/c4_81-to-100_neox_with_eos_8k", split="train") + + train_dataset = concatenate_datasets([d0, d1, d2, d3, d4]) + else: + train_dataset = build_dataloaders() + + train_loader = DataLoader( + train_dataset, batch_size=CFG.BATCH_SIZE, collate_fn=default_data_collator, + ) + + max_train_steps = math.ceil(len(train_loader) / CFG.GRADIENT_ACCUMULATE_EVERY) + accelerator.print(f"Max train steps: {max_train_steps}") + + # lr scheduler + # We cant decide on an actual number + + NUM_WARMUP_STEPS = int(max_train_steps * 0.01) + accelerator.print(f"Num warmup steps: {NUM_WARMUP_STEPS}") + + lr_scheduler = get_lr_scheduler_with_warmup( + optimizer=optim, + scheduler_type="cosine", + num_warmup_steps=NUM_WARMUP_STEPS, + max_train_steps=max_train_steps, + grad_accumulate_every=CFG.GRADIENT_ACCUMULATE_EVERY + ) + + # prepare + + model, optim, train_loader, lr_scheduler = accelerator.prepare( + model, optim, train_loader, lr_scheduler + ) + + # checkpoint scheduler + + accelerator.register_for_checkpointing(lr_scheduler) + + # I do not know why Huggingface recommends recalculation of max_train_steps + + max_train_steps = math.ceil(len(train_loader) / CFG.GRADIENT_ACCUMULATE_EVERY) + 
accelerator.print(f"Max train steps recalculated: {max_train_steps}") + + # Total batch size for logging + + total_batch_size = ( + CFG.BATCH_SIZE * accelerator.num_processes * CFG.GRADIENT_ACCUMULATE_EVERY + ) + accelerator.print(f"Total batch size: {total_batch_size}") + + # resume training + + progress_bar = tqdm( + range(max_train_steps), disable=not accelerator.is_local_main_process + ) + completed_steps = 0 + + if CFG.RESUME_FROM_CHECKPOINT: + if CFG.RESUME_FROM_CHECKPOINT is not None or CFG.RESUME_FROM_CHECKPOINT != "": + accelerator.print(f"Resuming from checkpoint {CFG.RESUME_FROM_CHECKPOINT}") + accelerator.load_state(CFG.RESUME_FROM_CHECKPOINT) + path = os.path.basename(CFG.RESUME_FROM_CHECKPOINT) + + training_difference = os.path.splitext(path)[0] + + # need to multiply `gradient_accumulation_steps` to reflect real steps + resume_step = ( + int(training_difference.replace("step_", "")) + * CFG.GRADIENT_ACCUMULATE_EVERY + ) + + if CFG.RESUME_FROM_CHECKPOINT and resume_step is not None: + train_loader = accelerator.skip_first_batches(train_loader, resume_step) + completed_steps += resume_step + progress_bar.update(resume_step) + + # training + + model.train() + + for step, batch in enumerate(train_loader): + with accelerator.accumulate(model): + inputs = batch["input_ids"].to(accelerator.device) + _, loss = model(inputs, return_loss=True) + accelerator.backward(loss) + + # print(loss.item()) + + accelerator.log({"loss": loss.item()}, step=step) + + if accelerator.sync_gradients: + accelerator.clip_grad_norm_(model.parameters(), 0.5) + + optim.step() + lr_scheduler.step() + optim.zero_grad() + + if accelerator.sync_gradients: + progress_bar.update(1) + completed_steps += 1 + + if isinstance(CFG.CHECKPOINTING_STEPS, int): + if completed_steps % CFG.CHECKPOINTING_STEPS == 0: + output_dir = f"step_{completed_steps }" + if CFG.OUTPUT_DIR is not None: + output_dir = os.path.join(CFG.OUTPUT_DIR, output_dir) + accelerator.save_state(output_dir) + + if completed_steps >= max_train_steps: + break + + # end training + + accelerator.print("Training Finished") + accelerator.end_training() + + # save final model + + # accelerator.print(f"Saving model to {CFG.OUTPUT_DIR}") + if CFG.OUTPUT_DIR is not None: + base_path = f'{CFG.OUTPUT_DIR}/final' + + if not os.path.exists(base_path): + os.makedirs(base_path) + + accelerator.wait_for_everyone() + unwrapped_model = accelerator.unwrap_model(model) + with accelerator.main_process_first(): + accelerator.save( + unwrapped_model.state_dict(), os.path.join(base_path, 'final_model.pt') + ) + +if __name__ == "__main__": + TrainAndromeda() \ No newline at end of file diff --git a/Andromeda/old/training_sophia.py b/Andromeda/old/training_sophia.py new file mode 100644 index 0000000000000000000000000000000000000000..51386ab1e9fc09a2e992d5a2bc3175a284d750cb --- /dev/null +++ b/Andromeda/old/training_sophia.py @@ -0,0 +1,369 @@ +import math +import multiprocessing +import os + +from datetime import timedelta +from functools import partial +from itertools import chain + + +from accelerate import Accelerator +from accelerate.utils import InitProcessGroupKwargs + +from datasets import concatenate_datasets, load_dataset + +from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import ( + CheckpointImpl, apply_activation_checkpointing, checkpoint_wrapper) + +from torch.utils.data import DataLoader + +from tqdm import tqdm + +from transformers import (AutoTokenizer, default_data_collator, + get_cosine_schedule_with_warmup, + get_linear_schedule_with_warmup, 
set_seed) + + +# from stable_adamw import StableAdamWUnfused +# sd + +from optimus_prime import Transformer, Decoder, AutoregressiveWrapper +from optimus_prime import AndromedaEmbedding + +from sophia import SophiaG + +# constants + +class CFG: + BATCH_SIZE: int = 3 # 3 + GRADIENT_ACCUMULATE_EVERY: int = 1 + SEED: int = 42 + LEARNING_RATE: float = 1e-4 + WEIGHT_DECAY: float = 1e-2 + SEQ_LEN: int = 8192 # 8192 + NUM_CPU: int = multiprocessing.cpu_count() + USE_PRETOKENIZED: bool = True + USE_ACTIVATION_CHECKPOINTING: bool = True + RESUME_FROM_CHECKPOINT: str = None + CHECKPOINTING_STEPS: int = 1000 + OUTPUT_DIR: str = "output" + ENTITY_NAME: str = "nicolo" # Put your wandb username here + +# helpers + +def print_num_params(model, accelerator: Accelerator): + n_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + accelerator.print(f"Number of parameters in model: {n_params}") + +def fsdp_activation_checkpointing( + model, accelerator: Accelerator, offload_to_cpu=False +): + + accelerator.print("Using FSDP activation checkpointing") + + # check_fn = lambda submodule: isinstance(submodule, ParallelTransformerBlock) + + non_reentrant_wrapper = partial( + checkpoint_wrapper, + offload_to_cpu=offload_to_cpu, + checkpoint_impl=CheckpointImpl.NO_REENTRANT, + ) + + apply_activation_checkpointing( + model, checkpoint_wrapper_fn=non_reentrant_wrapper) + + +def get_lr_scheduler_with_warmup( + optimizer, scheduler_type, num_warmup_steps, max_train_steps, grad_accumulate_every +): + NUM_WARMUP_STEPS = num_warmup_steps + GRADIENT_ACCUMULATE_EVERY = grad_accumulate_every + + if scheduler_type == "linear": + return get_linear_schedule_with_warmup( + optimizer=optimizer, + num_warmup_steps=NUM_WARMUP_STEPS * GRADIENT_ACCUMULATE_EVERY, + num_training_steps=max_train_steps * GRADIENT_ACCUMULATE_EVERY + ) + elif scheduler_type == "cosine": + return get_cosine_schedule_with_warmup( + optimizer=optimizer, + num_warmup_steps=NUM_WARMUP_STEPS * GRADIENT_ACCUMULATE_EVERY, + num_training_steps=max_train_steps * GRADIENT_ACCUMULATE_EVERY + ) + else: + raise ValueError( + "Invalid scheduler_type. Expected 'linear' or 'cosine', got: {}".format( + scheduler_type + ) + ) + + +def build_dataloaders(): + tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b") + + content_column = 'text' + + dataset = load_dataset("sentiment140", split="train") + dataset = dataset.remove_columns([col for col in dataset.column_names if col != content_column]) + + tokenized_dataset = dataset.map( + lambda example: tokenizer([t + tokenizer.eos_token for t in example[content_column]]), + batched=True, + num_proc=CFG.NUM_CPU, + remove_columns=[content_column] + ) + + block_size = CFG.SEQ_LEN + + # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size. + def group_texts(examples): + # Concatenate all texts. + concatenated_examples = {} + + for k in examples.keys(): + concatenated_examples[k] = list(chain(*examples[k])) + + total_length = len(concatenated_examples[list(examples.keys())[0]]) + # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can + # customize this part to your needs. + if total_length >= block_size: + total_length = (total_length // block_size) * block_size + # Split by chunks of max_len. 
+ result = { + k: [t[i : i + block_size] for i in range(0, total_length, block_size)] + for k, t in concatenated_examples.items() + } + + return result + + train_dataset = tokenized_dataset.map( + group_texts, batched=True, num_proc=CFG.NUM_CPU + ) + + return train_dataset + +# main + +def TrainAndromeda(): + # accelerator + + timeout = InitProcessGroupKwargs(timeout=timedelta(seconds=1_000_000)) + + accelerator = Accelerator( + gradient_accumulation_steps=CFG.GRADIENT_ACCUMULATE_EVERY, + mixed_precision="fp16", # Switch to bf16 + log_with="wandb", + kwargs_handlers=[timeout] + ) + + accelerator.init_trackers( + project_name="andromeda", + config={ + "batch_size": CFG.BATCH_SIZE, + "gradient_accumulate_every": CFG.GRADIENT_ACCUMULATE_EVERY, + "learning_rate": CFG.LEARNING_RATE, + "seq_len": CFG.SEQ_LEN, + }, + init_kwargs={"wandb": {"entity": CFG.ENTITY_NAME}} + ) + + accelerator.print(f"Total GPUS: {accelerator.num_processes}") + + # set seed + + set_seed(CFG.SEED) + + # Create the tokenizer + + tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b") + + # instantiate andromeda + + model = Transformer( + num_tokens=64007, + max_seq_len=8192, + use_abs_pos_emb=False, + tokenizer=tokenizer, # ! + embedding_provider=AndromedaEmbedding(), + attn_layers = Decoder( + dim=128, # 2048 + depth=8, # 16 + dim_head=128, + heads=8, + alibi_pos_bias=True, + alibi_num_heads=4, + rotary_xpos=True, + attn_flash = True, + # deepnorm=True, + shift_tokens=1, + attn_one_kv_head = True, + qk_norm=True, + attn_qk_norm=True, + attn_qk_norm_dim_scale=True # set this to True, in addition to `attn_qk_norm = True` + ) + ).to(accelerator.device) + + model = AutoregressiveWrapper(model).to(accelerator.device) + + #optim = Lion(model.parameters(), lr=1e-4, weight_decay=1e-2) + optim = SophiaG(model.parameters(), lr=1e-5, weight_decay=1e-1) + + print_num_params(model, accelerator) + + if CFG.USE_ACTIVATION_CHECKPOINTING: + fsdp_activation_checkpointing(model, accelerator) + + # dataloaders + + if CFG.USE_PRETOKENIZED: + d0 = load_dataset("conceptofmind/c4_0-to-20_neox_with_eos_8k", split="train") + d1 = load_dataset("conceptofmind/c4_21-to-40_neox_with_eos_8k", split="train") + d2 = load_dataset("conceptofmind/c4_41-to-60_neox_with_eos_8k", split="train") + d3 = load_dataset("conceptofmind/c4_61-to-80_neox_with_eos_8k", split="train") + d4 = load_dataset("conceptofmind/c4_81-to-100_neox_with_eos_8k", split="train") + + train_dataset = concatenate_datasets([d0, d1, d2, d3, d4]) + else: + train_dataset = build_dataloaders() + + train_loader = DataLoader( + train_dataset, batch_size=CFG.BATCH_SIZE, collate_fn=default_data_collator, + ) + + # optimizer + + # optim = decoupled_optimizer( + # model, + # learning_rate=CFG.LEARNING_RATE, + # weight_decay=CFG.WEIGHT_DECAY, + # beta_1=0.9, + # beta_2=0.95, + # use_adamw=False, + # ) + + # Determine number of training steps + + max_train_steps = math.ceil(len(train_loader) / CFG.GRADIENT_ACCUMULATE_EVERY) + accelerator.print(f"Max train steps: {max_train_steps}") + + # lr scheduler + # We cant decide on an actual number + + NUM_WARMUP_STEPS = int(max_train_steps * 0.01) + accelerator.print(f"Num warmup steps: {NUM_WARMUP_STEPS}") + + lr_scheduler = get_lr_scheduler_with_warmup( + optimizer=optim, + scheduler_type="cosine", + num_warmup_steps=NUM_WARMUP_STEPS, + max_train_steps=max_train_steps, + grad_accumulate_every=CFG.GRADIENT_ACCUMULATE_EVERY + ) + + # prepare + + model, optim, train_loader, lr_scheduler = accelerator.prepare( + model, optim, train_loader, 
lr_scheduler + ) + + # checkpoint scheduler + + accelerator.register_for_checkpointing(lr_scheduler) + + # I do not know why Huggingface recommends recalculation of max_train_steps + + max_train_steps = math.ceil(len(train_loader) / CFG.GRADIENT_ACCUMULATE_EVERY) + accelerator.print(f"Max train steps recalculated: {max_train_steps}") + + # Total batch size for logging + + total_batch_size = ( + CFG.BATCH_SIZE * accelerator.num_processes * CFG.GRADIENT_ACCUMULATE_EVERY + ) + accelerator.print(f"Total batch size: {total_batch_size}") + + # resume training + + progress_bar = tqdm( + range(max_train_steps), disable=not accelerator.is_local_main_process + ) + completed_steps = 0 + + if CFG.RESUME_FROM_CHECKPOINT: + if CFG.RESUME_FROM_CHECKPOINT is not None or CFG.RESUME_FROM_CHECKPOINT != "": + accelerator.print(f"Resuming from checkpoint {CFG.RESUME_FROM_CHECKPOINT}") + accelerator.load_state(CFG.RESUME_FROM_CHECKPOINT) + path = os.path.basename(CFG.RESUME_FROM_CHECKPOINT) + + training_difference = os.path.splitext(path)[0] + + # need to multiply `gradient_accumulation_steps` to reflect real steps + resume_step = ( + int(training_difference.replace("step_", "")) + * CFG.GRADIENT_ACCUMULATE_EVERY + ) + + if CFG.RESUME_FROM_CHECKPOINT and resume_step is not None: + train_loader = accelerator.skip_first_batches(train_loader, resume_step) + completed_steps += resume_step + progress_bar.update(resume_step) + + # training + + model.train() + + for step, batch in enumerate(train_loader): + with accelerator.accumulate(model): + inputs = batch["input_ids"].to(accelerator.device) + _, loss = model(inputs, return_loss=True) + accelerator.backward(loss) + + # print(loss.item()) + + accelerator.log({"loss": loss.item()}, step=step) + + if accelerator.sync_gradients: + accelerator.clip_grad_norm_(model.parameters(), 0.5) + + optim.step() + lr_scheduler.step() + optim.zero_grad() + + if accelerator.sync_gradients: + progress_bar.update(1) + completed_steps += 1 + + if isinstance(CFG.CHECKPOINTING_STEPS, int): + if completed_steps % CFG.CHECKPOINTING_STEPS == 0: + output_dir = f"step_{completed_steps }" + if CFG.OUTPUT_DIR is not None: + output_dir = os.path.join(CFG.OUTPUT_DIR, output_dir) + accelerator.save_state(output_dir) + + if completed_steps >= max_train_steps: + break + + # end training + + accelerator.print("Training Finished") + accelerator.end_training() + + # save final model + + # accelerator.print(f"Saving model to {CFG.OUTPUT_DIR}") + if CFG.OUTPUT_DIR is not None: + base_path = f'{CFG.OUTPUT_DIR}/final' + + if not os.path.exists(base_path): + os.makedirs(base_path) + + accelerator.wait_for_everyone() + unwrapped_model = accelerator.unwrap_model(model) + with accelerator.main_process_first(): + accelerator.save( + unwrapped_model.state_dict(), os.path.join(base_path, 'final_model.pt') + ) + +if __name__ == "__main__": + TrainAndromeda() \ No newline at end of file diff --git a/Andromeda/train.py b/Andromeda/train.py index e533aa3a016634f42d7d7f51bcb6d3aeb1f05643..214c031b3c342b24a58ed2a4f771491acc41488b 100644 --- a/Andromeda/train.py +++ b/Andromeda/train.py @@ -6,50 +6,45 @@ from functools import partial from itertools import chain import torch -# import bitsandbytes as bnb -from torch.distributed.fsdp import ( - FullyShardedDataParallel, - MixedPrecision, - BackwardPrefetch, - ShardingStrategy, -) +########### SETUP CONFIG +import torch.distributed as dist from accelerate import Accelerator -from accelerate.utils import (DummyOptim, InitProcessGroupKwargs) from accelerate.logging import 
get_logger - - +from accelerate.state import AcceleratorState +from accelerate.utils import DummyOptim, InitProcessGroupKwargs from datasets import load_dataset from lion_pytorch import Lion -from torch.nn import LayerNorm - - from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import ( - CheckpointImpl, apply_activation_checkpointing, checkpoint_wrapper) -from torch.distributed.fsdp.wrap import ( - transformer_auto_wrap_policy + CheckpointImpl, + apply_activation_checkpointing, + checkpoint_wrapper, ) - +# import bitsandbytes as bnb +from torch.distributed.fsdp import ( + BackwardPrefetch, + FullyShardedDataParallel, + MixedPrecision, + ShardingStrategy, +) +from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy +from torch.nn import LayerNorm from torch.optim import AdamW from torch.utils.data import DataLoader from tqdm import tqdm -from transformers import (AutoTokenizer, default_data_collator, - get_cosine_schedule_with_warmup, - get_linear_schedule_with_warmup, set_seed) - +from transformers import ( + AutoTokenizer, + default_data_collator, + get_cosine_schedule_with_warmup, + get_linear_schedule_with_warmup, + set_seed, +) -from Andromeda.utils.stable_adamw import StableAdamWUnfused -from Andromeda.core.transformer import Transformer, AndromedaEmbedding # from Andromeda.model import Andromeda -from Andromeda.model import AndromedaEmbedding #, Andromeda from Andromeda.configs import Andromeda1Billion - -########### SETUP CONFIG -import torch.distributed as dist - - -from accelerate.state import AcceleratorState +from Andromeda.core.transformer import Transformer +from Andromeda.utils.stable_adamw import StableAdamWUnfused # state = AcceleratorState() @@ -686,7 +681,7 @@ def Train(): ) -def main(): +def train(): os.environ['MASTER_ADDR'] #'localhost' os.environ['MASTER_PORT'] #= '9994' @@ -702,4 +697,4 @@ def main(): Train() if __name__ == '__main__': - main() \ No newline at end of file + train() \ No newline at end of file diff --git a/Andromeda/utils/__init__.py b/Andromeda/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Andromeda/utils/decoupled_optimizer.py b/Andromeda/utils/decoupled_optimizer.py new file mode 100644 index 0000000000000000000000000000000000000000..e7291575a34e0928ed6a961a3b5b10240692e0a6 --- /dev/null +++ b/Andromeda/utils/decoupled_optimizer.py @@ -0,0 +1,147 @@ +import torch +# from palm_rlhf_pytorch.palm import LayerNorm +from torch.nn import LayerNorm +from torch.optim import AdamW + +# from palm.utils import print_main +from Andromeda.utils.helpers import print_main +from Andromeda.utils.stable_adamw import StableAdamWUnfused + +# optimizers + + +def decoupled_optimizer( + model: torch.nn.Module, + learning_rate: float, + weight_decay: float = 0.1, + beta_1: float = 0.90, + beta_2: float = 0.95, + optimizer_type: str = "adamw", + use_fsdp: bool = True, +): + """ + Decouples the optimizer from the training process. + + This function sets up the optimizer for the model by creating two groups of parameters: + one for weight decay and one without weight decay. Then, it initializes the optimizer + with these two groups of parameters. + + Args: + model (Module): The model whose parameters are optimized. + learning_rate (float): The learning rate for the optimizer. + weight_decay (float): The weight decay for the optimizer. + beta_1 (float): The exponential decay rate for the 1st moment estimates. 
+ beta_2 (float): The exponential decay rate for the 2nd moment estimates. + optimizer_type (str): The type of the optimizer. Can be 'lion', 'adamw', or 'stable_adamw'. + use_fsdp (bool, optional): If True, the optimizer will work with fully sharded data parallelism. Defaults to True. + accelerator (Accelerator, optional): The accelerator from HuggingFace's Accelerate library. Defaults to None. + + Returns: + Optimizer: The initialized optimizer. + + Raises: + ValueError: If the optimizer type is not 'lion', 'adamw' or 'stable_adamw'. + """ + print_main(f"Using {optimizer_type} optimizer") + # Create an empty dictionary called param_dict to store the model's named parameters. + param_dict = {} + # Iterate over the model's named parameters and populate the param_dict with key-value pairs. + for param_name, param in model.named_parameters(): + print_main(param_name) + param_dict[param_name] = param + + # Separate the model's named modules into two groups: decay and no_decay. + + # Create an empty list to store the names of the LayerNorm and Embedding layer weights with no weight decay. + no_decay = [] + + if use_fsdp: + exclude_module = "_fsdp_wrapped_module.token_emb" + else: + exclude_module = "token_emb" + + # Iterate through the named modules of the model. + for module_name, module in model.named_modules(): + # Check if the current module is an instance of any of the desired types (LayerNorm or torch.nn.Embedding). + for ndim in [LayerNorm, torch.nn.Embedding]: + if isinstance(module, ndim): + # If torch.nn.Embedding, append its name with a ".weight" suffix to the no_decay list. + if module_name == exclude_module: + no_decay.append(f"{module_name}.weight") + else: + # If the module is an instance of LayerNorm + no_decay.append(f"{module_name}.gamma") + # Exit the inner loop since the desired module has been found. + break + + # Create an empty list to store the names of the Linear layer weights with weight decay. + decay = [] + + # Iterate through the named modules of the model. + for module_name, module in model.named_modules(): + # Check if the current module is an instance of the desired type (torch.nn.Linear). + for ndim in [torch.nn.Linear]: + if isinstance(module, ndim): + # If the module is an instance of torch.nn.Linear, append its name with a ".weight" suffix to the decay list. + decay.append(f"{module_name}.weight") + # Exit the inner loop since the desired module has been found. + break + + # Create two separate lists of model parameters: decay_param and no_decay_param. + # The decay_param list contains the parameters that should have weight decay applied. + # The no_decay_param list contains the parameters that should not have weight decay applied, excluding the 'to_logits.weight' parameter. + + # Create an empty list called decay_param to store the parameters with weight decay. + decay_param = [] + + if use_fsdp: + exclude_param = "_fsdp_wrapped_module.to_logits.weight" + else: + exclude_param = "to_logits.weight" + + # Iterate over the decay list, which contains the names of the parameters with weight decay. + for param in decay: + # Check if the current parameter is not 'to_logits.weight'. + # Append the corresponding parameter from param_dict to the decay_param list. + + if param != exclude_param: + decay_param.append(param_dict[param]) + + # Create an empty list called no_decay_param to store the parameters without weight decay. + no_decay_param = [] + + # Iterate over the no_decay list, which contains the names of the parameters without weight decay. 
+ for param in no_decay: + # Append the corresponding parameter from param_dict to the no_decay_param list. + no_decay_param.append(param_dict[param]) + + # Create a list called grouped_params that contains two dictionaries. + # The first dictionary has the decay_param list and the corresponding weight_decay value. + # The second dictionary has the no_decay_param list and a weight_decay value of 0.0. + grouped_params = [ + {"params": decay_param, "weight_decay": weight_decay}, + {"params": no_decay_param, "weight_decay": 0.0}, + ] + + # Create a variable called optimizer that stores an instance of the optimizer. + if optimizer_type == "adamw": + optimizer = AdamW( + grouped_params, + lr=learning_rate, + betas=(beta_1, beta_2), + ) + elif optimizer_type == "stable_adamw": + optimizer = StableAdamWUnfused( + grouped_params, + lr=learning_rate, + betas=(beta_1, beta_2), + ) + else: + raise ValueError( + "Invalid optimizer_type. Expected 'lion', 'adamw', 'deepspeed' or 'stable_adamw', got: {}".format( + optimizer_type + ) + ) + + # Return the optimizer. + return optimizer \ No newline at end of file diff --git a/Andromeda/utils/helpers.py b/Andromeda/utils/helpers.py new file mode 100644 index 0000000000000000000000000000000000000000..4516dae17a64708db57f053a159ce0a88ddfad02 --- /dev/null +++ b/Andromeda/utils/helpers.py @@ -0,0 +1,17 @@ +import torch.distributed as dist # Add this line + +def print_num_params(model): + n_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + + if dist.is_available(): + if dist.get_rank() == 0: + print(f"Number of parameters in model: {n_params}") + else: + print(f"Number of parameters in model: {n_params}") + +def print_main(msg): + if dist.is_available(): + if dist.get_rank() == 0: + print(msg) + else: + print(msg) \ No newline at end of file diff --git a/Andromeda/utils/rf_utils.py b/Andromeda/utils/rf_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..f3acb9feef3e69546eee8d7ce82b78804b7622c4 --- /dev/null +++ b/Andromeda/utils/rf_utils.py @@ -0,0 +1,186 @@ +import math +import torch +from torch import einsum, _nnpack_available +import torch.nn.functional as F +from torch import nn +from einops import rearrange +import copy +from pathlib import PurePath +from tqdm import tqdm_gui +from beartype import beartype +from beartype.typing import Tuple, Optional + +from einops import rearrange, repeat, reduce, unpack +from einops.layers.torch import Rearrange, Reduce + + +#helpers +def exists(val): + return val is not None + + +#decorators +def eval_decorator(fn): + def inner(self, *args, **kwargs): + was_training = self.training + self.eval() + out = fn(self, *args, **kwargs) + self.train(was_training) + return out + return inner + +def defaults(val, d): + return val if exists(val) else d + +#tensor helpers + +def log(t, eps=1e-20): + return torch.log(t.clamp(min = eps)) + +def masked_mean(seq, mask=None, dim=1, keepdim=True): + if not exists(mask): + return seq.mean(dim=dim) + + if seq.ndim == 3: + mask = rearrange(mask, 'b n -> b n 1') + + masked_seq = seq.masked_fill(~mask, 0.) + numer = masked_seq.sum(dim=dim, keepdim=keepdim) + denom = mask.sum(dim=dim, keepdim=keepdim) + + masked_mean = numer / denom.clamp(min = 1e-3) + masked_mean = masked_mean.masked_fill(denom == 0, 0.) 
+ return masked_mean
+
+
+#sampling helpers
+
+def gumbel_noise(t):
+    noise = torch.zeros_like(t).uniform_(0, 1)
+    return -log(-log(noise))
+
+
+def gumbel_sample(t, temperature = 1., dim=-1):
+    return ((t / max(temperature, 1e-10)) + gumbel_noise(t)).argmax(dim=dim)
+
+def top_p(logits, thres=0.9):
+    sorted_logits, sorted_indices = torch.sort(logits, descending=True)
+    cum_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
+
+    sorted_indices_to_remove = cum_probs > (1 - thres)
+    sorted_indices_to_remove[:, 1:] = sorted_indices_to_remove[:, :-1].clone()
+    sorted_indices_to_remove[:, 0] = 0
+
+    sorted_logits[sorted_indices_to_remove] = float("-inf")
+    return sorted_logits.scatter(1, sorted_indices, sorted_logits)
+
+def top_k(logits, thres=0.9):
+    k = math.ceil((1 - thres) * logits.shape[-1])
+    val, ind = torch.topk(logits, k)
+    probs = torch.full_like(logits, float('-inf'))
+    probs.scatter_(1, ind, val)
+    return probs
+
+
+class LoRA(nn.Module):
+    def __init__(
+        self,
+        dim,
+        dim_out,
+        r=8,
+        alpha=None
+    ):
+        super().__init__()
+        alpha = defaults(alpha, r)
+        self.scale = alpha / r
+
+        self.A = nn.Parameter(torch.randn(dim, r))
+        self.B = nn.Parameter(torch.zeros(r, dim_out))
+
+
+
+#reward model
+@beartype
+class RewardModel(nn.Module):
+    def __init__(
+        self,
+        model: 'Andromeda',  # forward reference; Andromeda is not imported in this module
+        dropout=0.1,
+        num_binned_output = 0.,
+        use_lora = True,
+        lora_r = 8,
+        reward_lora_scope = 'reward',
+    ):
+        super().__init__()
+
+        self.model = copy.deepcopy(model)
+        self.model.set_dropout(dropout)
+
+        self.reward_lora_scope = reward_lora_scope if use_lora else None
+
+        if exists(self.reward_lora_scope):
+            self.model.add_finetune_params(reward_lora_scope, lora_r = lora_r)
+
+        dim = model.dim
+
+        self.binned_output = num_binned_output > 1
+
+        self.prompt_embed = nn.Parameter(torch.zeros(1, 1, dim))
+        self.response_embed = nn.Parameter(torch.zeros(1, 1, dim))
+
+
+        if self.binned_output:
+            self.to_pred = nn.Linear(dim, num_binned_output)
+        else:
+            self.to_pred = nn.Sequential(
+                nn.Linear(dim, 1, bias=False),
+                Rearrange('... 1 -> ...')
+            )
+
+    def load(self, path):
+        # use pathlib.Path here; the module-level import only brings in PurePath
+        from pathlib import Path
+        path = Path(path)
+        assert path.exists()
+        self.load_state_dict(torch.load(str(path)))
+
+    def finetune_parameters(self):
+        return (
+            *self.to_pred.parameters(),
+            *(self.model.finetune_parameters(self.reward_lora_scope) if exists(self.reward_lora_scope) else self.model.parameters())
+        )
+
+
+    def forward(
+        self,
+        x,
+        mask=None,
+        prompt_mask=None,
+        prompt_lengths=None,
+        labels=None,
+        sample=False,
+        sample_temperature=1.,
+        disable_lora=False
+    ):
+        assert not (exists(prompt_mask) and exists(prompt_lengths))
+
+        #derive prompt mask from prompt lengths
+
+        if exists(prompt_lengths):
+            batch, seq_len = x.shape
+            arange = torch.arange(seq_len, device = x.device)
+            prompt_mask = repeat(arange, 'n -> b n', b = batch) < rearrange(prompt_lengths, 'b -> b 1')
+
+        #reward model should have an understanding of which section is prompt and which section is response
+
+        extra_embed = None
+
+        if exists(prompt_mask):
+            extra_embed = torch.where(
+                rearrange(prompt_mask, 'b n -> b n 1'),
+                self.prompt_embed,
+                self.response_embed
+            )
+
+        embeds = self.model(
+            x,
+        )
\ No newline at end of file
diff --git a/Andromeda/utils/stable_adamw.py b/Andromeda/utils/stable_adamw.py
new file mode 100644
index 0000000000000000000000000000000000000000..e8f63fd873d8663a355d8d63504bf738290ae03e
--- /dev/null
+++ b/Andromeda/utils/stable_adamw.py
@@ -0,0 +1,96 @@
+
+import torch
+
+# This is the unfused version of StableAdamW.
It is slower than the fused version (coming). + + +class StableAdamWUnfused(torch.optim.Optimizer): + def __init__( + self, + params, + lr=0.002, + weight_decay=0.2, + betas=(0.9, 0.99), + eps=1e-8, + clip_thresh=1.0, + precision="amp_bfloat16", + custom_scalar=65536, + ): + beta1, beta2 = betas[0], betas[1] + defaults = dict(lr=lr, weight_decay=weight_decay, beta1=beta1, beta2=beta2) + super(StableAdamWUnfused, self).__init__(params, defaults) + + self.eps = eps + self.d = clip_thresh + + # Set precision to "custom_fp16" if you want to use a fixed loss scalar, custom_scalar, which is divided out in the update step. + # If you do this, call (custom_scalar * loss).backward() instead of loss.backward(). + self.precision = precision + self.custom_scaler = custom_scalar + + for group in self.param_groups: + group["step"] = 1.0 + + print("Using StableAdamWUnfused-v1") + + def __setstate__(self, state): + super(StableAdamWUnfused, self).__setstate__(state) + + def step(self, closure=None): + if closure is not None: + closure() + + for group in self.param_groups: + lr = group["lr"] + weight_decay = group["weight_decay"] + beta1 = group["beta1"] + beta2 = group["beta2"] + step = group["step"] + + for p in group["params"]: + if p.grad is None: + continue + theta = p.data + param_state = self.state[p] + + if self.precision == "custom_fp16": + g = p.grad.data / self.custom_scaler + if torch.any(torch.isnan(g) | torch.isinf(g)): + continue + else: + g = p.grad.data + + if "exp_avg" not in param_state: + v = param_state["exp_avg"] = torch.zeros_like(theta) + u = param_state["exp_avg_sq"] = torch.zeros_like(theta) + else: + v = param_state["exp_avg"] + u = param_state["exp_avg_sq"] + + beta1hat = beta1 * (1 - beta1 ** (step - 1)) / (1 - beta1**step) + beta2hat = beta2 * (1 - beta2 ** (step - 1)) / (1 - beta2**step) + + v = v.mul_(beta1hat).add_(g, alpha=1.0 - beta1hat) + u = u.mul_(beta2hat).addcmul_(g, g, value=1.0 - beta2hat) + + denominator = u.sqrt().add_(self.eps) + + # StableAdamW = AdamW + update clipping (https://arxiv.org/abs/1804.04235) applied tensor-wise. + rms = ( + torch.div( + g.pow(2), torch.maximum(u, (self.eps**2) * torch.ones_like(u)) + ) + .mean() + .sqrt() + .item() + ) + + theta = theta.mul_(1.0 - lr * weight_decay).addcdiv_( + v, denominator, value=-lr * (1.0 / max(1.0, rms / self.d)) + ) + + # save current params + param_state["exp_avg"] = v + param_state["exp_avg_sq"] = u + + group["step"] = step + 1 \ No newline at end of file diff --git a/DOCs/Corporation/MONETIZATION.md b/DOCs/Corporation/MONETIZATION.md new file mode 100644 index 0000000000000000000000000000000000000000..a6fe94daf042aecb255aeafebe3b98a18d05c21c --- /dev/null +++ b/DOCs/Corporation/MONETIZATION.md @@ -0,0 +1,51 @@ +# Andromeda Product Brief and Monetization Strategy Document + +## Product Summary: + +Andromeda is an innovative language model designed for high performance and efficiency. It utilizes advanced techniques that allow it to process and learn from multiple sources and adapt in real-time. + +## Monetization Strategies: + +1. **Usage-based API:** Provide Andromeda as a paid API service where users pay based on the amount of computation they use. +2. **Consulting deals:** Offer expert consulting services to businesses looking to incorporate Andromeda's capabilities into their operations. +3. **Dedicated capacity:** Sell dedicated computational power to businesses for exclusive usage of Andromeda's capabilities. +4. 
**Licensing the technology:** Allow companies to license the Andromeda model for their proprietary use. +5. **Subscription models:** Provide access to Andromeda's capabilities on a subscription basis. +6. **Freemium model:** Offer basic usage of Andromeda for free, while charging for advanced features and capabilities. +7. **Partnerships:** Form strategic partnerships with tech companies that can leverage Andromeda's capabilities in their products and services. +8. **Sponsorships:** Sponsor research projects or tech events to get visibility and promote Andromeda's services. +9. **Training and certifications:** Offer training programs and certifications on Andromeda usage and applications. +10. **Custom development:** Offer custom development services for businesses that want specialized applications of Andromeda. + +## Potential Customers: + +1. **Tech companies:** Andromeda can be integrated into a wide array of tech products and services. +2. **Educational institutions:** Universities and research institutions can use Andromeda for research purposes. +3. **Government agencies:** Andromeda can assist in processing and analyzing large amounts of data. +4. **Healthcare providers:** Andromeda can be used in data analysis and decision making in healthcare. +5. **Media and entertainment industry:** Andromeda's language model can be used in content creation and curation. + +## Potential Cashflow Gains: + +1. **API usage revenues:** Charging per API call can generate substantial revenues with a high number of users. +2. **Subscription fees:** A tier-based subscription model can ensure a steady income stream. +3. **Licensing fees:** Companies willing to license the technology can provide a significant one-time or recurring revenue. +4. **Consulting fees:** Consulting services can yield high-value contracts. +5. **Sponsorship revenues:** Sponsoring events or projects can yield returns in the form of new business leads and customers. + +## Expenses: + +1. **Cloud infrastructure costs:** Major expense in maintaining and scaling the Andromeda model. +2. **Research and development:** Continual improvement of Andromeda requires ongoing investment. +3. **Marketing and sales:** Promoting Andromeda and closing sales deals will be a recurring expense. +4. **Operational costs:** Expenses related to managing the company, including salaries, office space, utilities, and more. +5. **Open-source contributors:** Andromeda is built on the contributions of numerous developers. Recognizing these contributors through a rewards program is an essential part of maintaining a healthy development ecosystem. + +### Open Source Contributors: + +The following is a representative list of contributors who have helped make Agora what it is today: + +1. Kye +2. Nicolo + +Each contributor brings unique expertise and value to the project, helping to shape Andromeda into a powerful, efficient, and intelligent language model that will revolutionize the NLP landscape. \ No newline at end of file diff --git a/DOCs/Design/Dyson.md b/DOCs/Design/Dyson.md new file mode 100644 index 0000000000000000000000000000000000000000..c924cd1bcda82ff4c8aa0f02e0d238ecd77b50ad --- /dev/null +++ b/DOCs/Design/Dyson.md @@ -0,0 +1,26 @@ +Insights and Techniques: + +1. Flops: The importance of considering the number of floating-point operations (FLOPs) when designing models. +2. Flash Attention 2.0: The use of techniques like Flash Attention 2.0 cuda to enable more FLOPs in the model. +3. 
Mixed Precision: Utilizing mixed precision training to improve training speed and memory efficiency. +4. Deepspeed 3 with NVMe: Using Deepspeed 3 with NVMe for optimizing training performance. +5. 8-bit Optimizer: Employing an 8-bit optimizer for further speed improvements. +6. Gradient Clipping: Adding gradient clipping to achieve massive speedup during training. +7. XPOS, ALIBI, QK Layernorm: Leveraging advanced techniques for extrapolation, interpolation, and training stabilization. +8. Multi Query Attention: Using multi-query attention to boost decoding speed. +9. Parallelized Transformer Blocks: Parallelizing transformer blocks to enhance overall model performance. +10. Positional Embeddings and Shifted Tokens: The decision to not use positional embeddings and utilization of shifted tokens for sequence length advancement. +11. Positional Interpolation: Incorporating positional interpolation for improved sequence handling. +12. Optimized CUDA Embedding Function: Utilizing an optimized CUDA embedding function for better performance. +13. Nebula Loss Function: Implementing the Nebula loss function, a polymorphic loss function for multi-task training. + +Possible Improvements: + +1. Clearer Metrics: To validate the model's claims, it would be beneficial to establish specific metrics for monitoring across training, especially regarding reasoning capabilities. +2. Validation and Testing Environment: Further development and description of the exhaustive testing environment to validate the model's performance and capabilities. +3. Comprehensive Documentation: Provide detailed documentation of the model's architecture, training methodology, and testing procedures to ensure transparency and replicability. +4. Benchmarking Against Competitors: Perform benchmarking against existing models to showcase the advantages and differentiation offered by the proposed architecture and training techniques. +5. Real-World Applications: Highlight potential real-world applications or use cases where the proposed model can provide superior performance compared to existing solutions. +6. Explainability and Interpretability: Consider incorporating methods for model explainability and interpretability, especially in applications where these aspects are crucial. +7. Addressing Specific Niche Needs: Identify specific niches or use cases where the model can excel and tailor marketing and development efforts accordingly. +8. Collaboration and Peer Review: Engage with the research community, participate in peer review, and seek collaboration opportunities to gain additional insights and validation. \ No newline at end of file diff --git a/DOCs/Design/MODEL_ARCHITECTURE.md b/DOCs/Design/MODEL_ARCHITECTURE.md new file mode 100644 index 0000000000000000000000000000000000000000..7b1499a88aa928e4c1483176fc57aebd08a23d6f --- /dev/null +++ b/DOCs/Design/MODEL_ARCHITECTURE.md @@ -0,0 +1,57 @@ + +### Alibi Positional Bias + +Alibi positional bias allows the model to learn relative positions between tokens, enabling it to better capture the relationships and dependencies between tokens in a sequence. + +Usage example: + +```python +attn_layers = Decoder( + ... + alibi_pos_bias=True, + alibi_num_heads=4, + ... +) +``` + +### Rotary Position Encodings (xpos) + +Rotary position encodings introduce a more efficient way to encode positions in the input sequence. They avoid the need for absolute positional embeddings, reducing the model's memory footprint and improving training speed. 
+
+Usage example:
+
+```python
+attn_layers = Decoder(
+    ...
+    rotary_xpos=True,
+    ...
+)
+```
+
+### Flash Attention
+
+Flash attention speeds up the self-attention mechanism by reducing the number of attention computations. It accelerates training and inference while maintaining a high level of performance.
+
+Usage example:
+
+```python
+attn_layers = Decoder(
+    ...
+    attn_flash=True,
+    ...
+)
+```
+
+### Deep Normalization (deepnorm)
+
+Deep normalization is a technique that normalizes the activations within a layer, helping with training stability and convergence. It allows the model to better learn complex patterns and generalize to unseen data.
+
+Usage example:
+
+```python
+attn_layers = Decoder(
+    ...
+    deepnorm=True,
+    ...
+)
+```
\ No newline at end of file
diff --git a/DOCs/Design/SPEED.md b/DOCs/Design/SPEED.md
new file mode 100644
index 0000000000000000000000000000000000000000..b154664db9ebfc1ed36ca666162ec39507f71dde
--- /dev/null
+++ b/DOCs/Design/SPEED.md
@@ -0,0 +1,11 @@
+# Increasing Speed
+
+* Integrate Flash Attention 2.0 CUDA for a significant speed up
+
+* Utilize the 8-bit optimizer from bitsandbytes (BNB) for a big speed up; weakness: BNB isn't compatible with all GPUs
+
+* Use a better tokenizer, e.g. TokenMonster?
+
+* Parallelize the transformer blocks, similar to [PaLM](https://github.com/conceptofmind/PaLM)
+
+* Look into MPT's config for LION for pretraining; did they use a high batch size?
\ No newline at end of file
diff --git a/DOCs/Design/Specs.md b/DOCs/Design/Specs.md
new file mode 100644
index 0000000000000000000000000000000000000000..086d4f2be9e69a0d4778eb826e67d7e081c9dd97
--- /dev/null
+++ b/DOCs/Design/Specs.md
@@ -0,0 +1,196 @@
+## **Andromeda Specs**: Unveiling Mastery
+
+**Overview**
+Elegantly marrying craftsmanship and technology, Andromeda is not just another step in AI evolution. It's a giant leap. Driven by precision, powered by innovation, and defined by excellence, Andromeda is the epitome of intelligence realized. Here, we detail the marvel that is Andromeda, in numbers, facts, and logic.
+
+---
+
+### **Specifications**
+
+| **Feature**                                   | **Specification**                             |
+|----------------------------------------------|-----------------------------------------------|
+| **Sequence Handling**                         | Ultra Long (32,000 - 200,000+ context lengths)|
+| **Processing Speed**                          | Ultra Fast (32,000+ tokens in < 100ms)        |
+| **Reasoning Abilities**                       | Creativity, Quantitative                      |
+| **Attention Mechanism**                       | Flash Attention 2.0 Triton                    |
+| **Memory Consumption** (compared to GPT-3)    | 100x Less                                     |
+| **Memory Consumption** (compared to LLAMA)    | 30x Less                                      |
+| **Max Sequence Processing Speed**             | 100,000+ sequences in < 300ms                 |
+| **Dataset Strategy**                          | Books, Falcon, Redpajama, Math, Code          |
+| **Functionality**                             | FSDP, HF Accelerate, Poetry Composition, API Calls, and more |
+
+---
+
+### **Benchmarks**
+**Speed**: At the heart of Andromeda's unparalleled capabilities is its raw speed. Leveraging the prowess of Flash Attention 2.0 Triton, it doesn't merely process data; it blazes through it. This power allows it to consume 100x less memory than GPT-3 and 30x less than LLAMA.
+
+---
+
+### **Why Andromeda?**
+- **Performance**: Andromeda isn't just about doing things faster; it's about doing them best. Reliable processing of sequences, even as extensive as 100,000+ lengths, is realized in the blink of an eye, under 300ms.
+
+- **Precision and Creativity**: The dataset strategy is no mere algorithm. 
It's a symphony, meticulously crafted to offer both creativity and quantitative reasoning. + +- **Versatility**: Andromeda doesn't just compute; it contemplates. Whether you need the flair of a poet or the precision of an API call, Andromeda delivers, seamlessly. + +--- + +### **Andromeda Principles** +- **Efficiency**: It's not just about doing more; it's about doing better. Techniques like attention flashing, rotary position encodings, and deep normalization ensure every cycle, every operation, every byte is optimized for performance. + +- **Flexibility**: In the ever-evolving world of technology, adaptability is king. Andromeda is designed to mold, adapt, and excel, irrespective of the task or domain. + +- **Scalability**: Grow with you, for you. Andromeda isn't static. It's dynamic, designed to scale, accommodating growing resources and expanding data sizes. + +- **Community-Driven**: Behind Andromeda's machine brain is the human heart of the community. It doesn't just utilize open source; it thrives on it, constantly evolving, learning, and improving with contributions from around the world. + + +For enthusiasts, developers, and thinkers looking to dive deeper, the Model Architecture documentation offers an exhaustive, detailed view into the intricate marvel that is Andromeda. Dive in, and witness engineering and artistry in harmony. + +--- + +### **Andromeda: A Detailed Technical Overview** + +At the intersection of technological ingenuity and groundbreaking design principles, Andromeda emerges. Representing the zenith of years of research and development, it promises a transformative leap in AI performance, efficiency, and versatility. In this technical specifications document, we deconstruct the intricacies of Andromeda, presenting a meticulous overview of its structure, performance metrics, and underlying methodologies. + +## **Feature Insights** + +### **Alibi Positional Bias** +Empowering Andromeda to discern relative positions between tokens, this feature accentuates its ability to grasp intricate relationships within a sequence. + +### **Rotary Position Encodings (xpos)** +This is a revolutionary means of encoding positions, shrinking the model's memory demands and propelling training speeds. + +### **Flash Attention** +This is the linchpin of Andromeda's speed prowess, minimizing attention computations, thus boosting training and inference phases. + +### **Deep Normalization (deepnorm)** +By normalizing activations, deep normalization shores up training stability, allowing Andromeda to identify intricate patterns with finesse. + +## **Feature Insights (Contd.)** + +### **Attn One KV Head (Multiquery Attention)** +A breakthrough in attention mechanism design, this feature allows for simultaneous computation of multiple queries against the same set of key-values, fostering speed and efficiency. + +### **QK Norm & Attention QK Norm** +These two features introduce a normalization step in the query and key matrices. This step facilitates stabilization in the attention mechanism, rendering it more robust and enabling it to scale with larger input sizes. + +### **Attention QK Norm Dimension Scale** +A sophisticated adjustment to the attention mechanism, it modulates the normalization scale in accordance to the dimensions of the model. The result is a more adaptive and responsive attention framework. + +### **Embedding Provider** +At the foundation of Andromeda, this module facilitates the embedding process, converting token sequences into dense vectors. 
Tailored for Andromeda, it ensures rapid and efficient embedding processes. + +--- + +## **Deeper Dive: Model Parameters** + +Unpacking Andromeda means diving deep into the parameters that shape its capabilities. Here's a granular view: + +| **Parameter** | **Description** | **Default Value** | +|-----------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------| +| **num_tokens** | Total number of tokens in the vocabulary. | 50432 | +| **max_seq_len** | Maximum sequence length the model can process. | 8192 | +| **dim** | Dimension size of the model. It represents the size of embeddings and general depth in neural layers. | 2560 | +| **depth** | Represents the number of transformer layers in the architecture. | 32 | +| **dim_head** | Dimension size of each head in multi-head attention mechanism. | 128 | +| **heads** | Total number of heads in multi-head attention. | 24 | +| **use_abs_pos_emb** | Boolean flag to determine if absolute positional embeddings are used. | False | +| **alibi_pos_bias** | Enables the alibi positional bias in attention mechanisms. | True | +| **alibi_num_heads** | Specifies the number of heads for the alibi positional bias. | 12 | +| **rotary_xpos** | Determines if rotary positional encodings are utilized. | True | +| **attn_flash** | Flag to activate the Flash Attention mechanism, minimizing computations in the attention phase. | True | +| **shift_tokens** | The number of tokens by which input sequences are shifted. Essential for certain sequence-to-sequence tasks. | 1 | +| **attn_one_kv_head** | Activates multiquery attention by computing multiple queries against a singular key-value pair. | True | +| **qk_norm** | Enables the query-key normalization mechanism in the attention phase. | True | +| **attn_qk_norm** | A more advanced version of query-key normalization that scales according to the model's dimensions. | True | +| **attn_qk_norm_dim_scale** | Modulates the scale of the aforementioned attention normalization based on the model's dimensionality. | True | +| **embedding_provider** | The module responsible for providing embeddings. Custom providers can be passed for tailored embedding processes. | AndromedaEmbedding| + +--- + + +## **Insights and Techniques** + +#### **1. Floating-Point Operations (FLOPs)** +Considering the number of FLOPs is paramount. It provides a metric to gauge the computational intensity and, by extension, the potential speed of the model. + +#### **2. Flash Attention 2.0 Triton** +Enhanced with CUDA, this method offers a significant surge in the number of FLOPs the model can handle, amplifying its overall efficiency. + +#### **3. Mixed Precision Training** +By embracing mixed precision, Andromeda realizes a noteworthy uptick in training speed while achieving commendable memory efficiency. + +#### **4. Deepspeed 3 with NVMe Integration** +This powerful combination paves the way for superlative optimization during the training phase. + +#### **5. 8-bit Optimizer** +Further pushing the boundaries of speed, the 8-bit optimizer boosts processing times without compromising the integrity of results. + +#### **6. Gradient Clipping** +This technique has been integrated into the training regimen, achieving a massive speedup and preventing undesirable spikes during the process. + +#### **7. 
Advanced Techniques: XPOS, ALIBI, QK Layernorm** +These sophisticated techniques are harnessed for superior extrapolation, interpolation, and stabilization during training. + +#### **8. Multi Query Attention** +This approach has been adopted to supercharge decoding speeds. + +#### **9. Parallelized Transformer Blocks** +Ensuring that the model's performance is consistently high, these blocks run in tandem to provide a smooth and efficient operational experience. + +#### **10. Shifted Tokens** +In a strategic move, Andromeda sidesteps traditional positional embeddings, relying instead on shifted tokens for sequence length progression. + +#### **11. Positional Interpolation** +This innovative technique augments the model's ability to manage sequences more effectively. + +#### **12. Optimized CUDA Embedding Function** +This function is tailored for peak performance, ensuring rapid and accurate computations. + +#### **13. Nebula Loss Function** +Integrated into Andromeda, this polymorphic loss function is adept at handling multi-task training scenarios. + +## **A Word on Optimization and Future Iterations** + +As with any state-of-the-art model, Andromeda's design is an ever-evolving tapestry. This means iterative refinement. As feedback streams in and technology progresses, expect advancements in: + +- **Model Pruning**: Trimming redundancies, bolstering efficiency. +- **Knowledge Distillation**: Harnessing the wisdom of larger models in smaller, more agile architectures. +- **Zero-Shot and Few-Shot Learning**: Broadening adaptability horizons. +- **Enhanced Data Augmentation**: Fortifying the model's grasp on varied, nuanced contexts. +- **Decentralized Training**: Tapping into the global hive-mind, harnessing the collaborative power of the community. + + +## **Potential Other Future Trajectories** + +#### **1. Clearer Metrics** +There's always room to elevate the benchmarking rigor, especially concerning reasoning abilities. + +#### **2. Robust Validation and Testing Environment** +Further fine-tuning of the testing environment can offer even more reliable validations of Andromeda's capabilities. + +#### **3. Comprehensive Documentation** +To bolster transparency and replicability, detailed documentation covering every facet of Andromeda is on the horizon. + +#### **4. Benchmarking Against Peers** +By juxtaposing Andromeda against its counterparts, its distinctive advantages can be spotlighted more effectively. + +#### **5. Spotlight on Real-World Applications** +By highlighting tangible use-cases, the versatility and prowess of Andromeda can be showcased in palpable contexts. + +#### **6. Model Interpretability** +Future iterations might delve deeper into model interpretability, especially for critical applications. + +#### **7. Niche Customizations** +By tailoring Andromeda to meet specific niche needs, its adaptability and value proposition can be further enhanced. + +#### **8. Collaborative Endeavors** +Engaging more intimately with the global research community could spawn collaborative projects, bringing diverse insights to the fore. + + +As we voyage further into the AI frontier, Andromeda stands as a beacon, illuminating the path forward, promising marvels yet to come. It's not just about machine intelligence; it's about the dance between human curiosity and machine capability. + +--- + +Join us on this journey. Dive deeper, ask questions, innovate, and let's redefine what's possible, together. 
\ No newline at end of file diff --git a/DOCs/Docs/DOCUMENTATION.md b/DOCs/Docs/DOCUMENTATION.md new file mode 100644 index 0000000000000000000000000000000000000000..2917356be297f29713234b1efbeb2ee8cfc00e23 --- /dev/null +++ b/DOCs/Docs/DOCUMENTATION.md @@ -0,0 +1,145 @@ +# Documentation + +## `DatasetBuilder` + +### DatasetBuilder + +DatasetBuilder provides a convenient way to build datasets for training the Andromeda model. + +#### Constructor + +```python +def __init__( + self, + dataset_name, + seq_len=8192, + num_cpu=None, + hf_account_repo=None, + tokenizer="EleutherAI/gpt-neox-20b", +) +``` + +Initialize the DatasetBuilder. + +**Args:** + +- `dataset_name` (str): Name of the dataset to process. +- `seq_len` (int): Maximum sequence length. +- `num_cpu` (int, optional): Number of CPU cores to use for multiprocessing. Defaults to None. +- `hf_account_repo` (str, optional): Hugging Face account name and repository to push the processed dataset. Defaults to None. +- `tokenizer` (str, optional): Tokenizer model to use. Defaults to "EleutherAI/gpt-neox-20b". + +#### Methods + +##### build_dataset + +```python +def build_dataset(self) -> torch.utils.data.Dataset +``` + +Build and process the dataset. + +**Returns:** + +- `torch.utils.data.Dataset`: The processed dataset ready for training. + + + +## AndromedaTokenizer + +### Purpose + +The `AndromedaTokenizer` class provides tokenization functionality using the Hugging Face tokenizer. It allows you to tokenize texts using the specified tokenizer model. + +### Systems Understanding + +The `AndromedaTokenizer` class initializes a tokenizer model from the Hugging Face library. It uses the `AutoTokenizer.from_pretrained` method to load the tokenizer model with specific parameters such as the EOS token, pad token, extra IDs, and model maximum length. The `tokenize_texts` method tokenizes input texts using the tokenizer model and returns the tokenized input IDs. + +### Usage Example + +```python +from Andromeda import AndromedaTokenizer + +# Initialize the tokenizer +tokenizer = AndromedaTokenizer() + +# Tokenize texts +texts = ["This is an example sentence.", "Another example sentence."] +tokenized_ids = tokenizer.tokenize_texts(texts) + +print(tokenized_ids) +``` + +## Andromeda + +### Purpose + +The `Andromeda` class is a transformer-based model architecture. It consists of a `Transformer` and `AutoregressiveWrapper` with default or user-specified parameters. + +### Systems Understanding + +The `Andromeda` class initializes with a `Transformer` and `AutoregressiveWrapper`. The `Transformer` encapsulates the main transformer model, and the `AutoregressiveWrapper` enables autoregressive generation using the transformer model. + +The constructor of the `Andromeda` class takes various parameters that define the architecture of the model, such as the number of tokens, maximum sequence length, model dimension, depth, number of heads, etc. These parameters are used to initialize the `Transformer` and `AutoregressiveWrapper` with the specified configuration. + +The `forward` method performs a forward pass through the model. It takes the input `text_tokens` as input and passes it through the `Decoder` module inside the `Andromeda` model. The output from the decoder is returned as the result. 
+ +### Usage Example + +```python +from Andromeda import Andromeda + +# Create an instance of the Andromeda model +model = Andromeda() + +# Define the input text tokens +text_tokens = [1, 2, 3, 4, 5] # Example input tokens + +# Perform a forward pass through the model +output = model.forward(text_tokens) + +print(output) +``` + +### Constructor + +```python +def __init__(self, num_tokens=50304, max_seq_len=8192, dim=2560, depth=32, dim_head=128, heads=24, use_abs_pos_emb=False, alibi_pos_bias=True, alibi_num_heads=12, rotary_xpos=True, attn_flash=True, deepnorm=True, shift_tokens=1, attn_one_kv_head=True, qk_norm=True, attn_qk_norm=True, attn_qk_norm_dim_scale=True, embedding_provider=AndromedaEmbedding()) +``` + +- `num_tokens` (optional): Number of tokens in the vocabulary. +- `max_seq_len` (optional): Maximum sequence length. +- `dim` (optional): Dimension of the model. +- `depth` (optional): Depth of the model. +- `dim_head` (optional): Dimension of the model head. +- `heads` (optional): Number of heads. +- `use_abs_pos_emb` (optional): Whether to use absolute position embedding. +- `alibi_pos_bias` (optional): Alibi position bias. +- `alibi_num_heads` (optional): Number of alibi heads. +- `rotary_xpos` (optional): Rotary position. +- `attn_flash` (optional): Attention flash. +- `deepnorm` (optional): Deep normalization. +- `shift_tokens` (optional): Number of tokens to shift. +- `attn_one_kv_head` (optional): Attention one key/value head. +- `qk_norm` (optional): Query-key normalization. +- `attn_qk_norm` (optional): Attention query-key normalization. +- `attn_qk_norm_dim_scale` (optional): Attention query-key normalization dimension scale. +- `embedding_provider` (optional): Embedding provider module. + +### Methods + +- `forward(text_tokens, **kwargs)`: Performs a forward pass through the model. + - `text_tokens` (required): Input tokens. + - `kwargs` (optional): Other arguments. + +### Args + +- `text_tokens` (list): Input tokens. + +### Returns + +- Output from the decoder module. + +## Conclusion + +The Andromeda module provides a transformer-based model architecture for text generation. The `AndromedaTokenizer` class allows you to tokenize texts using the specified tokenizer model. The `Andromeda` class initializes with a transformer and autoregressive wrapper, providing the functionality for text generation. By using the provided classes and methods, you can generate text using the Andromeda model. \ No newline at end of file diff --git a/DOCs/Docs/TRAINING.md b/DOCs/Docs/TRAINING.md new file mode 100644 index 0000000000000000000000000000000000000000..cf38f61c994212f138812261336cdf625cedcde0 --- /dev/null +++ b/DOCs/Docs/TRAINING.md @@ -0,0 +1,82 @@ +# Andromeda Model Training Standard Operating Procedure + +This document provides instructions on how to train the Andromeda model end-to-end using the provided code. The training procedure consists of three main scripts: `build_dataset.py`, `model.py`, and `train_distributed.py`. Follow the steps below to train the Andromeda model. + +## Prerequisites + +Before starting the training process, ensure that you have the following requirements: + +- Python 3.7 or higher +- PyTorch 1.9 or higher +- Transformers library +- Datasets library +- Accelerate library +- Wandb library (optional, for logging) + +## Step 1: Building the Dataset + +The first step is to build the dataset required for training. The `build_dataset.py` script processes the training data and prepares it for training. 
Follow the instructions below to build the dataset:
+
+1. Open the `build_dataset.py` script.
+2. Set the configuration parameters in the `CFG` class according to your requirements:
+   - `HF_ACCOUNT_REPO`: Set to your Hugging Face account name and repository (in the form `account/repo`) to which the processed dataset will be pushed.
+   - `TOKENIZER`: Choose the tokenizer model to use (e.g., "EleutherAI/gpt-neox-20b").
+   - `DATASET_NAME`: Choose the dataset to process (e.g., "tiiuae/falcon-refinedweb").
+   - `SEQ_LEN`: Set the desired sequence length.
+3. Save the changes to the script.
+4. Open a terminal or command prompt and navigate to the directory containing the `build_dataset.py` script.
+5. Run the following command to execute the script:
+   ```
+   python build_dataset.py
+   ```
+6. The script will process the dataset and push it to your Hugging Face account repository specified by `HF_ACCOUNT_REPO`.
+
+## Step 2: Defining the Andromeda Model
+
+The second step is to define the Andromeda model architecture. The `model.py` script contains the model definition and configuration. Follow the instructions below to configure the Andromeda model:
+
+1. Open the `model.py` script.
+2. Set the configuration parameters in the `AndromedaTokenizer` and `Andromeda` classes according to your requirements:
+   - `tokenizer`: Configure the tokenizer with the desired parameters.
+   - `Andromeda`: Configure the Andromeda model with the desired architecture.
+3. Save the changes to the script.
+
+## Step 3: Training the Andromeda Model
+
+The final step is to train the Andromeda model using the `train_distributed.py` script. Follow the instructions below to start the training process:
+
+1. Open the `train_distributed.py` script.
+2. Set the configuration parameters in the `TrainAndromeda.CFG` class according to your requirements (a reference sketch of this class follows this list):
+   - `BATCH_SIZE`: Set the batch size for training.
+   - `GRADIENT_ACCUMULATE_EVERY`: Set the number of gradient accumulation steps.
+   - `LEARNING_RATE`: Set the learning rate for the optimizer.
+   - `WEIGHT_DECAY`: Set the weight decay for the optimizer.
+   - `SEQ_LEN`: Set the desired sequence length.
+   - `USE_DEEPSPEED`: Set to `True` if using DeepSpeed for optimization.
+   - `USE_FSDP`: Set to `True` if using Fully Sharded Data Parallelism.
+   - `USE_PRETOKENIZED`: Set to `True` if using a pre-tokenized dataset.
+   - `USE_ACTIVATION_CHECKPOINTING`: Set to `True` if using activation checkpointing.
+   - `RESUME_FROM_CHECKPOINT`: Set to the path of a checkpoint to resume training from.
+   - `CHECKPOINTING_STEPS`: Set the number of steps between checkpoints.
+   - `OUTPUT_DIR`: Set the output directory for saving the model checkpoints and logs.
+   - `ENTITY_NAME`: Set the Wandb entity name for logging (optional).
+3. Save the changes to the script.
+4. Open a terminal or command prompt and navigate to the directory containing the `train_distributed.py` script.
+5. Run the following command to start the training:
+   ```
+   python train_distributed.py
+   ```
+6. The script will train the Andromeda model using the specified configuration and dataset.
+7. During training, the progress will be displayed in the terminal, and logs will be saved to the specified output directory.
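+
+For reference, here is a minimal sketch of what the `TrainAndromeda.CFG` class from step 2 above might look like. The field names simply mirror the parameters listed in this document; the values shown are illustrative placeholders, not the defaults in the actual `train_distributed.py`.
+
+```python
+# Hypothetical sketch of the TrainAndromeda.CFG class; the names mirror the
+# parameter list above, and the values are placeholders, not recommended settings.
+class CFG:
+    BATCH_SIZE = 3
+    GRADIENT_ACCUMULATE_EVERY = 1
+    LEARNING_RATE = 3e-4
+    WEIGHT_DECAY = 0.1
+    SEQ_LEN = 8192
+    USE_DEEPSPEED = True
+    USE_FSDP = False
+    USE_PRETOKENIZED = True
+    USE_ACTIVATION_CHECKPOINTING = True
+    RESUME_FROM_CHECKPOINT = None   # or a path such as "checkpoints/step_1000"
+    CHECKPOINTING_STEPS = 1000
+    OUTPUT_DIR = "checkpoints/"
+    ENTITY_NAME = "your-wandb-entity"  # optional, for Weights & Biases logging
+```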
+
+# Other Training Methods
+
+First, configure Accelerate:
+
+`accelerate config`
+
+Then, to enable DeepSpeed ZeRO stage 3, launch with:
+
+`accelerate launch train_distributed_accelerate.py`
+
+
diff --git a/DOCs/Docs/Training/DATASET_STRATEGY.md b/DOCs/Docs/Training/DATASET_STRATEGY.md
new file mode 100644
index 0000000000000000000000000000000000000000..bbb00cce023752cc5bd6fd344b171da3b82a8a44
--- /dev/null
+++ b/DOCs/Docs/Training/DATASET_STRATEGY.md
@@ -0,0 +1,100 @@
+# Andromeda
+
+We should train 100M-, 500M-, and 1B-parameter versions with hyperparameters similar to these two comparable models:
+
+[conceptofmind's PaLM](https://github.com/conceptofmind/PaLM)
+
+| Model Size | Num Tokens | Dim  | Depth | Dim Head | Heads | Flash Attention | Learning Rate |
+|------------|------------|------|-------|----------|-------|-----------------|---------------|
+| 150 M      | 50304      | 768  | 12    | 128      | 8     | True            | 6e-4          |
+| 410 M      | 50304      | 1024 | 24    | 128      | 8     | True            | 3e-4          |
+| 1 B        | 50304      | 2048 | 16    | 128      | 8     | True            | 3e-4          |
+
+[MPT HF](https://huggingface.co/mosaicml/mpt-7b)
+
+| Hyperparameter  | Value |
+|-----------------|-------|
+| n_parameters    | 6.7B  |
+| n_layers        | 32    |
+| n_heads         | 32    |
+| d_model         | 4096  |
+| vocab size      | 50432 |
+| sequence length | 2048  |
+
+
+## Data prioritization: Prioritize datasets based on their relevance to the desired AI capabilities and the quality of the data.
+
+High priority: C4, openwebtext, super_glue, piqa, Falcon-40B (RefinedWeb-English, RefinedWeb-Europe, Books, Conversations, Code, Technical), glue, tiiuae/falcon-refinedweb, math_dataset
+
+Medium priority: bigcode/ta-prompt, bigcode/the-stack-dedup, OpenAssistant/oasst1, ehartford/wizard_vicuna_70k_unfiltered, tiiuae/falcon-refinedweb
+
+Low priority: timdettmers/openassistant-guanaco, JosephusCheung/GuanacoDataset, anon8231489123/ShareGPT_Vicuna_unfiltered, togethercomputer/RedPajama-Data, togethercomputer/RedPajama-Data-1T, Anthropic/hh-rlhf, databricks/databricks-dolly-15k, QingyiSi/Alpaca-CoT, alpaca, distillation, OpenAssistant/oasst1, dmayhem93/toolformer-v0-postprocessed, openai_humaneval, yahma/alpaca-cleaned
+
+## Data preprocessing: Clean, preprocess, and tokenize the datasets to ensure consistency and compatibility with the AI model.
+
+Remove duplicates, irrelevant content, and low-quality data.
+
+Tokenize the text using a suitable tokenizer, such as the GPT-NeoX tokenizer or potentially Falcon's tokenizer.
+
+Split the datasets into training, validation, and testing sets.
+
+
+## Training strategy: Train the AI model using the prioritized datasets in a multi-stage process.
+
+Stage 1: Pretrain the model on high-priority datasets (openwebtext, super_glue, piqa, Falcon-40B, glue) to build a strong language understanding foundation.
+
+Stage 2: Fine-tune the model on medium-priority datasets (bigcode/ta-prompt, bigcode/the-stack-dedup, OpenAssistant/oasst1, ehartford/wizard_vicuna_70k_unfiltered, tiiuae/falcon-refinedweb) to enhance its performance in specific domains and tasks.
+
+Stage 3: Further fine-tune the model on low-priority datasets (JosephusCheung/GuanacoDataset, anon8231489123/ShareGPT_Vicuna_unfiltered, togethercomputer/RedPajama-Data, togethercomputer/RedPajama-Data-1T, Anthropic/hh-rlhf, databricks/databricks-dolly-15k, QingyiSi/Alpaca-CoT) to capture any additional knowledge and nuances. Also consider PRM800K: A Process Supervision Dataset.
+
+
+## Evaluation and iteration: Continuously evaluate the model's performance on the validation and testing sets, and iterate the training process to improve its performance.
+
+Monitor the model's performance using relevant metrics, such as perplexity, F1 score, or BLEU score, depending on the task.
+
+Adjust hyperparameters, learning rate, and training duration as needed to optimize the model's performance.
+
+If necessary, revisit the data prioritization and preprocessing steps to refine the training data.
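+
+Since perplexity is the primary metric named above, a minimal sketch of how it can be derived from the validation loss is shown below. This is plain PyTorch under the assumption that the model returns next-token logits of shape (batch, seq_len, vocab); `model` and `val_loader` are placeholders, not objects from the training scripts in this repository.
+
+```python
+import math
+
+import torch
+import torch.nn.functional as F
+
+
+@torch.no_grad()
+def validation_perplexity(model, val_loader, device="cuda"):
+    """Average token-level cross-entropy over a validation set, reported as perplexity."""
+    model.eval()
+    total_loss, total_tokens = 0.0, 0
+    for batch in val_loader:            # each batch: LongTensor of token ids, shape (B, T)
+        batch = batch.to(device)
+        inputs, targets = batch[:, :-1], batch[:, 1:]
+        logits = model(inputs)          # assumed shape: (B, T - 1, vocab_size)
+        loss = F.cross_entropy(
+            logits.reshape(-1, logits.size(-1)),
+            targets.reshape(-1),
+            reduction="sum",
+        )
+        total_loss += loss.item()
+        total_tokens += targets.numel()
+    # Perplexity is the exponential of the mean negative log-likelihood per token.
+    return math.exp(total_loss / total_tokens)
+```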
+
+
+# Evaluations and Benchmarks:
+
+[Chain of thought hub](https://github.com/FranxYao/chain-of-thought-hub)
+
+SFT stands for Supervised Fine-Tuning and RLHF stands for Reinforcement Learning from Human Feedback. These are techniques used in natural language processing to improve the quality and accuracy of generated text. The statement suggests that if these techniques are applied correctly to the 65B LLaMA model, it is possible to recreate ChatGPT.
+
+
+# Analysis of Existing Models
+
+### MPT-7B
+
+| Data Source | Number of Tokens in Source | Proportion | Effective Number of Tokens | Epochs |
+|---|---|---|---|---|
+| mC4 3.1.0 - English | 417.99 B | 0.33 | 330 B | 0.14 |
+| C4 - English - SemDedup 80% | 100.42 B | 0.299 | 299 B | 2.98 |
+| RedPajama - CommonCrawl | 878.45 B | 0.1 | 100 B | 0.11 |
+| The Stack - Selected Languages | 463.78 B | 0.1 | 100 B | 0.22 |
+| RedPajama - Wikipedia - En | 4.87 B | 0.04 | 40 B | 8.21 |
+| The Stack - Markdown | 107.07 B | 0.035 | 35 B | 0.33 |
+| S2ORC | 48.85 B | 0.033 | 33 B | 0.68 |
+| RedPajama - Books | 26.02 B | 0.03 | 30 B | 1.15 |
+| RedPajama - arXiv | 28.10 B | 0.019 | 19 B | 0.68 |
+| RedPajama - StackExchange | 20.54 B | 0.014 | 14 B | 0.68 |
+
+### MPT-1B
+
+Training data: the model was trained for 200B tokens (batch size 2200, sequence length 2048). It was trained on the following data mix:
+
+| Data Source | Proportion |
+|---|---|
+| RedPajama Common Crawl | 67% |
+| C4 | 15% |
+| RedPajama GitHub | 4.5% |
+| RedPajama Wikipedia | 4.5% |
+| RedPajama Books | 4.5% |
+| RedPajama Arxiv | 2.5% |
+| RedPajama StackExchange | 2% |
+
+Each sample was chosen from one of the datasets, with the dataset selected with the probability specified above. The examples were shuffled within each dataset. Each example was constructed from as many sequences from that dataset as were necessary to fill the 2048 sequence length.
+
diff --git a/DOCs/Tests/BENCHMARKING.md b/DOCs/Tests/BENCHMARKING.md
new file mode 100644
index 0000000000000000000000000000000000000000..133053c5050f7faed40ae74421f8ce8a7d9dc1bf
--- /dev/null
+++ b/DOCs/Tests/BENCHMARKING.md
@@ -0,0 +1,111 @@
+# Andromeda Performance Benchmarking Analysis: Pre-Training Metrics
+
+Before initiating the pre-training phase, we need to ensure that every component of our model, Andromeda, is performing as expected. To do this, we'll create an extensive suite of metrics to monitor and evaluate. This will allow us to identify any bottlenecks, inefficiencies, or errors, and optimize the model accordingly.
+
+## Component-wise Metrics
+We focus on the transformer layer and the attention mechanism, key components of Andromeda, to extract meaningful metrics.
+
+### Transformer Layer Metrics
+1. **Number of Parameters**: The total number of parameters in the transformer layer. More parameters can lead to a more powerful model but also increase the risk of overfitting and the computational load.
+
+2. **Layer-wise Activation Statistics**: For each layer in the transformer, calculate statistics such as mean, standard deviation, min, and max of the activations.
+
+3. **Layer-wise Gradient Statistics**: Similarly, calculate statistics for the gradients flowing through each layer. Look for any layer where the gradients are consistently close to zero, as this could indicate that the layer isn't learning effectively.
+
+4. **Feed-forward Network (FFN) Activation Statistics**: Calculate activation statistics specifically for the feed-forward networks in the transformer layer.
+
+5. 
**FFN Gradient Statistics**: Similarly, calculate gradient statistics for the FFNs. + +### Attention Mechanism Metrics +1. **Self-Attention Distribution**: Plot the distribution of attention weights. This can help identify if the model is paying attention to the right inputs. + +2. **Multi-Head Attention Distribution**: For multi-head attention, plot the distribution of attention weights for each head. + +3. **Attention Entropy**: Calculate the entropy of the attention distribution. A higher entropy can indicate that the model is distributing its attention more evenly, while a lower entropy can indicate that it's focusing on a smaller number of inputs. + +4. **Self-Attention Gradient Statistics**: Calculate statistics for the gradients flowing through the self-attention mechanism. + +5. **Multi-Head Attention Gradient Statistics**: Similarly, calculate gradient statistics for the multi-head attention mechanism. + +6. **Number of Heads Paying Attention**: Count the number of heads that are paying significant attention (i.e., have a high average attention weight) to understand the model's attention spread. + +## Test Suite Execution + +These metrics should be calculated for a range of input examples to ensure the model performs well across different situations. To do this, we create a test suite. + +The test suite should include: + +1. **Various Input Lengths**: Test inputs of varying lengths to ensure the model performs well regardless of input size. + +2. **Different Data Modalities**: If the model is designed to handle different data types (text, images, etc.), these should be included in the test suite. + +3. **Varied Content**: Include a range of different content in the inputs to test how well the model handles different topics or styles. + +4. **Out-of-Distribution Data**: Include some data that's not from the training distribution to see how the model handles unexpected inputs. + +5. **Noise**: Include inputs with added noise to test the model's robustness. + +Remember, the goal here is not just to have a laundry list of metrics but to understand what each metric tells us about the model's performance and use this information to optimize the model. This extreme attention to detail will ensure Andromeda's high performance and broad applicability. + +# Speed and Scalability Metrics + +While model performance is crucial, it isn't the only factor that determines the success of a system. We must also consider its speed, scalability, and context limits. + +### Speed Metrics +1. **Model Inference Time**: Measure the average time it takes for the model to make predictions for a set of inputs. This can be done using methods like `time.perf_counter()` in Python. + +2. **Batch Processing Time**: The time taken to process a batch of inputs can provide an insight into the model's speed at scale. This is especially important when processing large datasets. + +3. **Forward Pass Time**: Record the time taken for a forward pass through the network. + +4. **Backward Pass Time**: Measure the time taken for the backward pass, especially if the model will be fine-tuned or trained further. + +5. **End-to-End Latency**: This measures the total time taken from the moment the input is provided to the model till the output is produced. This includes preprocessing, inference, and postprocessing times. + +### Scalability Metrics +1. **Throughput**: Evaluate the number of inputs the model can process per unit of time. + +2. **Memory Footprint**: Analyze the memory usage of the model during inference. 
Large models may require significant memory resources, especially during training. + +3. **Parallel Processing Performance**: If the model is designed to run on multiple GPUs or across multiple machines, measure its performance in these settings. + +4. **Load Balancing**: Measure how well the model can distribute computational load across multiple GPUs or nodes. + +### Context Limits Metrics +1. **Sequence Length Impact**: Evaluate how the model's performance changes with varying sequence lengths. Some models struggle with very short or very long sequences. + +2. **Robustness to Input Variation**: Test the model with a variety of inputs, such as out-of-vocabulary words, uncommon syntax, etc., to understand its ability to handle diverse inputs. + +3. **Contextual Ambiguity**: Measure the model's ability to handle ambiguous inputs where context is crucial for understanding. + +4. **Sensitivity to Input Changes**: Evaluate how much the model's output changes when small modifications are made to the input. If the model is too sensitive, it might overreact to minor changes. + +Each of these metrics should be calculated across a range of situations to understand the model's behavior under different conditions. This exhaustive testing will allow us to optimize Andromeda for the best balance of speed, scalability, and context limits. + +# Key Metrics + +1. **Perplexity:** This is a common metric for assessing language models, which measures how well the model predicts a sample. Lower perplexity indicates better performance. However, it's worth noting that while perplexity is a useful indicator, it doesn't capture everything, especially for creative tasks like language generation. + +2. **Validation Loss:** While perplexity is great, you also want to track your validation loss directly. This is your primary optimization target and often gives the most actionable insights. + +3. **Speed Metrics:** This includes forward pass time, backward pass time, and end-to-end latency. Ensuring that your model operates quickly is crucial for scalability and user experience. The lower these metrics, the better. + +4. **Throughput:** Measures the number of instances your model can process per second. Higher throughput indicates a more efficient model. + +5. **Memory Footprint:** You need to measure the amount of memory your model uses during inference and training. This is especially important for larger models, as it could limit scalability. Lower memory usage is better. + +6. **Sequence Length Impact:** How does your model's performance change with the length of the input sequence? This is critical for understanding its applicability to real-world scenarios where sequence lengths can vary widely. + +7. **Parameter Efficiency:** How well does your model make use of its parameters? This is typically measured as performance relative to the number of parameters. More efficient use of parameters is better. + +8. **Accuracy on Benchmarked Datasets:** For instance, GLUE or SuperGLUE benchmark scores for natural language understanding tasks, or SQuAD for question answering. Higher scores on these benchmarks indicate better performance. + +9. **Consistency over Time:** Does the model's performance degrade or remain consistent over multiple, identical runs? If performance varies greatly, the model may be unstable. + +10. **Robustness to Noise:** How well does your model handle noise in the input data? This can be simulated by adding random noise to your validation data and measuring the model's performance. + +11. 
**Fairness and Bias:** Test the model on a variety of fairness metrics to ensure it treats all users and inputs equally. This can be complex and requires a dataset that is diverse and representative. + +Remember that these metrics will vary depending on your specific use case and model. For example, a translation model may need to prioritize sequence length performance, while a chatbot may need to emphasize throughput and latency. + +Also, be aware that these are just some of the metrics you could test before pre-training. The exact list will depend on your specific use case and requirements. \ No newline at end of file diff --git a/FailureAnalysis/CPU_MEMORY.md b/FailureAnalysis/CPU_MEMORY.md new file mode 100644 index 0000000000000000000000000000000000000000..1abea38e0ef9bc9a0ed3a572ee2514adaf71c9a3 --- /dev/null +++ b/FailureAnalysis/CPU_MEMORY.md @@ -0,0 +1,489 @@ +# July 32, 9:12pm + +* Failure to train perhaps to due to not having enough CPU memory + + + +## Sources +* [torch.distributed.elastic.multiprocessing.errors.ChildFailedError](https://discuss.huggingface.co/t/torch-distributed-elastic-multiprocessing-errors-childfailederror/28242) +* `export TORCH_CPP_LOG_LEVEL=INFO NCCL_DEBUG=INFO`` +``` +Hey guys, I’m glad to announce I solved the issue on my side. +As can be seen I use multiple GPUs, which have sufficient memory for the use case. +HOWEVER! My issue was due to not enough CPU memory. That’s why my runs crashed and without any trace of the reason. +Once I allocated enough cpu (on my case I increased it from 32GB to 96+ GB). + +If the CPU allocation is constant and you can not allocated more, I’m sure you can try solutions as compressed models, deepspeed optimization levels and more. + +Good luck to future readers. +``` + + +### Root cause: +* Not having enough cpu memory, + + +# Solutions: +* perhaps move everything into nvme or offload the parameters to the cpu using deepspeed + +## Log +``` +commune@r1n2a6000bittensor:~/Andromeda$ accelerate launch train.py +[2023-08-01 01:04:13,441] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect) +/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version! + warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported " +WARNING:torch.distributed.run: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+***************************************** +[2023-08-01 01:04:16,624] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2023-08-01 01:04:16,634] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2023-08-01 01:04:16,641] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2023-08-01 01:04:16,669] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2023-08-01 01:04:16,712] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2023-08-01 01:04:16,720] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect) +/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version! + warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported " +/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version! + warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported " +/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version! + warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported " +/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version! + warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported " +/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version! + warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported " +/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version! 
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported " +Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda +Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda +Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda +Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda +Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda +Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 208581 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 208582 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 208583 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 208584 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 208586 closing signal SIGTERM +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: -9) local_rank: 4 (pid: 208585) of binary: /usr/bin/python3.10 +Traceback (most recent call last): + File "/home/commune/.local/bin/accelerate", line 8, in + sys.exit(main()) + File "/home/commune/.local/lib/python3.10/site-packages/accelerate/commands/accelerate_cli.py", line 45, in main + args.func(args) + File "/home/commune/.local/lib/python3.10/site-packages/accelerate/commands/launch.py", line 964, in launch_command + deepspeed_launcher(args) + File "/home/commune/.local/lib/python3.10/site-packages/accelerate/commands/launch.py", line 687, in deepspeed_launcher + distrib_run.run(args) + File "/home/commune/.local/lib/python3.10/site-packages/torch/distributed/run.py", line 785, in run + elastic_launch( + File "/home/commune/.local/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 134, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/home/commune/.local/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 250, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +======================================================= +train.py FAILED +------------------------------------------------------- +Failures: + +------------------------------------------------------- +Root Cause (first observed failure): +[0]: + time : 2023-08-01_01:06:47 + host : r1n2a6000bittensor + rank : 4 (local_rank: 4) + exitcode : -9 (pid: 208585) + error_file: + traceback : Signal 9 (SIGKILL) received by PID 208585 +======================================================= +commune@r1n2a6000bittensor:~/Andromeda$ export TORCH_CPP_LOG_LEVEL=INFO NCCL_DEBUG=INFO + +commune@r1n2a6000bittensor:~/Andromeda$ accelerate launch train.py +[2023-08-01 01:09:31,113] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect) +/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version! 
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported " +WARNING:torch.distributed.run: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +[I socket.cpp:566] [c10d] The server socket has started to listen on [::]:29500. +[I socket.cpp:787] [c10d] The client socket has connected to [localhost]:29500 on [localhost]:46392. +[I socket.cpp:787] [c10d] The client socket has connected to [localhost]:29500 on [localhost]:46406. +[2023-08-01 01:09:34,414] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2023-08-01 01:09:34,417] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2023-08-01 01:09:34,477] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect) +/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version! + warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported " +/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version! + warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported " +[2023-08-01 01:09:34,541] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect) +/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version! + warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported " +[2023-08-01 01:09:34,614] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2023-08-01 01:09:34,642] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect) +/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version! + warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported " +/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version! + warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported " +/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version! 
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported " +Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda +Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda +Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda +Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda +Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda +Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 209014 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 209015 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 209016 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 209018 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 209019 closing signal SIGTERM +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: -9) local_rank: 3 (pid: 209017) of binary: /usr/bin/python3.10 +Traceback (most recent call last): + File "/home/commune/.local/bin/accelerate", line 8, in + sys.exit(main()) + File "/home/commune/.local/lib/python3.10/site-packages/accelerate/commands/accelerate_cli.py", line 45, in main + args.func(args) + File "/home/commune/.local/lib/python3.10/site-packages/accelerate/commands/launch.py", line 964, in launch_command + deepspeed_launcher(args) + File "/home/commune/.local/lib/python3.10/site-packages/accelerate/commands/launch.py", line 687, in deepspeed_launcher + distrib_run.run(args) + File "/home/commune/.local/lib/python3.10/site-packages/torch/distributed/run.py", line 785, in run + elastic_launch( + File "/home/commune/.local/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 134, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/home/commune/.local/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 250, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +======================================================= +train.py FAILED +------------------------------------------------------- +Failures: + +------------------------------------------------------- +Root Cause (first observed failure): +[0]: + time : 2023-08-01_01:11:46 + host : r1n2a6000bittensor + rank : 3 (local_rank: 3) + exitcode : -9 (pid: 209017) + error_file: + traceback : Signal 9 (SIGKILL) received by PID 209017 +======================================================= +commune@r1n2a6000bittensor:~/Andromeda$ +``` +------ +---- + +# Log2 +* I reconfigurd the setting to utilize torch dynamo and offload parameters to nvme + +``` + commune@r1n2a6000bittensor:~/Andromeda$ accelerate config +[2023-08-01 01:15:17,803] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect) +/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version! + warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported " +----------------------------------------------------------------------------------------------------------In which compute environment are you running? 
+This machine +----------------------------------------------------------------------------------------------------------Which type of machine are you using? +multi-GPU +How many different machines will you use (use more than 1 for multi-node training)? [1]: +Do you wish to optimize your script with torch dynamo?[yes/NO]:yes +----------------------------------------------------------------------------------------------------------Which dynamo backend would you like to use? +nvfuser +Do you want to customize the defaults sent to torch.compile? [yes/NO]: +Do you want to use DeepSpeed? [yes/NO]: yes +Do you want to specify a json file to a DeepSpeed config? [yes/NO]: no +----------------------------------------------------------------------------------------------------------What should be your DeepSpeed's ZeRO optimization stage? +3 +----------------------------------------------------------------------------------------------------------Where to offload optimizer states? +nvme +----------------------------------------------------------------------------------------------------------Where to offload parameters? +nvme +Nvme Path to offload parameters? +Nvme Path to offload optimizer states? +How many gradient accumulation steps you're passing in your script? [1]: +Do you want to use gradient clipping? [yes/NO]: yes +What is the gradient clipping value? [1.0]: +Do you want to save 16-bit model weights when using ZeRO Stage-3? [yes/NO]: yes +Do you want to enable `deepspeed.zero.Init` when using ZeRO Stage-3 for constructing massive models? [yes/NO]: yes +How many GPU(s) should be used for distributed training? [1]:6 +----------------------------------------------------------------------------------------------------------Do you wish to use FP16 or BF16 (mixed precision)? +fp8 +accelerate configuration saved at /home/commune/.cache/huggingface/accelerate/default_config.yaml +commune@r1n2a6000bittensor:~/Andromeda$ accelerate launch train.py +[2023-08-01 01:15:58,494] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect) +/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version! + warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported " +WARNING:torch.distributed.run: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +[I socket.cpp:566] [c10d] The server socket has started to listen on [::]:29500. +[I socket.cpp:787] [c10d] The client socket has connected to [localhost]:29500 on [localhost]:45830. +[I socket.cpp:787] [c10d] The client socket has connected to [localhost]:29500 on [localhost]:45838. +[2023-08-01 01:16:01,364] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2023-08-01 01:16:01,455] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2023-08-01 01:16:01,456] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect) +/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version! 
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported " +[2023-08-01 01:16:01,484] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2023-08-01 01:16:01,555] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect) +/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version! + warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported " +/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version! + warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported " +[2023-08-01 01:16:01,593] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect) +/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version! + warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported " +/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version! + warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported " +/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version! + warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported " +Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda +Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda +Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda +Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda +Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda +Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 209602 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 209603 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 209604 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 209605 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 209606 closing signal SIGTERM +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: -9) local_rank: 0 (pid: 209601) of binary: /usr/bin/python3.10 +Traceback (most recent call last): + File "/home/commune/.local/bin/accelerate", line 8, in + sys.exit(main()) + File "/home/commune/.local/lib/python3.10/site-packages/accelerate/commands/accelerate_cli.py", line 45, in main + args.func(args) + File "/home/commune/.local/lib/python3.10/site-packages/accelerate/commands/launch.py", line 964, in launch_command + deepspeed_launcher(args) + File "/home/commune/.local/lib/python3.10/site-packages/accelerate/commands/launch.py", line 687, in deepspeed_launcher + distrib_run.run(args) + File "/home/commune/.local/lib/python3.10/site-packages/torch/distributed/run.py", line 785, in run + elastic_launch( + File "/home/commune/.local/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 134, in __call__ + return 
launch_agent(self._config, self._entrypoint, list(args)) + File "/home/commune/.local/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 250, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +======================================================= +train.py FAILED +------------------------------------------------------- +Failures: + +------------------------------------------------------- +Root Cause (first observed failure): +[0]: + time : 2023-08-01_01:18:29 + host : r1n2a6000bittensor + rank : 0 (local_rank: 0) + exitcode : -9 (pid: 209601) + error_file: + traceback : Signal 9 (SIGKILL) received by PID 209601 +======================================================= +``` + + +# Log3 +* I changed the config to use DeepSpeed ZeRO stage 1; same error + +``` +commune@r1n2a6000bittensor:~/Andromeda$ accelerate config +[2023-08-01 01:21:26,715] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect) +/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version! + warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported " +-----------------------------------------------------------------------------------------------------------------------------------In which compute environment are you running? +This machine +-----------------------------------------------------------------------------------------------------------------------------------Which type of machine are you using? +multi-GPU +How many different machines will you use (use more than 1 for multi-node training)? [1]: +Do you wish to optimize your script with torch dynamo?[yes/NO]:no +Do you want to use DeepSpeed? [yes/NO]: yes +Do you want to specify a json file to a DeepSpeed config? [yes/NO]: no +-----------------------------------------------------------------------------------------------------------------------------------What should be your DeepSpeed's ZeRO optimization stage? +1 +How many gradient accumulation steps you're passing in your script? [1]: +Do you want to use gradient clipping? [yes/NO]: no +Do you want to enable `deepspeed.zero.Init` when using ZeRO Stage-3 for constructing massive models? [yes/NO]: yes +How many GPU(s) should be used for distributed training? [1]:6 +-----------------------------------------------------------------------------------------------------------------------------------Do you wish to use FP16 or BF16 (mixed precision)? +fp8 +accelerate configuration saved at /home/commune/.cache/huggingface/accelerate/default_config.yaml +commune@r1n2a6000bittensor:~/Andromeda$ accelerate launch train.py +[2023-08-01 01:21:50,336] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect) +/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version! + warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported " +WARNING:torch.distributed.run: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +[I socket.cpp:566] [c10d] The server socket has started to listen on [::]:29500. 
+[I socket.cpp:787] [c10d] The client socket has connected to [localhost]:29500 on [localhost]:57524. +[I socket.cpp:787] [c10d] The client socket has connected to [localhost]:29500 on [localhost]:57530. +[2023-08-01 01:21:53,173] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2023-08-01 01:21:53,189] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2023-08-01 01:21:53,237] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect) +/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version! + warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported " +/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version! + warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported " +/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version! + warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported " +[2023-08-01 01:21:53,367] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2023-08-01 01:21:53,439] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2023-08-01 01:21:53,452] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect) +/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version! + warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported " +/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version! + warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported " +/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version! 
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported " +Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda +Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda +Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda +Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda +Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda +Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 210195 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 210197 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 210198 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 210199 closing signal SIGTERM +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 210200 closing signal SIGTERM +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: -9) local_rank: 1 (pid: 210196) of binary: /usr/bin/python3.10 +Traceback (most recent call last): + File "/home/commune/.local/bin/accelerate", line 8, in + sys.exit(main()) + File "/home/commune/.local/lib/python3.10/site-packages/accelerate/commands/accelerate_cli.py", line 45, in main + args.func(args) + File "/home/commune/.local/lib/python3.10/site-packages/accelerate/commands/launch.py", line 964, in launch_command + deepspeed_launcher(args) + File "/home/commune/.local/lib/python3.10/site-packages/accelerate/commands/launch.py", line 687, in deepspeed_launcher + distrib_run.run(args) + File "/home/commune/.local/lib/python3.10/site-packages/torch/distributed/run.py", line 785, in run + elastic_launch( + File "/home/commune/.local/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 134, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/home/commune/.local/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 250, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +======================================================= +train.py FAILED +------------------------------------------------------- +Failures: + +------------------------------------------------------- +Root Cause (first observed failure): +[0]: + time : 2023-08-01_01:24:23 + host : r1n2a6000bittensor + rank : 1 (local_rank: 1) + exitcode : -9 (pid: 210196) + error_file: + traceback : Signal 9 (SIGKILL) received by PID 210196 +======================================================= +commune@r1n2a6000bittensor:~/Andromeda$ + +``` + +# Log4 +* No DeepSpeed at all, but rather FullyShardedDataParallel with SHARD_GRAD_OP, TRANSFORMER_BASED_WRAP, and SHARDED_STATE_DICT + + +``` +ommune@r1n2a6000bittensor:~/Andromeda$ accelerate config +[2023-08-01 01:25:09,849] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect) +/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version! 
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported " +-----------------------------------------------------------------------------------------------------------------------------------In which compute environment are you running? +This machine +-----------------------------------------------------------------------------------------------------------------------------------Which type of machine are you using? +multi-GPU +How many different machines will you use (use more than 1 for multi-node training)? [1]: +Do you wish to optimize your script with torch dynamo?[yes/NO]: +Do you want to use DeepSpeed? [yes/NO]: +Do you want to use FullyShardedDataParallel? [yes/NO]: yes +-----------------------------------------------------------------------------------------------------------------------------------What should be your sharding strategy? +SHARD_GRAD_OP +Do you want to offload parameters and gradients to CPU? [yes/NO]: yes +-----------------------------------------------------------------------------------------------------------------------------------What should be your auto wrap policy? +TRANSFORMER_BASED_WRAP +Specify the comma-separated list of transformer layer class names (case-sensitive) to wrap ,e.g, :`BertLayer`, `GPTJBlock`, `T5Block`, `BertLayer,BertEmbeddings,BertSelfOutput` ...? : +-----------------------------------------------------------------------------------------------------------------------------------What should be your FSDP's backward prefetch policy? +BACKWARD_PRE +-----------------------------------------------------------------------------------------------------------------------------------What should be your FSDP's state dict type? +SHARDED_STATE_DICT +Do you want to enable FSDP's forward prefetch policy? [yes/NO]: yes +Do you want to enable FSDP's `use_orig_params` feature? [yes/NO]: yes +Do you want each individually wrapped FSDP unit to broadcast module parameters from rank 0 at the start? [yes/NO]: +How many GPU(s) should be used for distributed training? [1]: +-----------------------------------------------------------------------------------------------------------------------------------Do you wish to use FP16 or BF16 (mixed precision)? +fp8 +accelerate configuration saved at /home/commune/.cache/huggingface/accelerate/default_config.yaml +commune@r1n2a6000bittensor:~/Andromeda$ accelerate launch train.py +[2023-08-01 01:25:47,200] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect) +/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version! + warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported " +[I socket.cpp:566] [c10d] The server socket has started to listen on [::]:29500. +[I socket.cpp:787] [c10d] The client socket has connected to [localhost]:29500 on [localhost]:47910. +[I socket.cpp:787] [c10d] The client socket has connected to [localhost]:29500 on [localhost]:47916. +[2023-08-01 01:25:49,991] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect) +/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version! 
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported " +Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda +[I socket.cpp:787] [c10d] The client socket has connected to [localhost]:29500 on [localhost]:45082. +[I socket.cpp:787] [c10d] The client socket has connected to [localhost]:29500 on [localhost]:45084. +[I ProcessGroupNCCL.cpp:665] [Rank 0] ProcessGroupNCCL initialized with following options: +NCCL_ASYNC_ERROR_HANDLING: 1 +NCCL_DESYNC_DEBUG: 0 +NCCL_BLOCKING_WAIT: 0 +TIMEOUT(ms): 1800000 +USE_HIGH_PRIORITY_STREAM: 0 +[I ProcessGroupNCCL.cpp:842] [Rank 0] NCCL watchdog thread started! +Traceback (most recent call last): + File "/home/commune/Andromeda/train.py", line 705, in + main() + File "/home/commune/Andromeda/train.py", line 702, in main + Train() + File "/home/commune/Andromeda/train.py", line 484, in Train + state.deepspeed_plugin.deepspeed_config['train_micro_batch_size_per_gpu'] = CFG.BATCH_SIZE #?????? +AttributeError: 'NoneType' object has no attribute 'deepspeed_config' +[I ProcessGroupNCCL.cpp:844] [Rank 0] NCCL watchdog thread terminated normally +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 210780) of binary: /usr/bin/python3.10 +Traceback (most recent call last): + File "/home/commune/.local/bin/accelerate", line 8, in + sys.exit(main()) + File "/home/commune/.local/lib/python3.10/site-packages/accelerate/commands/accelerate_cli.py", line 45, in main + args.func(args) + File "/home/commune/.local/lib/python3.10/site-packages/accelerate/commands/launch.py", line 966, in launch_command + multi_gpu_launcher(args) + File "/home/commune/.local/lib/python3.10/site-packages/accelerate/commands/launch.py", line 646, in multi_gpu_launcher + distrib_run.run(args) + File "/home/commune/.local/lib/python3.10/site-packages/torch/distributed/run.py", line 785, in run + elastic_launch( + File "/home/commune/.local/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 134, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/home/commune/.local/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 250, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +train.py FAILED +------------------------------------------------------------ +Failures: + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2023-08-01_01:29:53 + host : r1n2a6000bittensor + rank : 0 (local_rank: 0) + exitcode : 1 (pid: 210780) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +============================================================ +commune@r1n2a6000bittensor:~/Andromeda$ + +``` \ No newline at end of file diff --git a/FailureAnalysis/OptimizerDict.md b/FailureAnalysis/OptimizerDict.md new file mode 100644 index 0000000000000000000000000000000000000000..039d5b889ea354b55f225bdba02b0bac4d78fb52 --- /dev/null +++ b/FailureAnalysis/OptimizerDict.md @@ -0,0 +1,238 @@ +ata: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 180M/180M [00:03<00:00, 46.3MB/s] +Downloading data: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 180M/180M [00:04<00:00, 37.2MB/s] 
+Downloading data: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 180M/180M [00:03<00:00, 47.5MB/s] +Downloading data: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 180M/180M [00:03<00:00, 46.0MB/s] +Downloading data: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 180M/180M [00:04<00:00, 41.2MB/s] +Downloading data files: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1Downloading data files: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [27:46<00:00, 1666.10s/it] +Extracting data files: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 12.85it/s] +Dataset parquet downloaded and prepared to /home/commune/.cache/huggingface/datasets/conceptofmind___parquet/conceptofmind--c4_0-to-20_neox_with_eos_8k-dd8655ce54e7b6cc/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data. +Found cached dataset parquet (/home/commune/.cache/huggingface/datasets/conceptofmind___parquet/conceptofmind--c4_0-to-20_neox_with_eos_8k-dd8655ce54e7b6cc/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec) +Found cached dataset parquet (/home/commune/.cache/huggingface/datasets/conceptofmind___parquet/conceptofmind--c4_0-to-20_neox_with_eos_8k-dd8655ce54e7b6cc/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec) +Found cached dataset parquet (/home/commune/.cache/huggingface/datasets/conceptofmind___parquet/conceptofmind--c4_0-to-20_neox_with_eos_8k-dd8655ce54e7b6cc/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec) +Found cached dataset parquet (/home/commune/.cache/huggingface/datasets/conceptofmind___parquet/conceptofmind--c4_0-to-20_neox_with_eos_8k-dd8655ce54e7b6cc/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec) +Found cached dataset parquet (/home/commune/.cache/huggingface/datasets/conceptofmind___parquet/conceptofmind--c4_0-to-20_neox_with_eos_8k-dd8655ce54e7b6cc/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec) +[2023-07-24 15:58:13,787] [INFO] [logging.py:96:log_dist] [Rank -1] DeepSpeed info: version=0.9.5, git-hash=unknown, git-branch=unknown +[2023-07-24 15:58:13,787] [INFO] [logging.py:96:log_dist] [Rank -1] DeepSpeed info: version=0.9.5, git-hash=unknown, git-branch=unknown +[2023-07-24 15:58:13,787] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2023-07-24 15:58:13,787] [INFO] [comm.py:594:init_distributed] cdb=None +[2023-07-24 15:58:13,787] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2023-07-24 15:58:13,787] [INFO] [comm.py:594:init_distributed] cdb=None +[2023-07-24 15:58:13,789] [INFO] [logging.py:96:log_dist] [Rank -1] DeepSpeed info: version=0.9.5, git-hash=unknown, git-branch=unknown +[2023-07-24 15:58:13,790] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2023-07-24 15:58:13,790] [INFO] [comm.py:594:init_distributed] cdb=None +[2023-07-24 15:58:13,790] [INFO] 
[logging.py:96:log_dist] [Rank -1] DeepSpeed info: version=0.9.5, git-hash=unknown, git-branch=unknown +[2023-07-24 15:58:13,790] [INFO] [logging.py:96:log_dist] [Rank -1] DeepSpeed info: version=0.9.5, git-hash=unknown, git-branch=unknown +[2023-07-24 15:58:13,790] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2023-07-24 15:58:13,790] [INFO] [comm.py:594:init_distributed] cdb=None +[2023-07-24 15:58:13,790] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2023-07-24 15:58:13,790] [INFO] [comm.py:594:init_distributed] cdb=None +[2023-07-24 15:58:13,791] [INFO] [logging.py:96:log_dist] [Rank -1] DeepSpeed info: version=0.9.5, git-hash=unknown, git-branch=unknown +[2023-07-24 15:58:13,792] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2023-07-24 15:58:13,792] [INFO] [comm.py:594:init_distributed] cdb=None +[2023-07-24 15:58:17,032] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False +[2023-07-24 15:58:17,035] [INFO] [logging.py:96:log_dist] [Rank 0] Creating ZeRO Offload +Traceback (most recent call last): + File "/home/commune/Andromeda/Andromeda/train.py", line 667, in + File "/home/commune/Andromeda/Andromeda/train.py", line 664, in main + File "/home/commune/Andromeda/Andromeda/train.py", line 519, in Train + beta_2=0.95, + File "/home/commune/Andromeda/Andromeda/train.py", line 294, in decoupled_optimizer + # Create an empty list to store the names of the LayerNorm and Embedding layer weights with no weight decay. +AttributeError: 'tuple' object has no attribute 'named_parameters' +[2023-07-24 15:58:17,268] [INFO] [utils.py:785:see_memory_usage] DeepSpeedZeRoOffload initialize [begin] +[2023-07-24 15:58:17,268] [INFO] [utils.py:786:see_memory_usage] MA 0.68 GB Max_MA 0.68 GB CA 0.69 GB Max_CA 1 GB +[2023-07-24 15:58:17,268] [INFO] [utils.py:793:see_memory_usage] CPU Virtual Memory: used = 18.35 GB, percent = 3.6% +Traceback (most recent call last): + File "/home/commune/Andromeda/Andromeda/train.py", line 667, in + File "/home/commune/Andromeda/Andromeda/train.py", line 664, in main + File "/home/commune/Andromeda/Andromeda/train.py", line 519, in Train + beta_2=0.95, + File "/home/commune/Andromeda/Andromeda/train.py", line 294, in decoupled_optimizer + # Create an empty list to store the names of the LayerNorm and Embedding layer weights with no weight decay. +AttributeError: 'tuple' object has no attribute 'named_parameters' +Traceback (most recent call last): + File "/home/commune/Andromeda/Andromeda/train.py", line 667, in + File "/home/commune/Andromeda/Andromeda/train.py", line 664, in main + File "/home/commune/Andromeda/Andromeda/train.py", line 519, in Train + beta_2=0.95, + File "/home/commune/Andromeda/Andromeda/train.py", line 294, in decoupled_optimizer + # Create an empty list to store the names of the LayerNorm and Embedding layer weights with no weight decay. +AttributeError: 'tuple' object has no attribute 'named_parameters' +Traceback (most recent call last): + File "/home/commune/Andromeda/Andromeda/train.py", line 667, in + File "/home/commune/Andromeda/Andromeda/train.py", line 664, in main + File "/home/commune/Andromeda/Andromeda/train.py", line 519, in Train + beta_2=0.95, + File "/home/commune/Andromeda/Andromeda/train.py", line 294, in decoupled_optimizer + # Create an empty list to store the names of the LayerNorm and Embedding layer weights with no weight decay. 
+AttributeError: 'tuple' object has no attribute 'named_parameters' +Parameter Offload: Total persistent parameters: 108032 in 490 params +Traceback (most recent call last): + File "/home/commune/Andromeda/Andromeda/train.py", line 667, in + File "/home/commune/Andromeda/Andromeda/train.py", line 664, in main + File "/home/commune/Andromeda/Andromeda/train.py", line 519, in Train + beta_2=0.95, + File "/home/commune/Andromeda/Andromeda/train.py", line 294, in decoupled_optimizer + # Create an empty list to store the names of the LayerNorm and Embedding layer weights with no weight decay. +AttributeError: 'tuple' object has no attribute 'named_parameters' +[2023-07-24 15:58:17,449] [INFO] [utils.py:785:see_memory_usage] DeepSpeedZeRoOffload initialize [end] +[2023-07-24 15:58:17,450] [INFO] [utils.py:786:see_memory_usage] MA 0.8 GB Max_MA 0.8 GB CA 0.8 GB Max_CA 1 GB +[2023-07-24 15:58:17,450] [INFO] [utils.py:793:see_memory_usage] CPU Virtual Memory: used = 18.39 GB, percent = 3.7% +[2023-07-24 15:58:17,451] [INFO] [config.py:960:print] DeepSpeedEngine configuration: +[2023-07-24 15:58:17,451] [INFO] [config.py:964:print] activation_checkpointing_config { + "partition_activations": false, + "contiguous_memory_optimization": false, + "cpu_checkpointing": false, + "number_checkpoints": null, + "synchronize_checkpoint_boundary": false, + "profile": false +} +[2023-07-24 15:58:17,451] [INFO] [config.py:964:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True} +[2023-07-24 15:58:17,451] [INFO] [config.py:964:print] amp_enabled .................. False +[2023-07-24 15:58:17,451] [INFO] [config.py:964:print] amp_params ................... False +[2023-07-24 15:58:17,451] [INFO] [config.py:964:print] autotuning_config ............ { + "enabled": false, + "start_step": null, + "end_step": null, + "metric_path": null, + "arg_mappings": null, + "metric": "throughput", + "model_info": null, + "results_dir": "autotuning_results", + "exps_dir": "autotuning_exps", + "overwrite": true, + "fast": true, + "start_profile_step": 3, + "end_profile_step": 5, + "tuner_type": "gridsearch", + "tuner_early_stopping": 5, + "tuner_num_trials": 50, + "model_info_path": null, + "mp_size": 1, + "max_train_batch_size": null, + "min_train_batch_size": 1, + "max_train_micro_batch_size_per_gpu": 1.024000e+03, + "min_train_micro_batch_size_per_gpu": 1, + "num_tuning_micro_batch_sizes": 3 +} +[2023-07-24 15:58:17,451] [INFO] [config.py:964:print] bfloat16_enabled ............. False +[2023-07-24 15:58:17,451] [INFO] [config.py:964:print] checkpoint_parallel_write_pipeline False +[2023-07-24 15:58:17,451] [INFO] [config.py:964:print] checkpoint_tag_validation_enabled True +[2023-07-24 15:58:17,451] [INFO] [config.py:964:print] checkpoint_tag_validation_fail False +[2023-07-24 15:58:17,451] [INFO] [config.py:964:print] comms_config ................. +[2023-07-24 15:58:17,451] [INFO] [config.py:964:print] communication_data_type ...... None +[2023-07-24 15:58:17,451] [INFO] [config.py:964:print] compression_config ........... 
{'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} +[2023-07-24 15:58:17,451] [INFO] [config.py:964:print] curriculum_enabled_legacy .... False +[2023-07-24 15:58:17,451] [INFO] [config.py:964:print] curriculum_params_legacy ..... False +[2023-07-24 15:58:17,451] [INFO] [config.py:964:print] data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}} +[2023-07-24 15:58:17,451] [INFO] [config.py:964:print] data_efficiency_enabled ...... False +[2023-07-24 15:58:17,451] [INFO] [config.py:964:print] dataloader_drop_last ......... False +[2023-07-24 15:58:17,451] [INFO] [config.py:964:print] disable_allgather ............ False +[2023-07-24 15:58:17,452] [INFO] [config.py:964:print] dump_state ................... False +[2023-07-24 15:58:17,452] [INFO] [config.py:964:print] dynamic_loss_scale_args ...... None +[2023-07-24 15:58:17,452] [INFO] [config.py:964:print] eigenvalue_enabled ........... False +[2023-07-24 15:58:17,452] [INFO] [config.py:964:print] eigenvalue_gas_boundary_resolution 1 +[2023-07-24 15:58:17,452] [INFO] [config.py:964:print] eigenvalue_layer_name ........ bert.encoder.layer +[2023-07-24 15:58:17,452] [INFO] [config.py:964:print] eigenvalue_layer_num ......... 0 +[2023-07-24 15:58:17,452] [INFO] [config.py:964:print] eigenvalue_max_iter .......... 100 +[2023-07-24 15:58:17,452] [INFO] [config.py:964:print] eigenvalue_stability ......... 1e-06 +[2023-07-24 15:58:17,452] [INFO] [config.py:964:print] eigenvalue_tol ............... 0.01 +[2023-07-24 15:58:17,452] [INFO] [config.py:964:print] eigenvalue_verbose ........... False +[2023-07-24 15:58:17,452] [INFO] [config.py:964:print] elasticity_enabled ........... False +[2023-07-24 15:58:17,452] [INFO] [config.py:964:print] flops_profiler_config ........ { + "enabled": false, + "recompute_fwd_factor": 0.0, + "profile_step": 1, + "module_depth": -1, + "top_modules": 1, + "detailed": true, + "output_file": null +} +[2023-07-24 15:58:17,452] [INFO] [config.py:964:print] fp16_auto_cast ............... True +[2023-07-24 15:58:17,452] [INFO] [config.py:964:print] fp16_enabled ................. True +[2023-07-24 15:58:17,452] [INFO] [config.py:964:print] fp16_master_weights_and_gradients False +[2023-07-24 15:58:17,452] [INFO] [config.py:964:print] global_rank .................. 
0 +[2023-07-24 15:58:17,452] [INFO] [config.py:964:print] grad_accum_dtype ............. None +[2023-07-24 15:58:17,452] [INFO] [config.py:964:print] gradient_accumulation_steps .. 1 +[2023-07-24 15:58:17,452] [INFO] [config.py:964:print] gradient_clipping ............ 0.0 +[2023-07-24 15:58:17,452] [INFO] [config.py:964:print] gradient_predivide_factor .... 1.0 +[2023-07-24 15:58:17,452] [INFO] [config.py:964:print] hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8 +[2023-07-24 15:58:17,452] [INFO] [config.py:964:print] initial_dynamic_scale ........ 65536 +[2023-07-24 15:58:17,452] [INFO] [config.py:964:print] load_universal_checkpoint .... False +[2023-07-24 15:58:17,452] [INFO] [config.py:964:print] loss_scale ................... 0 +[2023-07-24 15:58:17,452] [INFO] [config.py:964:print] memory_breakdown ............. False +[2023-07-24 15:58:17,452] [INFO] [config.py:964:print] mics_hierarchial_params_gather False +[2023-07-24 15:58:17,452] [INFO] [config.py:964:print] mics_shard_size .............. -1 +[2023-07-24 15:58:17,452] [INFO] [config.py:964:print] monitor_config ............... tensorboard=TensorBoardConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') enabled=False +[2023-07-24 15:58:17,452] [INFO] [config.py:964:print] nebula_config ................ { + "enabled": false, + "persistent_storage_path": null, + "persistent_time_interval": 100, + "num_of_version_in_retention": 2, + "enable_nebula_load": true, + "load_path": null +} +[2023-07-24 15:58:17,452] [INFO] [config.py:964:print] optimizer_legacy_fusion ...... False +[2023-07-24 15:58:17,452] [INFO] [config.py:964:print] optimizer_name ............... None +[2023-07-24 15:58:17,452] [INFO] [config.py:964:print] optimizer_params ............. None +[2023-07-24 15:58:17,452] [INFO] [config.py:964:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +[2023-07-24 15:58:17,452] [INFO] [config.py:964:print] pld_enabled .................. False +[2023-07-24 15:58:17,452] [INFO] [config.py:964:print] pld_params ................... False +[2023-07-24 15:58:17,452] [INFO] [config.py:964:print] prescale_gradients ........... False +[2023-07-24 15:58:17,452] [INFO] [config.py:964:print] scheduler_name ............... None +[2023-07-24 15:58:17,452] [INFO] [config.py:964:print] scheduler_params ............. None +[2023-07-24 15:58:17,452] [INFO] [config.py:964:print] sparse_attention ............. None +[2023-07-24 15:58:17,452] [INFO] [config.py:964:print] sparse_gradients_enabled ..... False +[2023-07-24 15:58:17,452] [INFO] [config.py:964:print] steps_per_print .............. inf +[2023-07-24 15:58:17,452] [INFO] [config.py:964:print] train_batch_size ............. 18 +[2023-07-24 15:58:17,452] [INFO] [config.py:964:print] train_micro_batch_size_per_gpu 3 +[2023-07-24 15:58:17,452] [INFO] [config.py:964:print] use_node_local_storage ....... False +[2023-07-24 15:58:17,452] [INFO] [config.py:964:print] wall_clock_breakdown ......... False +[2023-07-24 15:58:17,452] [INFO] [config.py:964:print] world_size ................... 
6 +[2023-07-24 15:58:17,452] [INFO] [config.py:964:print] zero_allow_untested_optimizer False +[2023-07-24 15:58:17,452] [INFO] [config.py:964:print] zero_config .................. stage=3 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500,000,000 allgather_partitions=True allgather_bucket_size=500,000,000 overlap_comm=True load_from_fp32_weights=True elastic_checkpoint=False offload_param=DeepSpeedZeroOffloadParamConfig(device='none', nvme_path=None, buffer_count=5, buffer_size=100,000,000, max_in_cpu=1,000,000,000, pin_memory=False) offload_optimizer=DeepSpeedZeroOffloadOptimizerConfig(device='none', nvme_path=None, buffer_count=4, pin_memory=False, pipeline=False, pipeline_read=False, pipeline_write=False, fast_init=False) sub_group_size=1,000,000,000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50,000,000 param_persistence_threshold=100,000 model_persistence_threshold=sys.maxsize max_live_parameters=1,000,000,000 max_reuse_distance=1,000,000,000 gather_16bit_weights_on_model_save=True stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False mics_shard_size=-1 mics_hierarchical_params_gather=False memory_efficient_linear=True +[2023-07-24 15:58:17,452] [INFO] [config.py:964:print] zero_enabled ................. True +[2023-07-24 15:58:17,452] [INFO] [config.py:964:print] zero_force_ds_cpu_optimizer .. True +[2023-07-24 15:58:17,452] [INFO] [config.py:964:print] zero_optimization_stage ...... 3 +[2023-07-24 15:58:17,453] [INFO] [config.py:950:print_user_config] json = { + "train_batch_size": 18, + "train_micro_batch_size_per_gpu": 3, + "gradient_accumulation_steps": 1, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "nvme_path": null + }, + "offload_param": { + "device": "none", + "nvme_path": null + }, + "stage3_gather_16bit_weights_on_model_save": true + }, + "steps_per_print": inf, + "fp16": { + "enabled": true, + "auto_cast": true + }, + "bf16": { + "enabled": false + } +} +Using stable_adamw optimizer +Traceback (most recent call last): + File "/home/commune/Andromeda/Andromeda/train.py", line 667, in + File "/home/commune/Andromeda/Andromeda/train.py", line 664, in main + File "/home/commune/Andromeda/Andromeda/train.py", line 519, in Train + beta_2=0.95, + File "/home/commune/Andromeda/Andromeda/train.py", line 294, in decoupled_optimizer + # Create an empty list to store the names of the LayerNorm and Embedding layer weights with no weight decay. +AttributeError: 'tuple' object has no attribute 'named_parameters' +Traceback (most recent call last): + File "/home/commune/Andromeda/Andromeda/train.py", line 667, in + File "/home/commune/Andromeda/Andromeda/train.py", line 664, in main + File "/home/commune/Andromeda/Andromeda/train.py", line 519, in Train + beta_2=0.95, + File "/home/commune/Andromeda/Andromeda/train.py", line 294, in decoupled_optimizer + # Create an empty list to store the names of the LayerNorm and Embedding layer weights with no weight decay. +AttributeError: 'tuple' object has no attribute 'named_parameters' diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..f288702d2fa16d3cdf0035b15a9fcbc552cd88e7 --- /dev/null +++ b/LICENSE @@ -0,0 +1,674 @@ + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. 
+ Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. 
+ + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. 
For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. 
+ + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. 
You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. 
+ + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. + + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. 
+ + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. 
+ + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. + + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. 
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. 
+ + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + Copyright (C) + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". 
+ + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +. diff --git a/agorabanner.png b/agorabanner.png new file mode 100644 index 0000000000000000000000000000000000000000..030ad15560bb5b6154592b392fd594dceeccd4e7 Binary files /dev/null and b/agorabanner.png differ diff --git a/agorammai.png b/agorammai.png new file mode 100644 index 0000000000000000000000000000000000000000..b1de82154b3d58551ae3db5a5e7acd6b0b550ff1 Binary files /dev/null and b/agorammai.png differ diff --git a/build_dataset.py b/build_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..b2726e781cece6f10ca361619e335ef31bea6dc7 --- /dev/null +++ b/build_dataset.py @@ -0,0 +1,88 @@ +import multiprocessing +from itertools import chain +from datasets import load_dataset +from transformers import AutoTokenizer + +from huggingface_hub import HfApi + + +class DatasetBuilder: + def __init__( + self, + dataset_name, + seq_len=8192, + num_cpu=None, + hf_account_repo=None, + tokenizer="EleutherAI/gpt-neox-20b", + ): + self.dataset_name = dataset_name + self.seq_len = seq_len + self.num_cpu = num_cpu or multiprocessing.cpu_count() + self.hf_account_repo = hf_account_repo + self.tokenizer = tokenizer + + def build_dataset(self): + tokenizer = AutoTokenizer.from_pretrained(self.tokenizer) + train_dataset = load_dataset(self.dataset_name, split="train", streaming=True) + dataset = train_dataset.shuffle() + + def tokenize_function(example): + return tokenizer([t + tokenizer.eos_token for t in example["text"]]) + + tokenized_dataset = dataset.map( + tokenize_function, + batched=True, + # num_proc=self.num_cpu, + remove_columns=["text"], + #num_proc=32 + ) + + block_size = self.seq_len + + def group_texts(examples): + concatenated_examples = { + k: list(chain(*examples[k])) for k in examples.keys() + } + total_length = len(concatenated_examples[list(examples.keys())[0]]) + + if total_length >= block_size: + total_length = (total_length // block_size) * block_size + + result = { + k: [ + t[i : i + block_size] + for i in range(0, total_length, block_size) + ] + for k, t in concatenated_examples.items() + } + + return result + + train_tokenized_dataset = tokenized_dataset.map( + group_texts, batched=True, #num_proc=32#num_proc=self.num_cpu + ) + + #TODO: ValueError: path_or_fileobj must be either an instance of str, bytes or io.BufferedIOBase. If you passed a file-like object, make sure it is in binary mode. 
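+        # Possible workaround (untested, not part of the original code): materialize the
+        # tokenized dataset locally first (for example by saving it to disk) and upload that
+        # artifact, or call `push_to_hub` on a non-streaming dataset, since `upload_file`
+        # expects a path string, bytes, or a binary file object rather than a dataset object.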
+ if self.hf_account_repo: + # train_tokenized_dataset.push_to_hub(self.hf_account_repo, private=True) + hf_api = HfApi() + hf_api.upload_file( + path_or_fileobj= "TOKENIZED_DATASET",#train_tokenized_dataset, #path to local space + path_in_repo="README.md", + repo_id=self.hf_account_repo, + repo_type="dataset" + ) + + return train_tokenized_dataset + + + +builder = DatasetBuilder( + dataset_name="the_pile_books3", + seq_len=8192, + # num_cpu=4, + hf_account_repo="kye/thepilebooks3-gptneox-8k", + tokenizer="EleutherAI/gpt-neox-20b", +) + +dataset = builder.build_dataset() diff --git a/config/main.json b/config/main.json new file mode 100644 index 0000000000000000000000000000000000000000..e2e7105649dae132ab6dd526785be6e945d34726 --- /dev/null +++ b/config/main.json @@ -0,0 +1,27 @@ + +{ + "train_batch_size": 18, + "train_micro_batch_size_per_gpu": 3, + "gradient_accumulation_steps": 1, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "nvme_path": null + }, + "offload_param": { + "device": "none", + "nvme_path": null + }, + "stage3_gather_16bit_weights_on_model_save": true + }, + "steps_per_print": 2, + "fp16": { + "enabled": true, + "auto_cast": true + }, + "bf16": { + "enabled": false + } +} + diff --git a/config/zero3.json b/config/zero3.json new file mode 100644 index 0000000000000000000000000000000000000000..20e6161cabbfd1a1b78c19c6c43ad2fb7a7584d1 --- /dev/null +++ b/config/zero3.json @@ -0,0 +1,54 @@ +{ + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "optimizer": { + "type": "AdamW", + "params": { + "lr": "auto", + "weight_decay": "auto" + } + }, + "scheduler": { + "type": "WarmupDecayLR", + "params": { + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + "warmup_num_steps": "auto", + "total_num_steps": "auto" + } + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "cpu", + "pin_memory": true + }, + "offload_param": { + "device": "cpu", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "sub_group_size": 1e9, + "stage3_max_live_parameters": 1e9, + "stage3_max_reuse_distance": 1e9, + "stage3_gather_16bit_weights_on_model_save": "auto" + }, + "gradient_accumulation_steps": 1, + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false +} + + diff --git a/data/README.md b/data/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ae481673afba4dc144aa04032aae44467fcdb0c3 --- /dev/null +++ b/data/README.md @@ -0,0 +1,3 @@ +# Data source + +The enwik8 data was downloaded from the Hutter prize page: http://prize.hutter1.net/ \ No newline at end of file diff --git a/data/enwik8.gz b/data/enwik8.gz new file mode 100644 index 0000000000000000000000000000000000000000..7da9615e954233b7776f1aa80048f2850cef6177 --- /dev/null +++ b/data/enwik8.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d52ea2118b505f79007e084b2f49acef9fd0e45fb481f84b2cab0a7191a7ef60 +size 36548940 diff --git a/dist/thebestllmever-0.0.3-py3-none-any.whl b/dist/thebestllmever-0.0.3-py3-none-any.whl new file mode 100644 index 0000000000000000000000000000000000000000..303d34389aeaf13bfed8998d8fad42d7cd249329 Binary files /dev/null and 
b/dist/thebestllmever-0.0.3-py3-none-any.whl differ diff --git a/dist/thebestllmever-0.0.3.tar.gz b/dist/thebestllmever-0.0.3.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..ea0aadf819dde1ba1b27fed1c6b1fa46ae869bbe --- /dev/null +++ b/dist/thebestllmever-0.0.3.tar.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a04efcb0fcaf2d957f91d5c885607c18143ba7d70bca4c7642631ec8f1eaa29d +size 60345 diff --git a/example.py b/example.py new file mode 100644 index 0000000000000000000000000000000000000000..22f88897425dde1269ed562055f07854f59ab4a2 --- /dev/null +++ b/example.py @@ -0,0 +1,8 @@ +import torch +from Andromeda.configs import Andromeda1Billion + +model = Andromeda1Billion().cuda() + +x = torch.randint(0, 256, (1, 1024)).cuda() + +model(x) # (1, 1024, 20000) \ No newline at end of file diff --git a/infra/Dockerfile b/infra/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..80d47304f8ece6a3649f7fbca35ba6ca620ac394 --- /dev/null +++ b/infra/Dockerfile @@ -0,0 +1,37 @@ +# base image +FROM nvidia/cuda:11.3.1-devel-ubuntu20.04 + +# avoid warnings by switching to noninteractive +ARG DEBIAN_FRONTEND=noninteractive + +# make a directory for our application +RUN mkdir -p /app +WORKDIR /app + +# install system-wide dependencies +RUN apt-get -qq update && \ + apt-get -qq install -y --no-install-recommends curl git python3-pip python3-dev + +# Install PyTorch +RUN pip3 install --upgrade torch torchvision torchaudio + +# Install APEX +RUN git clone https://github.com/NVIDIA/apex.git /app/apex +WORKDIR /app/apex +RUN git checkout 265b451de8ba9bfcb67edc7360f3d8772d0a8bea +RUN pip3 install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--deprecated_fused_adam" --global-option="--xentropy" --global-option="--fast_multihead_attn" ./ + +# Install your other dependencies... + +# Copy requirements.txt and install python dependencies +COPY requirements.txt . +RUN pip3 install --no-cache-dir -r requirements.txt + +# Copy the rest of the application +COPY . . 
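+
+# Note: requirements.txt is copied and installed in its own layer above so that the
+# dependency-install layer stays cached and only re-runs when requirements.txt changes,
+# not on every source change.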
+ +# Execute the accelerate config command +# RUN accelerate config + +# Command to run when starting the container +CMD ["python3", "train.py"] diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000000000000000000000000000000000000..18c1b7d035d61dc95ca4f8dd2a67bc60173c0a46 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,38 @@ +[tool.poetry] +name = "TheBestLLMEver" +version = "0.0.3" +description = "andromeda - Pytorch" +authors = ["Kye Gomez "] +license = "MIT" +readme = "README.md" # assuming you have a README.md file +homepage = "https://github.com/kyegomez/Andromeda" +keywords = ["artificial intelligence", "attention mechanism", "transformers"] +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "License :: OSI Approved :: MIT License", + "Programming Language :: Python :: 3.6", +] + +[tool.poetry.dependencies] +python = "^3.6" +torch = "*" +lion-pytorch = "*" +numpy = "*" +einops = "*" +accelerate = "*" +transformers = "*" +SentencePiece = "*" +datasets = "*" +matplotlib = "*" +deepspeed = "*" + +[tool.poetry.dev-dependencies] + +[build-system] +requires = ["poetry-core>=1.0.0"] +build-backend = "poetry.core.masonry.api" + +[[tool.poetry.packages]] +include = "andromeda" \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..3dfe4a87170662371b97aceadad3667eefc2755b --- /dev/null +++ b/requirements.txt @@ -0,0 +1,20 @@ +torch --index-url https://download.pytorch.org/whl/nightly/cu118 +lion-pytorch +numpy +einops +bitsandbytes +triton +accelerate +transformers + +SentencePiece +datasets +deepspeed +memory-profiler +rouge +nltk +scikit-learn +wandb + + +matplotlib \ No newline at end of file diff --git a/testing/README.md b/testing/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3b38cd089fbd70aaec2fe411521bd032c9f29b1b --- /dev/null +++ b/testing/README.md @@ -0,0 +1,57 @@ +# Transformer Model Pre-training Testing Suite SOP + +This Standard Operating Procedure (SOP) outlines the steps and checkpoints needed to evaluate and test a Language Learning Model (LLM) based on a Transformer architecture prior to pre-training. + +## 1. Model Architecture Review + - Confirm model architecture aligns with the specific NLP task. + - Ensure configuration parameters (number of layers, dimensions, heads, etc.) are set correctly. + - Validate selection of activation functions, loss functions, and optimization methods. + +## 2. Forward Pass Test + - Use sample input to perform a forward pass and verify the output. + - Ensure output shape matches the expected shape. + +## 3. Backward Pass Test + - Conduct a backward pass to validate model's capability to calculate gradients correctly. + - Confirm that gradients are not null, NaN, or infinite. + +## 4. Parameter Initialization Test + - Check correct initialization of all layers and their parameters. + - Inspect weights before and after a forward and backward pass to verify their correct updating. + +## 5. Optimizer and Loss Function Test + - Confirm appropriateness of optimizer and loss function for the task. + - Validate reduction of loss and learning of model during initial training phases. + +## 6. Data Loader Test + - Ensure data loaders supply data in the correct format and batch size for the model. + - Validate any data augmentation procedures used. + +## 7. 
Learning Rate Scheduler Test
+  - If used, verify correct setup and functionality of the learning rate scheduler.
+
+## 8. Hardware Compatibility Test
+  - Confirm model, data, and all necessary components are correctly moved to the desired device (CPU, GPU, or TPU).
+
+## 9. Reproducibility Test
+  - Set random seeds for all components that introduce randomness to ensure reproducibility of model training.
+
+# Important Metrics to Check
+
+## 1. Accuracy Metrics
+- **Perplexity**: The exponential of the mean per-token cross-entropy; lower values indicate that the model predicts the evaluation data better.
+- **BLEU Score**: Measures n-gram overlap between predicted and reference outputs, rewarding correct local word order. Particularly useful in translation tasks.
+- **ROUGE Score**: Evaluates the quality of generated summaries by counting overlapping units (n-grams, word sequences, word pairs) between the generated text and the reference text.
+- **F1 Score**: Harmonic mean of precision and recall.
+
+## 2. Speed and Resource Metrics
+- **Latency**: Time taken to generate a response after input is received.
+- **Throughput**: Number of tasks the model can complete in a set time period.
+- **Memory Consumption**: Amount of RAM consumed during prediction.
+
+## 3. Qualitative Metrics
+- **Coherence**: Assessment of whether the output makes sense.
+- **Relevance**: Assessment of whether the output is relevant to the input query.
+- **Versatility**: Assessment of the model's ability to handle diverse input types and produce coherent, relevant output.
+
+Note that this suite does not include automated tests for accuracy metrics such as perplexity, BLEU, ROUGE, or F1, since these are task-specific and must be evaluated on a task-by-task basis. Be sure to conduct manual tests for coherence, relevance, and versatility, in addition to benchmarking speed (latency and throughput) and memory consumption.
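+
+The first metric above can be derived directly from the loss already tracked during training; here is a minimal sketch (the helper name is ours, not part of this suite):
+
+```python
+import math
+
+def perplexity(mean_cross_entropy: float) -> float:
+    """Perplexity is the exponential of the average per-token cross-entropy (in nats)."""
+    return math.exp(mean_cross_entropy)
+
+print(perplexity(2.0))  # a mean loss of 2.0 nats gives a perplexity of about 7.39
+```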
\ No newline at end of file diff --git a/testing/accuracy.py b/testing/accuracy.py new file mode 100644 index 0000000000000000000000000000000000000000..405de202beec1f4916ad3f3dbfbcfe29cb7f7aed --- /dev/null +++ b/testing/accuracy.py @@ -0,0 +1,96 @@ +import matplotlib.pyplot as plt +import time +import torch +from torch.utils.data import DataLoader +from torchvision import datasets, transforms +import numpy as np +import tracemalloc + +# from Andromeda.model import Andromeda +from Andromeda.model import Andromeda +from Andromeda.utils.stable_adamw import StableAdamWUnfused + +torch.manual_seed(0) +if torch.cuda.is_available(): + torch.cuda.manual_seed(0) + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + + + +import torch.nn.functional as F +from nltk.translate.bleu_score import corpus_bleu +from rouge import Rouge +from sklearn.metrics import f1_score + + +class AccuracyMetrics: + def __init__(self): + self.rouge = Rouge() + + def calculate_perplexity(self, model, data_loader): + model.eval() + total_loss = 0 + with torch.no_grad(): + for batch in data_loader: + input_ids, labels = batch + output = model(input_ids) + loss = F.cross_entropy(output.view(-1, output.size(-1)), labels.view(-1)) + total_loss += loss.item() + return torch.exp(torch.tensor(total_loss / len(data_loader))) + + def calculate_bleu(self, references, hypotheses): + return corpus_bleu(references, hypotheses) + + def calculate_rouge(self, references, hypotheses): + scores = self.rouge.get_scores(hypotheses, references, avg=True) + return scores + + def calculate_f1(self, true_labels, pred_labels): + return f1_score(true_labels, pred_labels, average="weighted") + + + + + +#mock test dataset +test_dataset = datasets.FakeData(size=1000, transform=transforms.ToTensor()) + +#model +model = Andromeda( + num_tokens=50304, + dim=1024, + depth=24, + dim_head=128, + heads=8, + alibi_num_heads=4 +) + + + +# Usage: +accuracy_metrics = AccuracyMetrics() + +# Calculate Perplexity +perplexity = accuracy_metrics.calculate_perplexity(model, data_loader) +print('Perplexity:', perplexity) + +# Calculate BLEU +bleu = accuracy_metrics.calculate_bleu(references, hypotheses) +print('BLEU Score:', bleu) + +# Calculate ROUGE +rouge_scores = accuracy_metrics.calculate_rouge(references, hypotheses) +print('ROUGE Scores:', rouge_scores) + +# Calculate F1 Score +f1 = accuracy_metrics.calculate_f1(true_labels, pred_labels) +print('F1 Score:', f1) + + + + +# Add at the bottom of your file +if __name__ == "__main__": + AccuracyMetrics() \ No newline at end of file diff --git a/testing/benchmarking.py b/testing/benchmarking.py new file mode 100644 index 0000000000000000000000000000000000000000..69036df0cae482d04a8b1a90a86fa7040b920dd5 --- /dev/null +++ b/testing/benchmarking.py @@ -0,0 +1,259 @@ +import matplotlib.pyplot as plt +import time +import torch +from torch.utils.data import DataLoader +from torchvision import datasets, transforms +import numpy as np +import tracemalloc + +# from Andromeda.model import Andromeda +from Andromeda.model import Andromeda +from Andromeda.utils.stable_adamw import StableAdamWUnfused + +torch.manual_seed(0) +if torch.cuda.is_available(): + torch.cuda.manual_seed(0) + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + +class AndromedaModelTest: + def __init__(self): + self.model = Andromeda + self.optimizer = StableAdamWUnfused() + self.loss_function = torch.nn.CrossEntropyLoss() + self.test_input = torch.randint(0, 256, (1, 1024)).cuda() + + def 
test_forward_pass(self):
+        output = self.model(self.test_input)
+        assert output.shape == (1, 1024, 64007), "Forward pass output shape mismatch"
+
+    def test_backward_pass(self):
+        self.optimizer.zero_grad()
+        output = self.model(self.test_input)
+        # flatten logits/targets so CrossEntropyLoss receives (N, C) and (N,) shapes
+        loss = self.loss_function(output.view(-1, output.size(-1)), self.test_input.view(-1))
+
+        loss.backward()
+        for name, parameter in self.model.named_parameters():
+            assert not torch.isnan(parameter.grad).any(), f"Gradient for {name} contains NaNs"
+            assert not torch.isinf(parameter.grad).any(), f"Gradient for {name} contains Infs"
+
+    def test_optimizer_step(self):
+        initial_params = [param.clone() for param in self.model.parameters()]
+        output = self.model(self.test_input)
+        loss = self.loss_function(output.view(-1, output.size(-1)), self.test_input.view(-1))
+
+        self.optimizer.zero_grad()
+        loss.backward()
+        self.optimizer.step()
+        for initial_param, param in zip(initial_params, self.model.parameters()):
+            assert not torch.equal(initial_param, param), "Model Parameters did not change after an optimizer step"
+
+
+
+
+class SpeedMetrics:
+    def __init__(self, model):
+        self.model = model.to(device)
+
+    def forward_pass_time(self):
+        start_time = time.time()
+        self.model.decoder.forward(torch.randint(0, 50304, (1, 8192), device=device, dtype=torch.long))[0]
+        end_time = time.time()
+        return end_time - start_time
+
+    def backward_pass_time(self):
+        model_input = self.model.decoder.forward(torch.randint(0, 50304, (1, 8192), device=device, dtype=torch.long))[0]
+        start_time = time.time()
+        loss = torch.nn.CrossEntropyLoss()(model_input, torch.randint(0, 50304, (1, 8192), device=device, dtype=torch.long))
+        loss.backward()
+        end_time = time.time()
+        return end_time - start_time
+
+    def end_to_end_latency(self):
+        start_time = time.time()
+        self.model.forward(torch.randint(0, 50304, (1, 8192), device=device, dtype=torch.long))
+        end_time = time.time()
+        return end_time - start_time
+
+
+
+class ScalabilityMetrics:
+    def __init__(self, model, dataset):
+        self.model = model
+        self.dataset = dataset
+        self.dataloader = DataLoader(dataset, batch_size=32)
+
+    def throughput(self):
+        start_time = time.time()
+        for i, data in enumerate(self.dataloader, 0):
+            self.model.forward(data)
+        end_time = time.time()
+        return len(self.dataset) / (end_time - start_time)
+
+
+class ConsistencyMetrics:
+    def __init__(self, model):
+        self.model = model
+
+    def consistency_over_time(self):
+        consistency_times = []
+        outputs_list = []
+        for _ in range(10):
+            start_time = time.time()
+            outputs = self.model.forward(torch.randint(0, 50304, (1, 8192)))
+            end_time = time.time()
+            consistency_times.append(end_time - start_time)
+            outputs_list.append(outputs.detach().numpy())
+
+        initial_output = outputs_list[0]
+        consistency_score = 0
+        for output in outputs_list[1:]:
+            if np.array_equal(initial_output, output):
+                consistency_score += 1
+        consistency_score = consistency_score / len(outputs_list) * 100
+
+        return consistency_times, consistency_score
+
+
+class MemoryMetrics:
+    def __init__(self, model):
+        self.model = model
+
+    def memory_footprint(self):
+        tracemalloc.start()
+        self.model.forward(torch.randint(0, 50304, (1, 8192)))
+        current, peak = tracemalloc.get_traced_memory()
+        tracemalloc.stop()
+        return current, peak
+
+
+class SequenceMetrics:
+    def __init__(self, model):
+        self.model = model
+
+    def sequence_length_impact(self):
+        seq_lengths = [1024, 2048, 4096, 8192]
+        seq_impact_times = []
+        for length in seq_lengths:
+            start_time = time.time()
+            self.model.forward(torch.randint(0, 50304, (1, length)))
+            end_time = time.time()
+            
seq_impact_times.append(end_time - start_time) + return seq_lengths, seq_impact_times + + + + +class FlopsBenchmark: + def __init__(self, model, bsz=32, d_model=1024, num_heads=8, sequence_lengths=list(range(500, 32001, 500))): + self.bsz = bsz + self.d_model = d_model + self.num_heads = num_heads + self.sequence_lengths = sequence_lengths + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + self.dtype=torch.float32 + self.model = model.to(self.device) + + def benchmark(self): + time_taken = [] + tflops_per_s = [] + + for seq_len in self.sequence_lengths: + x = torch.randn(self.bsz, seq_len, self.d_model).to(self.device).type(self.dtype) + torch.cuda.synchronize() + + start = time.time() + self.model(x) + torch.cuda.synchronize() + elapsed = time.time() - start + + time_taken.append(elapsed) + total_flops = 4 * seq_len **2 * (self.d_model // self.num_heads) * self.num_heads + tflops_per_s.append(total_flops / elapsed / 1e12) # Convert to TFLOPs + + for seq_len, elapsed, tflops in zip(self.sequence_lengths, time_taken, tflops_per_s): + print(f"Sequence length: {seq_len}, Time elapsed: {elapsed} s, TFLOPs/s: {tflops}") + + +#mock test dataset +test_dataset = datasets.FakeData(size=1000, transform=transforms.ToTensor()) + +#model +model = Andromeda( + num_tokens=50304, + dim=1024, + depth=24, + dim_head=128, + heads=8, + alibi_num_heads=4 +) + + +#speed test metrics test +# speed test metrics test +speed_metrics = SpeedMetrics(model) +forward_pass_time = speed_metrics.forward_pass_time() +backward_pass_time = speed_metrics.backward_pass_time() +end_to_end_latency = speed_metrics.end_to_end_latency() + + +#scalability metrics test +scalability_metrics = ScalabilityMetrics(model, test_dataset) +throughput = scalability_metrics.throughput() + + +#consistency metrucs test +consistency_metrics = ConsistencyMetrics(model) +consistency_times, consistency_score = consistency_metrics.consistency_over_time() + + +#memory metrics test +memory_metrics = MemoryMetrics(model) +current, peak = memory_metrics.memory_footprint() + +#sequence metrics test +sequence_metrics = SequenceMetrics(model) +seq_lengths, seq_impact_times = sequence_metrics.sequence_length_impact() + + + +#flops + +flops_benchmark = FlopsBenchmark(model) +flops_benchmark.benchmark() + +# Graphical Interface +fig, axs = plt.subplots(3) + +axs[0].bar(["Forward Pass Time", "Backward Pass Time", "End-to-End Latency"], [forward_pass_time, backward_pass_time, end_to_end_latency]) +axs[0].set_title('Speed Metrics') +axs[0].set_xlabel('Metrics') +axs[0].set_ylabel('Time (seconds)') + +axs[1].bar(seq_lengths, seq_impact_times) +axs[1].set_title('Sequence Length Impact') +axs[1].set_xlabel('Sequence Length') +axs[1].set_ylabel('Time (seconds)') + +axs[2].plot(list(range(1, 11)), consistency_times) +axs[2].set_title('Consistency Over Time') +axs[2].set_xlabel('Run Number') +axs[2].set_ylabel('Time (seconds)') + +plt.tight_layout() +plt.show() + +print(f"Throughput: {throughput} instances/second") +print(f"Memory used: {current / 10**6}MB; Peak: {peak / 10**6}MB") + + + +# Add at the bottom of your file +if __name__ == "__main__": + model_test = AndromedaModelTest() + model_test.test_forward_pass() + model_test.test_backward_pass() + model_test.test_optimizer_step() \ No newline at end of file diff --git a/testing/dataset_builder.py b/testing/dataset_builder.py new file mode 100644 index 0000000000000000000000000000000000000000..a06134c2eb54293a6ebe01afd1a8b17336e77b86 --- /dev/null +++ b/testing/dataset_builder.py @@ -0,0 
+1,32 @@ +import unittest +from Andromeda.dataset_builder import DatasetBuilder + +class TestDatasetBuilder(unittest.TestCase): + def setUp(self): + self.builder = DatasetBuilder(dataset_name="tiiuae/falcon-refinedweb") + + def test_initialization(self): + self.assertEqual(self.builder.dataset_name, "tiiuae/falcon-refinedweb", "Dataset name is not correctly set.") + self.assertEqual(self.builder.seq_len, 8192, "Sequence length is not correctly set.") + self.assertEqual(self.builder.tokenizer, "EleutherAI/gpt-neox-20b", "Tokenizer is not correctly set.") + + def test_build_dataset(self): + dataset = self.builder.build_dataset() + self.assertIsNotNone(dataset, "Dataset is not built.") + self.assertTrue(hasattr(dataset, "map"), "Dataset does not have a map method.") + + def test_tokenize_function(self): + example = {"text": ["Hello, world!", "Andromeda is great."]} + tokenized_example = self.builder.tokenize_function(example) + self.assertIsInstance(tokenized_example, dict, "Tokenized example is not a dictionary.") + self.assertTrue(all(isinstance(t, list) for t in tokenized_example.values()), "Tokenized example values are not lists.") + + def test_group_texts(self): + examples = {"input_ids": [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]] * 10} + grouped_examples = self.builder.group_texts(examples) + self.assertIsInstance(grouped_examples, dict, "Grouped examples is not a dictionary.") + self.assertTrue(all(isinstance(t, list) for t in grouped_examples.values()), "Grouped example values are not lists.") + self.assertTrue(all(len(t) == self.builder.seq_len for t in grouped_examples["input_ids"]), "Grouped example sequences are not the correct length.") + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/testing/model.py b/testing/model.py new file mode 100644 index 0000000000000000000000000000000000000000..fccb4ac486c462559db808b8d82cbddf6e17ab8d --- /dev/null +++ b/testing/model.py @@ -0,0 +1,108 @@ +import torch +import unittest +from Andromeda.model import Andromeda + + +class TestAndromeda(unittest.TestCase): + def setUp(self): + self.model = Andromeda() + + def test_initialization(self): + self.assertIsNotNone(self.model.andromeda, "Transformer is not initialized.") + self.assertIsNotNone(self.model.decoder, "AutoregressiveWrapper is not initialized.") + + def test_forward_pass(self): + input_tokens = torch.randint(0, 50432, (1, 8192)) + output = self.model(input_tokens) + self.assertIsInstance(output, torch.Tensor, "Output is not a PyTorch tensor.") + self.assertEqual(output.shape[0], input_tokens.shape[0], "Output batch size does not match input.") + + def test_error_handling(self): + with self.assertRaises(Exception): + self.model.forward(None) + + def test_model_parameters(self): + self.assertEqual(self.model.Andromeda.num_tokens, 50432, "Number of tokens is not correctly set.") + self.assertEqual(self.model.Andromeda.max_seq_len, 8192, "Max sequence length is not correctly set.") + + def test_model_output(self): + input_tokens = torch.randint(0, 50432, (1, 8192)) + output1 = self.model(input_tokens) + output2 = self.model(input_tokens) + self.assertTrue(torch.allclose(output1, output2), "Model does not produce consistent output.") + + +class TestAndromedaExtended(unittest.TestCase): + def setUp(self): + self.model = Andromeda() + + def test_input_size(self): + for seq_len in [512, 1024, 2048, 4096]: + input_tokens = torch.randint(0, 50432, (1, seq_len)) + output = self.model(input_tokens) + self.assertEqual(output.shape[1], seq_len, f"Output sequence length 
does not match input for seq_len={seq_len}.") + + def test_batch_size(self): + for batch_size in [2, 4, 8, 16]: + input_tokens = torch.randint(0, 50432, (batch_size, 8192)) + output = self.model(input_tokens) + self.assertEqual(output.shape[0], batch_size, f"Output batch size does not match input for batch_size={batch_size}.") + + def test_token_range(self): + for token in [0, 50431]: + input_tokens = torch.full((1, 8192), fill_value=token) + output = self.model(input_tokens) + self.assertIsInstance(output, torch.Tensor, f"Output is not a PyTorch tensor for token={token}.") + + def test_model_depth(self): + for depth in [16, 32, 64]: + model = Andromeda(depth=depth) + self.assertEqual(model.Andromeda.attn_layers.depth, depth, f"Model depth is not correctly set for depth={depth}.") + + def test_model_dim(self): + for dim in [1280, 2560, 5120]: + model = Andromeda(dim=dim) + self.assertEqual(model.Andromeda.attn_layers.dim, dim, f"Model dimension is not correctly set for dim={dim}.") + + def test_model_heads(self): + for heads in [12, 24, 48]: + model = Andromeda(heads=heads) + self.assertEqual(model.Andromeda.attn_layers.heads, heads, f"Number of heads is not correctly set for heads={heads}.") + + def test_model_dim_head(self): + for dim_head in [64, 128, 256]: + model = Andromeda(dim_head=dim_head) + self.assertEqual(model.Andromeda.attn_layers.dim_head, dim_head, f"Head dimension is not correctly set for dim_head={dim_head}.") + + def test_model_alibi_num_heads(self): + for alibi_num_heads in [6, 12, 24]: + model = Andromeda(alibi_num_heads=alibi_num_heads) + self.assertEqual(model.Andromeda.attn_layers.alibi_num_heads, alibi_num_heads, f"Number of alibi heads is not correctly set for alibi_num_heads={alibi_num_heads}.") + + def test_model_shift_tokens(self): + for shift_tokens in [0, 1, 2]: + model = Andromeda(shift_tokens=shift_tokens) + self.assertEqual(model.Andromeda.attn_layers.shift_tokens, shift_tokens, f"Number of shift tokens is not correctly set for shift_tokens={shift_tokens}.") + + def test_model_use_abs_pos_emb(self): + for use_abs_pos_emb in [True, False]: + model = Andromeda(use_abs_pos_emb=use_abs_pos_emb) + self.assertEqual(model.Andromeda.use_abs_pos_emb, use_abs_pos_emb, f"Use absolute position embedding flag is not correctly set for use_abs_pos_emb={use_abs_pos_emb}.") + + def test_model_alibi_pos_bias(self): + for alibi_pos_bias in [True, False]: + model = Andromeda(alibi_pos_bias=alibi_pos_bias) + self.assertEqual(model.Andromeda.attn_layers.alibi_pos_bias, alibi_pos_bias, f"Alibi position bias flag is not correctly set for alibi_pos_bias={alibi_pos_bias}.") + + def test_model_rotary_xpos(self): + for rotary_xpos in [True, False]: + model = Andromeda(rotary_xpos=rotary_xpos) + self.assertEqual(model.Andromeda.attn_layers.rotary_xpos, rotary_xpos, f"Rotary position flag is not correctly set for rotary_xpos={rotary_xpos}.") + + def test_model_attn_flash(self): + for attn_flash in [True, False]: + model = Andromeda(attn_flash=attn_flash) + self.assertEqual(model.Andromeda.attn_layers.attn_flash, attn_flash, f"Attention flash flag is not correctly set for attn_flash={attn_flash}") + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/testing/tokenizer.py b/testing/tokenizer.py new file mode 100644 index 0000000000000000000000000000000000000000..fc68e63cde3deb65e20093ffbb7809ec3265e5e2 --- /dev/null +++ b/testing/tokenizer.py @@ -0,0 +1,33 @@ +import unittest +from Andromeda.model import AndromedaTokenizer + + +class 
TestAndromedaTokenizer(unittest.TestCase): + def setUp(self): + self.tokenizer = AndromedaTokenizer() + + def test_initialization(self): + self.assertIsNotNone(self.tokenizer.tokenizer, "Tokenizer is not initialized.") + self.assertEqual(self.tokenizer.tokenizer.eos_token, "", "EOS token is not correctly set.") + self.assertEqual(self.tokenizer.tokenizer.pad_token, "", "PAD token is not correctly set.") + self.assertEqual(self.tokenizer.tokenizer.model_max_length, 8192, "Model max length is not correctly set.") + + def test_tokenize_texts(self): + texts = ["Hello, world!", "Andromeda is great."] + tokenized_texts = self.tokenizer.tokenize_texts(texts) + self.assertEqual(tokenized_texts.shape[0], len(texts), "Number of tokenized texts does not match input.") + self.assertTrue(all(isinstance(t, torch.Tensor) for t in tokenized_texts), "Not all tokenized texts are PyTorch tensors.") + + def test_decode(self): + texts = ["Hello, world!", "Andromeda is great."] + tokenized_texts = self.tokenizer.tokenize_texts(texts) + decoded_texts = [self.tokenizer.decode(t) for t in tokenized_texts] + self.assertEqual(decoded_texts, texts, "Decoded texts do not match original texts.") + + def test_len(self): + num_tokens = len(self.tokenizer) + self.assertIsInstance(num_tokens, int, "Number of tokens is not an integer.") + self.assertGreater(num_tokens, 0, "Number of tokens is not greater than 0.") + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/train.ipynb b/train.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..06ebaf5eab77a78b76ac05e0162f2d467077296a --- /dev/null +++ b/train.ipynb @@ -0,0 +1,34 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!git clone https://github.com/kyegomez/Andromeda\n", + "%cd Andromeda\n", + "!pip3 install -r requirements.txt\n", + "!python3 train.py\n", + "\n", + "# # os.environ['MASTER_ADDR'] #'localhost'\n", + "# os.environ['MASTER_PORT'] #= '9994'\n", + "\n", + "# # # [CRITICAL] Pay attention to this when scaling to multiple GPUs and clusters\n", + "\n", + "# # # Pay attention to this, use \"accelerate config\"\n", + "\n", + "# os.environ['RANK'] #= str(0) # Number of nodes (servers)\n", + "# os.environ['WORLD_SIZE'] # = str(torch.cuda.device_count())" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/train.py b/train.py new file mode 100644 index 0000000000000000000000000000000000000000..e533aa3a016634f42d7d7f51bcb6d3aeb1f05643 --- /dev/null +++ b/train.py @@ -0,0 +1,705 @@ +import math +import multiprocessing +import os +from datetime import timedelta +from functools import partial +from itertools import chain + +import torch +# import bitsandbytes as bnb + +from torch.distributed.fsdp import ( + FullyShardedDataParallel, + MixedPrecision, + BackwardPrefetch, + ShardingStrategy, +) +from accelerate import Accelerator +from accelerate.utils import (DummyOptim, InitProcessGroupKwargs) +from accelerate.logging import get_logger + + +from datasets import load_dataset +from lion_pytorch import Lion +from torch.nn import LayerNorm + + +from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import ( + CheckpointImpl, apply_activation_checkpointing, checkpoint_wrapper) +from torch.distributed.fsdp.wrap import ( + transformer_auto_wrap_policy +) + + +from torch.optim import AdamW +from torch.utils.data import DataLoader +from 
tqdm import tqdm +from transformers import (AutoTokenizer, default_data_collator, + get_cosine_schedule_with_warmup, + get_linear_schedule_with_warmup, set_seed) + + +from Andromeda.utils.stable_adamw import StableAdamWUnfused +from Andromeda.core.transformer import Transformer, AndromedaEmbedding +# from Andromeda.model import Andromeda +from Andromeda.model import AndromedaEmbedding #, Andromeda +from Andromeda.configs import Andromeda1Billion + +########### SETUP CONFIG +import torch.distributed as dist + + +from accelerate.state import AcceleratorState + +# state = AcceleratorState() + + +logger = get_logger(__name__, log_level="INFO") + +class CFG: + BATCH_SIZE = 1 + GRADIENT_ACCUMULATE_EVERY: int = 1 + SEED: int = 42 + LEARNING_RATE: float = 1e-4 #3e-4 # 1e-4 for lion + WEIGHT_DECAY: float = 0.1 + SEQ_LEN: int = 8192 + NUM_CPU: int = multiprocessing.cpu_count() + USE_DEEPSPEED: bool = True + USE_FSDP: bool = True + USE_PRETOKENIZED: bool = True + USE_ACTIVATION_CHECKPOINTING: bool = True + RESUME_FROM_CHECKPOINT: str = False + CHECKPOINTING_STEPS: int = 1000 + OUTPUT_DIR: str = 'checkpoints/' # Folder + ENTITY_NAME: str = "Andromeda" + LOGGING_STEPS: int = 100 + + +# helpers + + +def print_num_params(model, accelerator: Accelerator): + # n_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + n_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + accelerator.print(f"Number of parameters in model: {n_params}") + + +# activation checkpointing + + +def activation_checkpointing( + model: torch.nn.Module, + offload_to_cpu: bool = False, + accelerator: Accelerator = None, +): + """ + Apply activation checkpointing to a model. + + Args: + model (Module): The model to which to apply activation checkpointing. + offload_to_cpu (bool, optional): Whether to offload the activations to CPU. Defaults to False. + accelerator (Accelerator, optional): The Accelerate library accelerator. Defaults to None. + """ + if accelerator is not None: + accelerator.print("Using activation checkpointing") + def check_fn(submodule): + return isinstance(submodule, Transformer) + non_reentrant_wrapper = partial( + checkpoint_wrapper, + offload_to_cpu=offload_to_cpu, + checkpoint_impl=CheckpointImpl.NO_REENTRANT, + ) + apply_activation_checkpointing( + model, checkpoint_wrapper_fn=non_reentrant_wrapper, check_fn=check_fn + ) + + +# FSDP + + +def fsdp( + model: torch.nn.Module, + auto_wrap: bool = False, + mp: str = "fp32", + shard_strat: str = "NO_SHARD", +): + """ + This function wraps a given PyTorch model with the FullyShardedDataParallel (FSDP) wrapper to enable efficient data parallelism and model sharding. + + Args: + model (torch.nn.Module): The original PyTorch model to be wrapped with FSDP. + auto_wrap (bool, optional): If True, it enables automatic wrapping of the model's layers according to the transformer_auto_wrap_policy. Default is False. + mp (str, optional): The mixed precision mode to be used. Can be 'bf16' for BFloat16, 'fp16' for Float16 or 'fp32' for Float32 precision. Default is 'fp32'. + shard_strat (str, optional): The sharding strategy to be used. Can be 'SHARD_GRAD' for sharding at gradient computation, 'FULL_SHARD' for full model sharding or 'NO_SHARD' for no sharding. Default is 'NO_SHARD'. + + Raises: + ValueError: If the provided mp (mixed precision mode) is not 'bf16', 'fp16' or 'fp32'. + ValueError: If the provided shard_strat (sharding strategy) is not 'SHARD_GRAD', 'FULL_SHARD' or 'NO_SHARD'. 
+ + Returns: + torch.nn.Module: The input model wrapped with FSDP. + """ + if auto_wrap: + Andromeda_auto_wrap_policy = partial( + transformer_auto_wrap_policy, + transformer_layer_cls={ + Transformer, + }, + ) + else: + Andromeda_auto_wrap_policy = None + + if mp == "bf16": + mp_fsdp = MixedPrecision( + param_dtype=torch.bfloat16, + # Gradient communication precision. + reduce_dtype=torch.bfloat16, + # Buffer precision. + buffer_dtype=torch.bfloat16, + ) + elif mp == "fp16": + mp_fsdp = MixedPrecision( + param_dtype=torch.float16, + # Gradient communication precision. + reduce_dtype=torch.float16, + # Buffer precision. + buffer_dtype=torch.float16, + ) + elif mp == "fp32": + mp_fsdp = MixedPrecision( + param_dtype=torch.float32, + # Gradient communication precision. + reduce_dtype=torch.float32, + # Buffer precision. + buffer_dtype=torch.float32, + ) + else: + raise ValueError( + "Invalid scheduler_type. Expected 'bf16', 'fp16' or 'fp32', got: {}".format( + mp + ) + ) + + if shard_strat == "SHARD_GRAD": + sharding_strat_fsdp = ShardingStrategy.SHARD_GRAD_OP + elif shard_strat == "FULL_SHARD": + sharding_strat_fsdp = ShardingStrategy.FULL_SHARD + elif shard_strat == "NO_SHARD": + sharding_strat_fsdp = ShardingStrategy.NO_SHARD + else: + raise ValueError( + "Invalid scheduler_type. Expected 'SHARD_GRAD', 'FULL_SHARD' or 'NO_SHARD', got: {}".format( + shard_strat + ) + ) + + model = FullyShardedDataParallel( + model, + auto_wrap_policy=Andromeda_auto_wrap_policy, + mixed_precision=mp_fsdp, + backward_prefetch=BackwardPrefetch.BACKWARD_PRE, + sharding_strategy=sharding_strat_fsdp, + forward_prefetch=True, + use_orig_params=True, + ) + + return model + + +# learning rate scheduler + + +def get_lr_scheduler_with_warmup( + optimizer: torch.optim.Optimizer, + scheduler_type: str, + num_warmup_steps: int, + max_train_steps: int, + grad_accumulate_every: int = 1, + accelerator: Accelerator = None, +): + """ + Get a learning rate scheduler with warmup. + + Args: + optimizer (Optimizer): The optimizer for which to create the learning rate scheduler. + scheduler_type (str): The type of learning rate scheduler to create, either "linear" or "cosine". + num_warmup_steps (int): The number of warmup steps for the learning rate scheduler. + max_train_steps (int): The maximum number of training steps. + grad_accumulate_every (int, optional): The gradient accumulation factor. Defaults to 1. + accelerator (Accelerator, optional): The Accelerate library accelerator. Defaults to None. + + Returns: + The learning rate scheduler with warmup. + + Raises: + ValueError: If scheduler_type is not "linear" or "cosine". + """ + NUM_WARMUP_STEPS = num_warmup_steps + GRADIENT_ACCUMULATE_EVERY = grad_accumulate_every + if accelerator is not None: + accelerator.print(f"Using {scheduler_type} lr scheduler") + if scheduler_type == "linear": + return get_linear_schedule_with_warmup( + optimizer=optimizer, + num_warmup_steps=NUM_WARMUP_STEPS * GRADIENT_ACCUMULATE_EVERY, + num_training_steps=max_train_steps * GRADIENT_ACCUMULATE_EVERY, + ) + elif scheduler_type == "cosine": + return get_cosine_schedule_with_warmup( + optimizer=optimizer, + num_warmup_steps=NUM_WARMUP_STEPS * GRADIENT_ACCUMULATE_EVERY, + num_training_steps=max_train_steps * GRADIENT_ACCUMULATE_EVERY, + ) + else: + raise ValueError( + "Invalid scheduler_type. 
+
+
+# learning rate scheduler
+
+
+def get_lr_scheduler_with_warmup(
+    optimizer: torch.optim.Optimizer,
+    scheduler_type: str,
+    num_warmup_steps: int,
+    max_train_steps: int,
+    grad_accumulate_every: int = 1,
+    accelerator: Accelerator = None,
+):
+    """
+    Get a learning rate scheduler with warmup.
+
+    Args:
+        optimizer (Optimizer): The optimizer for which to create the learning rate scheduler.
+        scheduler_type (str): The type of learning rate scheduler to create, either "linear" or "cosine".
+        num_warmup_steps (int): The number of warmup steps for the learning rate scheduler.
+        max_train_steps (int): The maximum number of training steps.
+        grad_accumulate_every (int, optional): The gradient accumulation factor. Defaults to 1.
+        accelerator (Accelerator, optional): The Accelerate library accelerator. Defaults to None.
+
+    Returns:
+        The learning rate scheduler with warmup.
+
+    Raises:
+        ValueError: If scheduler_type is not "linear" or "cosine".
+    """
+    NUM_WARMUP_STEPS = num_warmup_steps
+    GRADIENT_ACCUMULATE_EVERY = grad_accumulate_every
+    if accelerator is not None:
+        accelerator.print(f"Using {scheduler_type} lr scheduler")
+    if scheduler_type == "linear":
+        return get_linear_schedule_with_warmup(
+            optimizer=optimizer,
+            num_warmup_steps=NUM_WARMUP_STEPS * GRADIENT_ACCUMULATE_EVERY,
+            num_training_steps=max_train_steps * GRADIENT_ACCUMULATE_EVERY,
+        )
+    elif scheduler_type == "cosine":
+        return get_cosine_schedule_with_warmup(
+            optimizer=optimizer,
+            num_warmup_steps=NUM_WARMUP_STEPS * GRADIENT_ACCUMULATE_EVERY,
+            num_training_steps=max_train_steps * GRADIENT_ACCUMULATE_EVERY,
+        )
+    else:
+        raise ValueError(
+            "Invalid scheduler_type. Expected 'linear' or 'cosine', got: {}".format(
+                scheduler_type
+            )
+        )
+
+
+# optimizers
+
+
+def decoupled_optimizer(
+    model: torch.nn.Module,
+    learning_rate: float,
+    weight_decay: float,
+    beta_1: float,
+    beta_2: float,
+    optimizer_type: str,
+    use_fsdp: bool = True,
+    accelerator: Accelerator = None,
+):
+    """
+    Set up an optimizer with decoupled weight decay.
+
+    This function creates two groups of parameters: one with weight decay and one
+    without weight decay, and then initializes the optimizer with these two groups.
+
+    Args:
+        model (Module): The model whose parameters are optimized.
+        learning_rate (float): The learning rate for the optimizer.
+        weight_decay (float): The weight decay for the optimizer.
+        beta_1 (float): The exponential decay rate for the 1st moment estimates.
+        beta_2 (float): The exponential decay rate for the 2nd moment estimates.
+        optimizer_type (str): The type of the optimizer. Can be 'lion', 'adamw', 'deepspeed' or 'stable_adamw'.
+        use_fsdp (bool, optional): If True, the optimizer will work with fully sharded data parallelism. Defaults to True.
+        accelerator (Accelerator, optional): The accelerator from HuggingFace's Accelerate library. Defaults to None.
+
+    Returns:
+        Optimizer: The initialized optimizer.
+
+    Raises:
+        ValueError: If the optimizer type is not 'lion', 'adamw', 'deepspeed' or 'stable_adamw'.
+    """
+    if accelerator is not None:
+        accelerator.print(f"Using {optimizer_type} optimizer")
+    # Store the model's named parameters so they can be looked up by name below.
+    param_dict = {}
+    for param_name, param in model.named_parameters():
+        param_dict[param_name] = param
+
+    # Separate the model's named modules into two groups: decay and no_decay.
+
+    # Names of the LayerNorm and Embedding weights that should not receive weight decay.
+    no_decay = []
+
+    if use_fsdp:
+        exclude_module = "_fsdp_wrapped_module.token_emb"
+    else:
+        exclude_module = "token_emb"
+
+    # Iterate through the named modules of the model.
+    for module_name, module in model.named_modules():
+        # Check whether the module is one of the types excluded from weight decay (LayerNorm or torch.nn.Embedding).
+        for module_type in [LayerNorm, torch.nn.Embedding]:
+            if isinstance(module, module_type):
+                # The token embedding is tracked under its ".weight" name ...
+                if module_name == exclude_module:
+                    no_decay.append(f"{module_name}.weight")
+                else:
+                    # ... while LayerNorm weights are registered as ".gamma" in this codebase.
+                    no_decay.append(f"{module_name}.gamma")
+                # Exit the inner loop since the desired module has been found.
+                break
+
+    # Names of the Linear layer weights that should receive weight decay.
+    decay = []
+
+    # Iterate through the named modules of the model.
+    for module_name, module in model.named_modules():
+        # Check if the current module is an instance of the desired type (torch.nn.Linear).
+        for module_type in [torch.nn.Linear]:
+            if isinstance(module, module_type):
+                # Append its name with a ".weight" suffix to the decay list.
+                decay.append(f"{module_name}.weight")
+                # Exit the inner loop since the desired module has been found.
+                break
+
+    # Create two separate lists of model parameters: decay_param and no_decay_param.
+    # The decay_param list contains the parameters that should have weight decay applied.
+    # The no_decay_param list contains the parameters that should not have weight decay applied, excluding the 'to_logits.weight' parameter.
+
+    # Parameters with weight decay.
+    decay_param = []
+
+    if use_fsdp:
+        exclude_param = "_fsdp_wrapped_module.to_logits.weight"
+    else:
+        exclude_param = "to_logits.weight"
+
+    # Iterate over the decay list, which contains the names of the parameters with weight decay.
+    for param in decay:
+        # Skip 'to_logits.weight'; append every other parameter from param_dict to the decay_param list.
+        if param != exclude_param:
+            decay_param.append(param_dict[param])
+
+    # Parameters without weight decay.
+    no_decay_param = []
+
+    # Iterate over the no_decay list, which contains the names of the parameters without weight decay.
+    for param in no_decay:
+        try:
+            # Append the corresponding parameter from param_dict to the no_decay_param list.
+            no_decay_param.append(param_dict[param])
+        except KeyError:
+            # print(f"Parameter {param} does not exist in the model")
+            pass
+
+    # Group the parameters: the first group has weight decay applied,
+    # the second group has a weight_decay value of 0.0.
+    grouped_params = [
+        {"params": decay_param, "weight_decay": weight_decay},
+        {"params": no_decay_param, "weight_decay": 0.0},
+    ]
+
+    # Instantiate the requested optimizer.
+    if optimizer_type == "lion":
+        optimizer = Lion(grouped_params, lr=learning_rate, betas=(beta_1, beta_2))
+    elif optimizer_type == "adamw":
+        optimizer = AdamW(grouped_params, lr=learning_rate, betas=(beta_1, beta_2))
+    elif optimizer_type == "deepspeed":
+        optimizer = DummyOptim(grouped_params, lr=learning_rate, betas=(beta_1, beta_2))
+    elif optimizer_type == "stable_adamw":
+        optimizer = StableAdamWUnfused(
+            grouped_params, lr=learning_rate, betas=(beta_1, beta_2),
+        )
+    # elif optimizer_type == "Adam8bit":
+    #     optimizer = bnb.optim.Adam8bit(grouped_params, lr=learning_rate, betas=(beta_1, beta_2))
+    # elif optimizer_type == "Lion8Bit":
+    #     optimizer = bnb.optim.Lion8bit(grouped_params, lr=learning_rate, betas=(beta_1, beta_2))
+    else:
+        raise ValueError(
+            "Invalid optimizer_type. Expected 'lion', 'adamw', 'deepspeed' or 'stable_adamw', got: {}".format(
+                optimizer_type
+            )
+        )
+
+    # Return the optimizer.
+    return optimizer
+
+
+# dataloaders
+
+
+def build_dataloaders():
+    """
+    Build the training dataset.
+
+    This function performs the following steps:
+    1. Load the tokenizer from the pretrained "EleutherAI/gpt-neox-20b" model.
+    2. Load the "openwebtext" dataset.
+    3. Tokenize the dataset, adding the end-of-sentence token to each text.
+    4. Process the tokenized dataset into chunks of a specified block size.
+
+    Returns:
+        Dataset: The processed dataset ready for training (the DataLoader itself is built in Train()).
+    """
+    tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
+    dataset = load_dataset("openwebtext", split="train")
+
+    tokenized_dataset = dataset.map(
+        lambda example: tokenizer([t + tokenizer.eos_token for t in example["text"]]),
+        batched=True,
+        num_proc=CFG.NUM_CPU,
+        remove_columns=["text"],
+    )
+
+    block_size = CFG.SEQ_LEN
+
+    # Main data processing function that concatenates all texts from the dataset and generates chunks of block_size.
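+    # Worked example (illustrative numbers, not taken from the real dataset):
+    # with block_size = 8192, a concatenated stream of 20,000 tokens yields
+    # 20,000 // 8,192 = 2 full blocks (16,384 tokens); the remaining 3,616
+    # tokens are dropped rather than padded.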
+    def group_texts(examples):
+        # Concatenate all texts.
+        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
+        total_length = len(concatenated_examples[list(examples.keys())[0]])
+        # We drop the small remainder; we could pad instead if the model supported it.
+        # Customize this part to your needs.
+        if total_length >= block_size:
+            total_length = (total_length // block_size) * block_size
+        # Split into chunks of block_size.
+        result = {
+            k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
+            for k, t in concatenated_examples.items()
+        }
+        return result
+
+    train_dataset = tokenized_dataset.map(
+        group_texts, batched=True, num_proc=CFG.NUM_CPU,
+    )
+
+    return train_dataset
+
+
+# TODO: switch to the Falcon web dataset; only the first 10 examples are loaded here for now.
+def build_pre_tokenized():
+    d0 = load_dataset("conceptofmind/c4_0-to-20_neox_with_eos_8k", split="train[:10]")
+    # d1 = load_dataset("conceptofmind/c4_21-to-40_neox_with_eos_8k", split="train")
+    # d2 = load_dataset("conceptofmind/c4_41-to-60_neox_with_eos_8k", split="train")
+    # d3 = load_dataset("conceptofmind/c4_61-to-80_neox_with_eos_8k", split="train")
+    # d4 = load_dataset("conceptofmind/c4_81-to-100_neox_with_eos_8k", split="train")
+    # train_dataset = concatenate_datasets([d0, d1, d2, d3, d4])
+    return d0
+
+
+def Train():
+    # accelerator
+
+    timeout = InitProcessGroupKwargs(timeout=timedelta(seconds=1_000_000))
+
+    accelerator = Accelerator(
+        gradient_accumulation_steps=CFG.GRADIENT_ACCUMULATE_EVERY,
+        mixed_precision="fp16",
+        log_with="wandb",
+        kwargs_handlers=[timeout],
+    )
+
+    state = AcceleratorState()
+
+    if state.deepspeed_plugin is not None:
+        # Keep the DeepSpeed micro-batch size in sync with the DataLoader batch size.
+        state.deepspeed_plugin.deepspeed_config['train_micro_batch_size_per_gpu'] = CFG.BATCH_SIZE
+
+    accelerator.init_trackers(
+        project_name="Andromeda",
+        config={
+            "batch_size": CFG.BATCH_SIZE,
+            "gradient_accumulate_every": CFG.GRADIENT_ACCUMULATE_EVERY,
+            "learning_rate": CFG.LEARNING_RATE,
+            "seq_len": CFG.SEQ_LEN,
+        },
+        # init_kwargs={"wandb": {"entity": CFG.ENTITY_NAME}},
+    )
+
+    accelerator.print(f"Total GPUs: {accelerator.num_processes}")
+
+    # set seed
+
+    set_seed(CFG.SEED)
+
+    # model = Andromeda(
+    #     num_tokens=50432,
+    #     max_seq_len=8192,
+    #     dim=3072,
+    #     depth=24,
+    #     dim_head=128,
+    #     heads=12,
+    #     use_abs_pos_emb=False,
+    #     alibi_pos_bias=True,
+    #     alibi_num_heads=6,
+    #     rotary_xpos=True,
+    #     attn_flash=True,
+    #     shift_tokens=1,
+    #     attn_one_kv_head=True,
+    #     qk_norm=True,
+    #     attn_qk_norm=True,
+    #     attn_qk_norm_dim_scale=True,
+    #     embedding_provider=AndromedaEmbedding()
+    # )
+    model = Andromeda1Billion()
+
+    print_num_params(model, accelerator)
+
+    if CFG.USE_FSDP:
+        model = fsdp(
+            model,
+            mp="fp16",
+            shard_strat="SHARD_GRAD"
+        )
+
+    if CFG.USE_ACTIVATION_CHECKPOINTING:
+        activation_checkpointing(model, accelerator=accelerator)
+
+    model = accelerator.prepare(model)
+
+    # dataloaders
+
+    if CFG.USE_PRETOKENIZED:
+        train_dataset = build_pre_tokenized()
+    else:
+        train_dataset = build_dataloaders()
+
+    train_loader = DataLoader(
+        train_dataset, batch_size=CFG.BATCH_SIZE, collate_fn=default_data_collator,
+    )
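+
+    # Note: no shuffle flag or sampler is set on the DataLoader above; sharding
+    # the batches across processes is assumed to be handled when the loader is
+    # passed through accelerator.prepare() below.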
+
+    # optimizer
+    optim = decoupled_optimizer(
+        model=model,
+        learning_rate=CFG.LEARNING_RATE,
+        weight_decay=CFG.WEIGHT_DECAY,
+        beta_1=0.90,
+        beta_2=0.95,
+        optimizer_type='lion',
+        use_fsdp=True,
+        accelerator=accelerator
+    )
+
+    # Determine number of training steps
+
+    max_train_steps = math.ceil(len(train_loader) / CFG.GRADIENT_ACCUMULATE_EVERY)
+    accelerator.print(f"Max train steps: {max_train_steps}")
+
+    # lr scheduler
+
+    NUM_WARMUP_STEPS = int(max_train_steps * 0.01)
+    accelerator.print(f"Num warmup steps: {NUM_WARMUP_STEPS}")
+
+    # if False: # if CFG.USE_DEEPSPEED:
+    #     lr_scheduler = DummyScheduler(
+    #         optim,
+    #         total_num_steps=max_train_steps * accelerator.num_processes,
+    #         warmup_num_steps=NUM_WARMUP_STEPS
+    #     )
+    # else:
+    lr_scheduler = get_lr_scheduler_with_warmup(
+        optimizer=optim,
+        scheduler_type="cosine",
+        num_warmup_steps=NUM_WARMUP_STEPS,
+        max_train_steps=max_train_steps,
+        grad_accumulate_every=CFG.GRADIENT_ACCUMULATE_EVERY,
+    )
+
+    # prepare
+
+    optim, train_loader, lr_scheduler = accelerator.prepare(
+        optim, train_loader, lr_scheduler
+    )
+
+    # checkpoint scheduler
+
+    accelerator.register_for_checkpointing(lr_scheduler)
+
+    # Recalculate max_train_steps: prepare() shards the dataloader across processes,
+    # which changes its per-process length.
+    max_train_steps = math.ceil(len(train_loader) / CFG.GRADIENT_ACCUMULATE_EVERY)
+    accelerator.print(f"Max train steps recalculated: {max_train_steps}")
+
+    # Total batch size for logging
+
+    total_batch_size = (
+        CFG.BATCH_SIZE * accelerator.num_processes * CFG.GRADIENT_ACCUMULATE_EVERY
+    )
+    accelerator.print(f"Total batch size: {total_batch_size}")
+
+    # resume training
+
+    progress_bar = tqdm(
+        range(max_train_steps), disable=not accelerator.is_local_main_process
+    )
+    completed_steps = 0
+    resume_step = None
+
+    if CFG.RESUME_FROM_CHECKPOINT:
+        accelerator.print(f"Resuming from checkpoint {CFG.RESUME_FROM_CHECKPOINT}")
+        accelerator.load_state(CFG.RESUME_FROM_CHECKPOINT)
+        path = os.path.basename(CFG.RESUME_FROM_CHECKPOINT)
+        training_difference = os.path.splitext(path)[0]
+
+        # need to multiply `gradient_accumulation_steps` to reflect real steps
+        resume_step = (
+            int(training_difference.replace("step_", ""))
+            * CFG.GRADIENT_ACCUMULATE_EVERY
+        )
+
+    if CFG.RESUME_FROM_CHECKPOINT and resume_step is not None:
+        train_loader = accelerator.skip_first_batches(train_loader, resume_step)
+        completed_steps += resume_step
+        progress_bar.update(resume_step)
+
+    # training
+
+    model.train()
+    for step, batch in enumerate(train_loader):
+        with accelerator.accumulate(model):
+            inputs = batch["input_ids"].to(accelerator.device)
+            loss = model(inputs, return_loss=True)
+            accelerator.backward(loss)
+
+            accelerator.log({"loss": loss.item()}, step=step)
+
+            if accelerator.sync_gradients:
+                accelerator.clip_grad_norm_(model.parameters(), 1.0)
+
+            optim.step()
+            lr_scheduler.step()
+            optim.zero_grad()
+
+        if accelerator.sync_gradients:
+            progress_bar.update(1)
+            completed_steps += 1
+
+        if isinstance(CFG.CHECKPOINTING_STEPS, int):
+            if completed_steps % CFG.CHECKPOINTING_STEPS == 0:
+                output_dir = f"step_{completed_steps}"
+                if CFG.OUTPUT_DIR is not None:
+                    output_dir = os.path.join(CFG.OUTPUT_DIR, output_dir)
+                accelerator.save_state(output_dir)
+
+        if completed_steps >= max_train_steps:
+            break
+
+        # log every CFG.LOGGING_STEPS steps
+        if CFG.LOGGING_STEPS > 0 and step % CFG.LOGGING_STEPS == 0:
+            logger.info(
+                f"Step: {completed_steps}/{max_train_steps}, Loss: {loss.item():.5f}"
+            )
+
+    # end training
+
+    # accelerator.print("Training Finished")
+    accelerator.end_training()
+
+    # save final model
+
+    # accelerator.print(f"Saving model to {CFG.OUTPUT_DIR}")
+    if CFG.OUTPUT_DIR is not None:
+        accelerator.wait_for_everyone()
+        os.makedirs(os.path.join(CFG.OUTPUT_DIR, "final"), exist_ok=True)
+        unwrapped_model = accelerator.unwrap_model(model)
+        with accelerator.main_process_first():
+            accelerator.save(
+                unwrapped_model.state_dict(), f"{CFG.OUTPUT_DIR}/final/final_model.pt"
+            )
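+
+
+# NOTE: a distributed launcher such as `torchrun` or `accelerate launch` is
+# expected to provide MASTER_ADDR, MASTER_PORT, RANK and WORLD_SIZE; the
+# setdefault calls in main() below only supply single-process fallbacks.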
+def main():
+    # [CRITICAL] Pay attention to these when scaling to multiple GPUs and clusters;
+    # prefer configuring them with "accelerate config" or the launcher itself.
+    os.environ.setdefault('MASTER_ADDR', 'localhost')
+    os.environ.setdefault('MASTER_PORT', '9994')
+    os.environ.setdefault('RANK', str(0))  # global rank of this process
+    os.environ.setdefault('WORLD_SIZE', str(torch.cuda.device_count()))
+
+    dist.init_process_group(backend='nccl')  # init_method defaults to "env://"
+
+    Train()
+
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file