Spaces:

aka7774
/

gemma2_9b_7gb

Runtime error

App Files Files Community

aka7774 committed on Aug 7

Commit

5653716

•

1 Parent(s): 69edd86

Upload 6 files

Browse files

Files changed (6) hide show

app.py +126 -0
fn.py +184 -0
install.bat +56 -0
main.py +49 -0
requirements.txt +21 -0
venv.sh +10 -0

app.py ADDED Viewed

	@@ -0,0 +1,126 @@

+import fn
+import gradio as gr
+# Gradio UI with three tabs: 'config' (model and generation settings),
+# 'instruct' (single-shot prompt) and 'chat' (gr.ChatInterface on fn.chat).
+with gr.Blocks() as demo:
+    # 'config' tab: fields are pre-filled from fn.cfg and applied by the Save button.
+    with gr.Tab('config'):
+        # Shows the string returned by fn.set_config after Save is clicked.
+        info = gr.Markdown()
+        with gr.Row():
+            # Left column: which model to load and how.
+            with gr.Column(scale=1):
+                model = gr.Textbox(
+                    value=fn.cfg['model_name'],
+                    label='model',
+                    interactive=True,
+                    show_copy_button=True,
+                )
+                dtype = gr.Dropdown(
+                    value=fn.cfg['dtype'],
+                    choices=['4bit'],
+                    label='dtype',
+                    interactive=True,
+                    allow_custom_value=True,
+                )
+            # Right column: generation parameters, entered as text and
+            # converted to int/float by fn.set_config.
+            with gr.Column(scale=1):
+                max_new_tokens = gr.Textbox(
+                    value=fn.cfg['max_new_tokens'],
+                    label='max_new_tokens',
+                    interactive=True,
+                    show_copy_button=True,
+                    )
+                temperature = gr.Textbox(
+                    value=fn.cfg['temperature'],
+                    label='temperature',
+                    interactive=True,
+                    show_copy_button=True,
+                    )
+                top_p = gr.Textbox(
+                    value=fn.cfg['top_p'],
+                    label='top_p',
+                    interactive=True,
+                    show_copy_button=True,
+                    )
+                top_k = gr.Textbox(
+                    value=fn.cfg['top_k'],
+                    label='top_k',
+                    interactive=True,
+                    show_copy_button=True,
+                    )
+                repetition_penalty = gr.Textbox(
+                    value=fn.cfg['repetition_penalty'],
+                    label='repetition_penalty',
+                    interactive=True,
+                    show_copy_button=True,
+                    )
+        with gr.Row():
+            # Optional prompt templates; both start empty.
+            with gr.Column(scale=1):
+                inst_template = gr.Textbox(
+                    value='',
+                    lines=10,
+                    label='inst_template',
+                    interactive=True,
+                    show_copy_button=True,
+                    )
+            with gr.Column(scale=1):
+                chat_template = gr.Textbox(
+                    value='',
+                    lines=10,
+                    label='chat_template',
+                    interactive=True,
+                    show_copy_button=True,
+                    )
+        set_button = gr.Button(value='Save')
+    # 'instruct' tab: instruction + input on the left, outputs on the right.
+    with gr.Tab('instruct'):
+        with gr.Row():
+            with gr.Column(scale=1):
+                instruction = gr.Textbox(
+                    lines=20,
+                    label='instruction',
+                    interactive=True,
+                    show_copy_button=True,
+                    )
+                # NOTE: this name shadows the builtin `input` within this module.
+                input = gr.Textbox(
+                    lines=1,
+                    label='input',
+                    interactive=True,
+                    show_copy_button=True,
+                    )
+            with gr.Column(scale=1):
+                # Receives the text produced by fn.chat.
+                said = gr.Textbox(
+                    label='said',
+                    lines=20,
+                    show_copy_button=True,
+                    )
+                # Receives the value returned by fn.numel.
+                numel = gr.Textbox(
+                    lines=1,
+                    label='numel',
+                    show_copy_button=True,
+                    )
+        inst_button = gr.Button(value='inst')
+        numel_button = gr.Button(value='numel')
+    with gr.Tab('chat'):
+        gr.ChatInterface(fn.chat)
+    # Event wiring. The `inputs` order must match the positional parameters
+    # of the target function in fn.py.
+    set_button.click(
+        fn=fn.set_config,
+        inputs=[model, dtype, instruction, inst_template, chat_template, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
+        outputs=[info],
+        )
+    # The second `input` only fills the positional `history` slot of fn.chat.
+    # NOTE(review): when `instruction` is empty, fn.chat takes its
+    # history-based branch and receives this string as the history —
+    # confirm that is intended.
+    inst_button.click(
+        fn=fn.chat,
+        inputs=[input, input, instruction],
+        outputs=[said],
+        )
+    # Same positional layout as above, but targeting fn.numel.
+    numel_button.click(
+        fn=fn.numel,
+        inputs=[input, input, instruction],
+        outputs=[numel],
+        )
+# Launch the standalone Gradio server only when run as a script; main.py
+# imports `demo` and mounts it into FastAPI instead.
+if __name__ == '__main__':
+    demo.launch()

fn.py ADDED Viewed

	@@ -0,0 +1,184 @@

+import os
+import torch
+import json
+import gc
+import time
+from unsloth import FastLanguageModel
+from transformers import TextIteratorStreamer
+from threading import Thread
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+tokenizer = None
+model = None
+default_cfg = {
+    'model_name': "unsloth/gemma-2-9b-it-bnb-4bit",
+    'dtype': None,
+    'instruction': None,
+    'inst_template': None,
+    'chat_template': None,
+    'max_length': 2400,
+    'max_seq_length': 2048,
+    'max_new_tokens': 512,
+    'temperature': 0.9,
+    'top_p': 0.95,
+    'top_k': 40,
+    'repetition_penalty': 1.2,
+}
+cfg = default_cfg.copy()
+def load_model(model_name, dtype):
+    global tokenizer, model, cfg
+    if cfg['model_name'] == model_name and cfg['dtype'] == dtype:
+        return
+    del model
+    del tokenizer
+    model = None
+    tokenizer = None
+    gc.collect()
+    torch.cuda.empty_cache()
+    model, tokenizer = FastLanguageModel.from_pretrained(
+        model_name,
+        max_seq_length = cfg['max_seq_length'],
+        dtype = torch.bfloat16,
+        load_in_8bit = (dtype == '8bit'),
+        load_in_4bit = (dtype == '4bit'),
+    )
+    FastLanguageModel.for_inference(model)
+    cfg['model_name'] = model_name
+    cfg['dtype'] = dtype
+def clear_config():
+    global cfg
+    cfg = default_cfg.copy()
+def set_config(model_name, dtype, instruction, inst_template, chat_template, max_new_tokens, temperature, top_p, top_k, repetition_penalty):
+    global cfg
+    load_model(model_name, dtype)
+    cfg.update({
+        'instruction': instruction,
+        'inst_template': inst_template,
+        'chat_template': chat_template,
+        'max_new_tokens': int(max_new_tokens),
+        'temperature': float(temperature),
+        'top_p': float(top_p),
+        'top_k': int(top_k),
+        'repetition_penalty': float(repetition_penalty),
+    })
+    return 'done.'
+def set_config_args(args):
+    global cfg
+    load_model(args['model_name'], args['dtype'])
+    cfg.update(args)
+    return 'done.'
+def chatinterface_to_messages(message, history):
+    global cfg
+    messages = []
+    if cfg['instruction']:
+        messages.append({'role': 'system', 'content': cfg['instruction']})
+    for pair in history:
+        [user, assistant] = pair
+        if user:
+            messages.append({'role': 'user', 'content': user})
+        if assistant:
+            messages.append({'role': 'assistant', 'content': assistant})
+    if message:
+        messages.append({'role': 'user', 'content': message})
+    return messages
+def apply_template(messages):
+    global tokenizer, cfg
+    if cfg['chat_template']:
+        tokenizer.chat_template = cfg['chat_template']
+    if type(messages) is str:
+        if cfg['inst_template']:
+            return cfg['inst_template'].format(instruction=cfg['instruction'], input=messages)
+        return cfg['instruction'].format(input=messages)
+    if type(messages) is list:
+        return tokenizer.apply_chat_template(conversation=messages, add_generation_prompt=True, tokenize=False)
+def chat(message, history = [], instruction = None, args = {}):
+    global tokenizer, model, cfg
+    if instruction:
+        cfg['instruction'] = instruction
+        prompt = apply_template(message)
+    else:
+        messages = chatinterface_to_messages(message, history)
+        prompt = apply_template(messages)
+    inputs = tokenizer(prompt, return_tensors="pt",
+        padding=True, max_length=cfg['max_length'], truncation=True).to("cuda")
+    streamer = TextIteratorStreamer(
+        tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True,
+    )
+    generate_kwargs = dict(
+        inputs,
+        do_sample=True,
+        streamer=streamer,
+        num_beams=1,
+    )
+    for k in [
+        'max_new_tokens',
+        'temperature',
+        'top_p',
+        'top_k',
+        'repetition_penalty'
+        ]:
+        if cfg[k]:
+            generate_kwargs[k] = cfg[k]
+    t = Thread(target=model.generate, kwargs=generate_kwargs)
+    t.start()
+    model_output = ""
+    for new_text in streamer:
+        model_output += new_text
+        if 'fastapi' in args:
+            # fastapiは差分だけを返して欲しい
+            yield new_text
+        else:
+            # gradioは常に全文を返して欲しい
+            yield model_output
+def infer(message, history = [], instruction = None, args = {}):
+    content = ''
+    for s in chat(message, history, instruction, args):
+        content += s
+    return content
+def numel(message, history = [], instruction = None, args = {}):
+    global tokenizer, model, cfg
+    if instruction:
+        cfg['instruction'] = instruction
+        prompt = apply_template(message)
+    else:
+        messages = chatinterface_to_messages(message, history)
+        prompt = apply_template(messages)
+    model_inputs = tokenizer([prompt], return_tensors="pt").to(model.device)
+    return torch.numel(model_inputs['input_ids'])
+load_model(cfg['model_name'], '4bit')

install.bat ADDED Viewed

	@@ -0,0 +1,56 @@

+@echo off
+rem -------------------------------------------
+rem NOT guaranteed to work on Windows
+set APPDIR=gemma2_9b_7gb
+set REPOS=https://huggingface.co/spaces/aka7774/%APPDIR%
+set VENV=venv
+rem -------------------------------------------
+rem Work from the directory that contains this script.
+set INSTALL_DIR=%~dp0
+cd /d %INSTALL_DIR%
+:git_clone
+rem Try the git already on PATH first; fall back to PortableGit if the clone
+rem did not produce the app directory.
+set DL_URL=%REPOS%
+set DL_DST=%APPDIR%
+git clone %DL_URL% %APPDIR%
+if exist %DL_DST% goto install_python
+set DL_URL=https://github.com/git-for-windows/git/releases/download/v2.41.0.windows.3/PortableGit-2.41.0.3-64-bit.7z.exe
+set DL_DST=PortableGit-2.41.0.3-64-bit.7z.exe
+curl -L -o %DL_DST% %DL_URL%
+rem bitsadmin is the fallback downloader when curl produced no file.
+if not exist %DL_DST% bitsadmin /transfer dl %DL_URL% %DL_DST%
+%DL_DST% -y
+del %DL_DST%
+set GIT=%INSTALL_DIR%PortableGit\bin\git
+%GIT% clone %REPOS%
+:install_python
+rem Download and unpack a standalone CPython 3.10 build.
+set DL_URL=https://github.com/indygreg/python-build-standalone/releases/download/20240415/cpython-3.10.14+20240415-x86_64-pc-windows-msvc-shared-install_only.tar.gz
+set DL_DST="%INSTALL_DIR%python.tar.gz"
+curl -L -o %DL_DST% %DL_URL%
+if not exist %DL_DST% bitsadmin /transfer dl %DL_URL% %DL_DST%
+tar -xzf %DL_DST%
+set PYTHON=%INSTALL_DIR%python\python.exe
+rem Scripts lives next to the interpreter referenced above (python\), not
+rem under python310\.
+set PATH=%PATH%;%INSTALL_DIR%python\Scripts
+:install_venv
+rem From here on PYTHON points at the virtual environment's interpreter.
+cd %APPDIR%
+%PYTHON% -m venv %VENV%
+set PYTHON=%VENV%\Scripts\python.exe
+:install_pip
+set DL_URL=https://bootstrap.pypa.io/get-pip.py
+set DL_DST=%INSTALL_DIR%get-pip.py
+curl -o %DL_DST% %DL_URL%
+if not exist %DL_DST% bitsadmin /transfer dl %DL_URL% %DL_DST%
+%PYTHON% %DL_DST%
+%PYTHON% -m pip install gradio
+%PYTHON% -m pip install -r requirements.txt
+pause

main.py ADDED Viewed

	@@ -0,0 +1,49 @@

+import os
+import sys
+import time
+import signal
+import io
+from fastapi import FastAPI, Request, status, Form, UploadFile
+from fastapi.staticfiles import StaticFiles
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel, Field
+from fastapi.exceptions import RequestValidationError
+from fastapi.responses import JSONResponse, StreamingResponse
+import fn
+import gradio as gr
+from app import demo
+# FastAPI application that serves the JSON endpoints below and hosts the
+# Gradio UI from app.py under /gradio.
+app = FastAPI()
+# NOTE(review): wildcard origins combined with allow_credentials=True is very
+# permissive — confirm this is intended for the deployment environment.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=['*'],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+gr.mount_gradio_app(app, demo, path="/gradio")
+@app.post("/set_config")
+async def api_set_config(args: dict):
+    content = fn.set_config_args(args)
+    return {'content': content}
+@app.post("/infer")
+async def api_infer(args: dict):
+    args['fastapi'] = True
+    if 'stream' in args and args['stream']:
+        return StreamingResponse(
+            fn.chat(args['input'], [], args['instruct'], args),
+            media_type="text/event-stream",
+        )
+    else:
+        content = fn.infer(args['input'], [], args['instruct'], args)
+        return {'content': content}
+@app.post("/numel")
+async def api_numel(args: dict):
+    content = fn.numel(args['input'], [], args['instruct'], args)
+    return {'numel': content}

requirements.txt ADDED Viewed

	@@ -0,0 +1,21 @@

+# On Windows, flash-attn 2 is difficult to build, so this app probably cannot run there.
+# On WSL2:
+# sudo apt install python3.10-dev
+# wget https://developer.download.nvidia.com/compute/cuda/repos/wsl-ubuntu/x86_64/cuda-keyring_1.0-1_all.deb
+# sudo dpkg -i cuda-keyring_1.0-1_all.deb
+# sudo apt update
+# sudo apt-get install cuda-toolkit-12-1
+# vi ~/.bashrc
+# if [ -e /usr/local/cuda ]; then
+#   export PATH="/usr/local/cuda/bin:$PATH"
+#   export LD_LIBRARY_PATH="/usr/local/cuda/lib64:$LD_LIBRARY_PATH"
+# fi
+fastapi
+uvicorn
+transformers==4.43.3
+bitsandbytes==0.43.3
+accelerate==0.33.0
+peft==0.12.0
+wheel
+python-multipart

venv.sh ADDED Viewed

	@@ -0,0 +1,10 @@

+#!/usr/bin/bash
+python3 -m venv venv
+curl -kL https://bootstrap.pypa.io/get-pip.py | venv/bin/python
+venv/bin/python -m pip install gradio
+venv/bin/python -m pip install -r requirements.txt
+venv/bin/python -m pip install torch==2.3.0 torchvision==0.18.0 torchaudio==2.3.0 --index-url https://download.pytorch.org/whl/cu121
+venv/bin/python -m pip install flash-attn --no-build-isolation
+venv/bin/python -m pip install "unsloth[cu121-ampere-torch230] @ git+https://github.com/unslothai/unsloth.git"