aka7774 committed on
Commit
3350c44
1 Parent(s): 6bbfa42

Upload 6 files

Files changed (6)
  1. app.py +117 -0
  2. fn.py +202 -0
  3. install.bat +56 -0
  4. main.py +43 -0
  5. requirements.txt +5 -0
  6. venv.sh +7 -0
app.py ADDED
@@ -0,0 +1,117 @@
+import fn
+import gradio as gr
+
+with gr.Blocks() as demo:
+    gr.Markdown('# gemma2')
+    with gr.Tab('config'):
+        info = gr.Markdown()
+        with gr.Row():
+            with gr.Column(scale=1):
+                size = gr.Dropdown(
+                    value=fn.cfg['size'],
+                    choices=['9b','27b'],
+                    label='size',
+                    interactive=True,
+                )
+
+            with gr.Column(scale=1):
+                max_new_tokens = gr.Textbox(
+                    value=fn.cfg['max_new_tokens'],
+                    label='max_new_tokens',
+                    interactive=True,
+                    show_copy_button=True,
+                )
+                temperature = gr.Textbox(
+                    value=fn.cfg['temperature'],
+                    label='temperature',
+                    interactive=True,
+                    show_copy_button=True,
+                )
+                top_p = gr.Textbox(
+                    value=fn.cfg['top_p'],
+                    label='top_p',
+                    interactive=True,
+                    show_copy_button=True,
+                )
+                top_k = gr.Textbox(
+                    value=fn.cfg['top_k'],
+                    label='top_k',
+                    interactive=True,
+                    show_copy_button=True,
+                )
+                repetition_penalty = gr.Textbox(
+                    value=fn.cfg['repetition_penalty'],
+                    label='repetition_penalty',
+                    interactive=True,
+                    show_copy_button=True,
+                )
+
+        with gr.Row():
+            with gr.Column(scale=1):
+                inst_template = gr.Textbox(
+                    value='',
+                    lines=10,
+                    label='inst_template',
+                    interactive=True,
+                    show_copy_button=True,
+                )
+                is_use_cache = gr.Checkbox(
+                    value=False,
+                    label='is_use_cache',
+                    interactive=True,
+                )
+
+        set_button = gr.Button(value='Save')
+
+    with gr.Tab('instruct'):
+        with gr.Row():
+            with gr.Column(scale=1):
+                instruction = gr.Textbox(
+                    lines=20,
+                    label='instruction',
+                    interactive=True,
+                    show_copy_button=True,
+                )
+                input = gr.Textbox(
+                    lines=1,
+                    label='input',
+                    interactive=True,
+                    show_copy_button=True,
+                )
+            with gr.Column(scale=1):
+                said = gr.Textbox(
+                    label='said',
+                    lines=20,
+                    show_copy_button=True,
+                )
+                numel = gr.Textbox(
+                    lines=1,
+                    label='numel',
+                    show_copy_button=True,
+                )
+        inst_button = gr.Button(value='inst')
+        numel_button = gr.Button(value='numel')
+
+    with gr.Tab('chat'):
+        gr.ChatInterface(fn.chat)
+
+    set_button.click(
+        fn=fn.set_config,
+        inputs=[size, instruction, inst_template, is_use_cache, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
+        outputs=[info],
+    )
+
+    inst_button.click(
+        fn=fn.chat,
+        inputs=[input, input, instruction],
+        outputs=[said],
+    )
+
+    numel_button.click(
+        fn=fn.numel,
+        inputs=[input, input, instruction],
+        outputs=[numel],
+    )
+
+if __name__ == '__main__':
+    demo.launch()
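
Note: the instruct tab wires inst_button to fn.chat with inputs=[input, input, instruction], so the same textbox value is passed as both message and history; because an instruction is supplied, fn.chat takes its instruction branch and the history argument is not used. A minimal sketch of the equivalent direct calls (not included in this commit; the instruction and input strings are made up for illustration):

import fn

# Mirrors the Save button: load the 9b model and store the generation settings.
fn.set_config('9b', 'Translate the input into English.', '', False, 1024, 0.9, 0.95, 40, 1.2)

# Mirrors the inst button: (message, history, instruction); history is ignored here.
print(fn.chat('こんにちは', 'こんにちは', 'Translate the input into English.'))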
fn.py ADDED
@@ -0,0 +1,202 @@
+import os
+import re
+import torch
+import datetime
+import json
+import csv
+import gc
+
+import local_gemma
+from transformers import AutoTokenizer, TextStreamer
+from transformers import TextIteratorStreamer
+from transformers import BitsAndBytesConfig, GPTQConfig
+from threading import Thread
+
+tokenizer = None
+model = None
+default_cfg = {
+    'size': None,
+    'instruction': None,
+    'inst_template': None,
+    'is_use_cache': False,
+    'max_new_tokens': 1024,
+    'temperature': 0.9,
+    'top_p': 0.95,
+    'top_k': 40,
+    'repetition_penalty': 1.2,
+}
+cfg = default_cfg.copy()
+cache = None
+chat_history = []
+
+def load_model(size = '9b'):
+    global tokenizer, model, cfg
+
+    if cfg['size'] == size:
+        return
+
+    # Free the previously loaded model before switching sizes.
+    del model
+    del tokenizer
+    model = None
+    tokenizer = None
+    gc.collect()
+    torch.cuda.empty_cache()
+
+    model_name = f"SillyTilly/google-gemma-2-{size}-it"
+
+    model = local_gemma.LocalGemma2ForCausalLM.from_pretrained(model_name, preset="memory")
+    model._supports_cache_class = True
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+    cfg['size'] = size
+
+def clear_config():
+    global cfg
+    cfg = default_cfg.copy()
+
+def set_config(size, instruction, inst_template, is_use_cache, max_new_tokens, temperature, top_p, top_k, repetition_penalty):
+    global cfg
+    load_model(size)
+    cfg.update({
+        'instruction': instruction,
+        'inst_template': inst_template,
+        'is_use_cache': bool(is_use_cache),
+        'max_new_tokens': int(max_new_tokens),
+        'temperature': float(temperature),
+        'top_p': float(top_p),
+        'top_k': int(top_k),
+        'repetition_penalty': float(repetition_penalty),
+    })
+    return 'done.'
+
+def set_config_args(args):
+    global cfg
+
+    load_model(args['size'])
+    cfg.update(args)
+
+    return 'done.'
+
+def chatinterface_to_messages(message, history):
+    global cfg
+
+    messages = []
+
+    if cfg['instruction']:
+        messages.append({'role': 'user', 'content': cfg['instruction']})
+        # user and assistant turns must alternate
+        if message:
+            messages.append({'role': 'assistant', 'content': '了解しました。'})  # "Understood."
+
+    for pair in history:
+        [user, assistant] = pair
+        if user:
+            messages.append({'role': 'user', 'content': user})
+        if assistant:
+            messages.append({'role': 'assistant', 'content': assistant})
+
+    if message:
+        messages.append({'role': 'user', 'content': message})
+
+    return messages
+
+def apply_template(messages):
+    global tokenizer, cfg, cache, chat_history
+
+    if type(messages) is str:
+        if cfg['inst_template']:
+            user_input = cfg['inst_template'].format(instruction=cfg['instruction'], input=messages)
+        else:
+            user_input = cfg['instruction'].format(input=messages)
+        tokenized_chat = tokenizer(user_input, return_tensors="pt").input_ids
+    if type(messages) is list:
+        tokenized_chat = tokenizer.apply_chat_template(
+            messages + chat_history, tokenize=True, add_generation_prompt=True, return_tensors="pt"
+        )
+    return tokenized_chat
+
+def chat(message, history = [], instruction = None, args = {}):
+    global tokenizer, model, cfg, cache, chat_history
+
+    if instruction:
+        cfg['instruction'] = instruction
+        tokenized_chat = apply_template(message)
+    else:
+        messages = chatinterface_to_messages(message, history)
+        tokenized_chat = apply_template(messages)
+
+    device = local_gemma.utils.config.infer_device(None)
+    is_use_cache = cfg['is_use_cache']
+    generation_kwargs = local_gemma.utils.config.get_generation_kwargs('chat')
+
+    streamer = TextStreamer(tokenizer, skip_prompt=True, **{"skip_special_tokens": True})
+    tokenized_chat = tokenized_chat.to(device)
+    generation_kwargs.update(
+        {
+            "streamer": streamer,
+            "assistant_model": None,
+            "return_dict_in_generate": True,
+            "past_key_values": cache,
+        }
+    )
+
+    for k in [
+        'max_new_tokens',
+        'temperature',
+        'top_p',
+        'top_k',
+        'repetition_penalty'
+    ]:
+        if cfg[k]:
+            generation_kwargs[k] = cfg[k]
+
+    # TODO(joao): this if shouldn't be needed, fix in transformers
+    if cache is not None:
+        generation_kwargs["cache_implementation"] = None
+
+    if cfg['max_new_tokens'] is not None:
+        input_ids_len = tokenized_chat.shape[-1]
+        max_cache_len = cfg['max_new_tokens'] + input_ids_len
+        if cache is not None and cache.max_cache_len < max_cache_len:
+            # reset the cache
+            generation_kwargs.pop("past_key_values")
+            generation_kwargs["cache_implementation"] = "hybrid"
+    else:
+        generation_kwargs["max_length"] = model.config.max_position_embeddings
+
+    gen_out = model.generate(input_ids=tokenized_chat, **generation_kwargs)
+
+    # Store the cache for the next generation round; pull the model output into the chat history.
+    cache = gen_out.past_key_values
+    model_tokens = gen_out.sequences[0, tokenized_chat.shape[1]:]
+    model_output_text = tokenizer.decode(model_tokens, skip_special_tokens=True)
+    chat_history += [{"role": "user", "content": message},]
+    chat_history += [{"role": "assistant", "content": model_output_text},]
+
+    # Sanity check: EOS was removed, ends in "<end_of_turn>\n"
+    tokenized_chat = tokenizer.apply_chat_template(
+        chat_history, tokenize=True, add_generation_prompt=False, return_tensors="pt"
+    ).tolist()[0]
+    assert tokenized_chat[0] == 2
+    assert tokenized_chat[-1] == 108
+    assert tokenized_chat[-2] == 107
+
+    if not is_use_cache:
+        cache = None
+        chat_history = []
+
+    return model_output_text
+
+def infer(message, history = [], instruction = None, args = {}):
+    return chat(message, history, instruction, args)
+
+def numel(message, history = [], instruction = None, args = {}):
+    global tokenizer, model, cfg, cache, chat_history
+
+    if instruction:
+        cfg['instruction'] = instruction
+        tokenized_chat = apply_template(message)
+    else:
+        messages = chatinterface_to_messages(message, history)
+        tokenized_chat = apply_template(messages)
+
+    return torch.numel(tokenized_chat)
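
Note: when inst_template is set, apply_template() formats it with the current instruction and the raw input before tokenizing; otherwise the instruction itself is formatted with the input. A minimal sketch of what the config tab's inst_template field could look like (the template text below is an assumption for illustration, not shipped with this commit):

import fn

# Hypothetical template; {instruction} and {input} are the placeholders
# apply_template() fills in on the single-prompt (string) path.
fn.cfg['instruction'] = 'Summarize the input in one sentence.'
fn.cfg['inst_template'] = '{instruction}\n\n### Input\n{input}\n\n### Response\n'

prompt = fn.cfg['inst_template'].format(
    instruction=fn.cfg['instruction'],
    input='Gemma 2 is a family of open models from Google.',
)
print(prompt)  # the string that apply_template() would tokenize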
install.bat ADDED
@@ -0,0 +1,56 @@
+@echo off
+
+rem -------------------------------------------
+rem NOT guaranteed to work on Windows
+
+set APPDIR=gemma2
+set REPOS=https://huggingface.co/spaces/aka7774/%APPDIR%
+set VENV=venv
+
+rem -------------------------------------------
+
+set INSTALL_DIR=%~dp0
+cd /d %INSTALL_DIR%
+
+:git_clone
+set DL_URL=%REPOS%
+set DL_DST=%APPDIR%
+git clone %DL_URL% %APPDIR%
+if exist %DL_DST% goto install_python
+
+set DL_URL=https://github.com/git-for-windows/git/releases/download/v2.41.0.windows.3/PortableGit-2.41.0.3-64-bit.7z.exe
+set DL_DST=PortableGit-2.41.0.3-64-bit.7z.exe
+curl -L -o %DL_DST% %DL_URL%
+if not exist %DL_DST% bitsadmin /transfer dl %DL_URL% %DL_DST%
+%DL_DST% -y
+del %DL_DST%
+
+set GIT=%INSTALL_DIR%PortableGit\bin\git
+%GIT% clone %REPOS%
+
+:install_python
+set DL_URL=https://github.com/indygreg/python-build-standalone/releases/download/20240415/cpython-3.10.14+20240415-x86_64-pc-windows-msvc-shared-install_only.tar.gz
+set DL_DST="%INSTALL_DIR%python.tar.gz"
+curl -L -o %DL_DST% %DL_URL%
+if not exist %DL_DST% bitsadmin /transfer dl %DL_URL% %DL_DST%
+tar -xzf %DL_DST%
+
+set PYTHON=%INSTALL_DIR%python\python.exe
+set PATH=%PATH%;%INSTALL_DIR%python310\Scripts
+
+:install_venv
+cd %APPDIR%
+%PYTHON% -m venv %VENV%
+set PYTHON=%VENV%\Scripts\python.exe
+
+:install_pip
+set DL_URL=https://bootstrap.pypa.io/get-pip.py
+set DL_DST=%INSTALL_DIR%get-pip.py
+curl -o %DL_DST% %DL_URL%
+if not exist %DL_DST% bitsadmin /transfer dl %DL_URL% %DL_DST%
+%PYTHON% %DL_DST%
+
+%PYTHON% -m pip install gradio
+%PYTHON% -m pip install -r requirements.txt
+
+pause
main.py ADDED
@@ -0,0 +1,43 @@
+import os
+import sys
+import time
+import signal
+import io
+
+from fastapi import FastAPI, Request, status, Form, UploadFile
+from fastapi.staticfiles import StaticFiles
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel, Field
+from fastapi.exceptions import RequestValidationError
+from fastapi.responses import JSONResponse, StreamingResponse
+import fn
+import gradio as gr
+from app import demo
+
+app = FastAPI()
+
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=['*'],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+gr.mount_gradio_app(app, demo, path="/gradio")
+
+@app.post("/set_config")
+async def api_set_config(args: dict):
+    content = fn.set_config_args(args)
+    return {'content': content}
+
+@app.post("/infer")
+async def api_infer(args: dict):
+    args['fastapi'] = True
+    content = fn.infer(args['input'], [], args['instruct'], args)
+    return {'content': content}
+
+@app.post("/numel")
+async def api_numel(args: dict):
+    content = fn.numel(args['input'], [], args['instruct'], args)
+    return {'numel': content}
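
Note: a minimal client sketch for these endpoints (not included in this commit; the host and port are assumptions and depend on how main:app is served):

import requests

BASE = 'http://localhost:8000'  # assumed; use whatever host/port serves main:app

# /set_config forwards the JSON body to fn.set_config_args(), which reads 'size' at minimum.
requests.post(f'{BASE}/set_config', json={
    'size': '9b',
    'instruction': 'Answer briefly.',
    'max_new_tokens': 256,
})

# /infer expects 'input' and 'instruct' (the per-request instruction).
r = requests.post(f'{BASE}/infer', json={'input': 'Hello!', 'instruct': 'Answer briefly.'})
print(r.json()['content'])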
requirements.txt ADDED
@@ -0,0 +1,5 @@
+fastapi
+uvicorn
+local-gemma
+bitsandbytes
+python-multipart
venv.sh ADDED
@@ -0,0 +1,7 @@
+#!/usr/bin/bash
+
+python3 -m venv venv
+curl -kL https://bootstrap.pypa.io/get-pip.py | venv/bin/python
+
+venv/bin/python -m pip install gradio
+venv/bin/python -m pip install -r requirements.txt
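
Note: the commit does not include a launch command. One way to serve main:app with uvicorn (already pinned in requirements.txt) after running venv.sh is sketched below; the helper file name, host, and port are assumptions, not part of this repo:

# run.py (hypothetical helper, not part of this commit)
import uvicorn

if __name__ == '__main__':
    # Serves the FastAPI app from main.py; the Gradio UI is mounted at /gradio.
    uvicorn.run('main:app', host='0.0.0.0', port=8000)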