# First Commit inspiration:
#https://huggingface.co/spaces/lambeth-dai/Light-PDF-Web-QA-Chatbot/blob/main/app.py
#---------------------
#model = AutoModelForCausalLM.from_pretrained('TheBloke/Mistral-7B-OpenOrca-GGUF', model_type='mistral',
#model_file='mistral-7b-openorca.Q4_K_M.gguf', **vars(gpu_config))
#---------------------
import gradio as gr
import os
from ctransformers import AutoModelForCausalLM, AutoConfig, Config
import datetime
i_temperature = 0.30
i_max_new_tokens = 1100
repo = 'TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF'
model_file = "tinyllama-1.1b-1t-openorca.Q4_K_M.gguf"
i_repetitionpenalty = 1.2
i_contextlength = 12048
logfile = 'TinyLlamaOpenOrca1.1B-stream.txt'
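# Build the ctransformers generation config (sampling temperature, repetition penalty,
# batch size, max new tokens, context window) and load the GGUF weights from the Hub.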
print("loading model...")
stt = datetime.datetime.now()
conf = AutoConfig(Config(temperature=i_temperature, repetition_penalty=i_repetitionpenalty, batch_size=64,
max_new_tokens=i_max_new_tokens, context_length=i_contextlength))
llm = AutoModelForCausalLM.from_pretrained(repo, model_file=model_file,
model_type="llama",config = conf)
dt = datetime.datetime.now() - stt
print(f"Model loaded in {dt}")
# Model settings and avatar images, also used in the chat display
im_user = 'https://github.com/fabiomatricardi/TiniLlamaGradioChat/raw/main/456322.webp'
im_bot = 'https://github.com/fabiomatricardi/TiniLlamaGradioChat/raw/main/TinyLlama_logo.png'
def writehistory(text):
    # Append one line of chat history to the log file
    with open(logfile, 'a', encoding='utf-8') as f:
        f.write(text)
        f.write('\n')
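# Each chat turn appends a "USER: ..." line and a settings/BOT block to the log file.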
with gr.Blocks(theme='ParityError/Interstellar') as demo:
#TITLE SECTION
with gr.Row():
with gr.Column(scale=12):
gr.HTML("<center>"
+ "<h1>π¦ TinyLlama 1.1B π OpenOrca 4K context window</h2></center>")
gr.Markdown("""
**Currently Running**: [tinyllama-1.1b-1t-openorca.Q4_K_M.gguf](https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF)
**Chat History Log File**: *TinyLlamaOpenOrca1.1B-stream.txt*
- **Base Model**: PY007/TinyLlama-1.1B-intermediate-step-480k-1T, fine-tuned on the OpenOrca GPT-4 subset for 1 epoch, using the ChatML format.
- **License**: Apache 2.0, following the TinyLlama base model. The model output is not censored and the authors do not endorse the opinions in the generated content. Use at your own risk.
- **Notes**: this is my first commit, so the chat does not yet take the conversation history into account. **Note 2**: the TXT log file is not working yet either.
""")
gr.Image(value=im_bot, width=80)
# chat and parameters settings
with gr.Row():
with gr.Column(scale=4):
chatbot = gr.Chatbot(height = 350, show_copy_button=True,
avatar_images = [im_user,im_bot])
with gr.Row():
with gr.Column(scale=14):
msg = gr.Textbox(show_label=False,
placeholder="Enter text",
lines=2)
                    submitBtn = gr.Button("\n💬 Send\n", size="lg", variant="primary", min_width=180)
with gr.Column(min_width=50,scale=2):
with gr.Tab(label="Parameter Setting"):
gr.Markdown("# Parameters")
top_p = gr.Slider(
                        minimum=0.0,
maximum=1.0,
value=0.95,
step=0.05,
interactive=True,
label="Top-p",
)
temperature = gr.Slider(
minimum=0.1,
maximum=1.0,
value=0.30,
step=0.01,
interactive=True,
label="Temperature",
)
max_length_tokens = gr.Slider(
minimum=0,
maximum=4096,
value=1060,
step=4,
interactive=True,
label="Max Generation Tokens",
)
rep_pen = gr.Slider(
minimum=0,
maximum=5,
value=1.2,
step=0.05,
interactive=True,
label="Repetition Penalty",
)
                clear = gr.Button("🗑️ Clear All Messages", variant='secondary')
def user(user_message, history):
writehistory(f"USER: {user_message}")
return "", history + [[user_message, None]]
def bot(history,t,p,m,r):
SYSTEM_PROMPT = """<|im_start|>system
You are a helpful bot. Your answers are clear and concise.
<|im_end|>
"""
prompt = f"<|im_start|>system<|im_end|><|im_start|>user\n{history[-1][0]}<|im_end|>\n<|im_start|>assistant\n"
print(f"history lenght: {len(history)}")
if len(history) == 1:
print("this is the first round")
else:
print("here we should pass more conversations")
history[-1][1] = ""
for character in llm(prompt,
temperature = t,
top_p = p,
repetition_penalty = r,
max_new_tokens=m,
stop = ['<|im_end|>'],
stream = True):
history[-1][1] += character
yield history
writehistory(f"temperature: {t}, top_p: {p}, maxNewTokens: {m}, repetitionPenalty: {r}\n---\nBOT: {history}\n\n")
        # Log the exchanged messages in the terminal
print(f"USER: {history[-1][0]}\n---\ntemperature: {t}, top_p: {p}, maxNewTokens: {m}, repetitionPenalty: {r}\n---\nBOT: {history[-1][1]}\n\n")
    # Clicking submitBtn adds the user message, then calls the generation with the parameters from the sliders
submitBtn.click(user, [msg, chatbot], [msg, chatbot], queue=False).then(
bot, [chatbot,temperature,top_p,max_length_tokens,rep_pen], chatbot
)
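    # Clicking Clear resets the chatbot component; the log file is left untouched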
clear.click(lambda: None, None, chatbot, queue=False)
demo.queue() #required to yield the streams from the text generation
demo.launch(inbrowser=True)
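# To try the script outside the Space (assumption, not part of the original file):
#   pip install gradio ctransformers
#   python app.py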