Spaces:
Sleeping
Sleeping
# First commit inspiration:
# https://huggingface.co/spaces/lambeth-dai/Light-PDF-Web-QA-Chatbot/blob/main/app.py
# ---------------------
# model = AutoModelForCausalLM.from_pretrained('TheBloke/Mistral-7B-OpenOrca-GGUF', model_type='mistral',
#                                              model_file='mistral-7b-openorca.Q4_K_M.gguf', **vars(gpu_config))
# ---------------------
import gradio as gr | |
import os | |
from ctransformers import AutoModelForCausalLM, AutoConfig, Config | |
import datetime | |
# --- Sampling defaults baked into the model configuration at load time ---
i_temperature = 0.30
i_repetitionpenalty = 1.2
i_max_new_tokens = 1100
i_contextlength = 12048

# --- Which GGUF weights to fetch: Hugging Face repo id and file inside it ---
repo = 'TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF'
model_file = "tinyllama-1.1b-1t-openorca.Q4_K_M.gguf"

# Text file that writehistory() appends the conversation log to.
logfile = 'TinyLlamaOpenOrca1.1B-stream.txt'

# --- Display settings: avatar pictures used by the chat widget ---
im_user = 'https://github.com/fabiomatricardi/TiniLlamaGradioChat/raw/main/456322.webp'
im_bot = 'https://github.com/fabiomatricardi/TiniLlamaGradioChat/raw/main/TinyLlama_logo.png'

# --- Load the model once at import time and report how long it took ---
print("loading model...")
stt = datetime.datetime.now()
_settings = Config(
    temperature=i_temperature,
    repetition_penalty=i_repetitionpenalty,
    batch_size=64,
    max_new_tokens=i_max_new_tokens,
    context_length=i_contextlength,
)
conf = AutoConfig(_settings)
llm = AutoModelForCausalLM.from_pretrained(
    repo,
    model_file=model_file,
    model_type="llama",
    config=conf,
)
dt = datetime.datetime.now() - stt
print(f"Model loaded in {dt}")
def writehistory(text, path=None):
    """Append one entry to the chat log file.

    Args:
        text: The text to record; a single newline is written after it.
        path: File to append to. When omitted, the module-level ``logfile``
            is used, which keeps existing one-argument callers working.
    """
    target = logfile if path is None else path
    # The with-statement closes the file on exit, so no explicit close() call
    # is needed (the original called f.close() redundantly inside the block).
    with open(target, 'a', encoding='utf-8') as f:
        f.write(f"{text}\n")
# Gradio UI: a title section, a chat pane with a message box, and a side panel
# of sampling sliders. The event handlers are defined inside the Blocks
# context so the click listeners can be attached to the components.
# NOTE(review): the layout nesting below was reconstructed from a copy of the
# file that had lost its indentation -- confirm it against the running app.
with gr.Blocks(theme='ParityError/Interstellar') as demo:
    # TITLE SECTION
    with gr.Row():
        with gr.Column(scale=12):
            # The heading is opened and closed with the same tag; the previous
            # text closed an <h1> with </h2>.
            gr.HTML("<center>"
                    + "<h1>🦙 TinyLlama 1.1B 🐋 OpenOrca 4K context window</h1></center>")
            # Written as adjacent literals so the Markdown lines carry no
            # source-code indentation.
            gr.Markdown(
                "\n"
                "**Currently Running**: [tinyllama-1.1b-1t-openorca.Q4_K_M.gguf](https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF) **Chat History Log File**: *TinyLlamaOpenOrca1.1B-stream.txt*\n"
                "- **Base Model**: PY007/TinyLlama-1.1B-intermediate-step-480k-1T, Fine tuned on OpenOrca GPT4 subset for 1 epoch,Using CHATML format.\n"
                "- **License**: Apache 2.0, following the TinyLlama base model. The model output is not censored and the authors do not endorse the opinions in the generated content. Use at your own risk.\n"
                "- **Notes**: this is my first commit. So far the chat is not considering the conversation history. **Note2**: log TXT file is not working too\n"
            )
        gr.Image(value=im_bot, width=80)

    # chat and parameters settings
    with gr.Row():
        with gr.Column(scale=4):
            chatbot = gr.Chatbot(height=350, show_copy_button=True,
                                 avatar_images=[im_user, im_bot])
            with gr.Row():
                with gr.Column(scale=14):
                    msg = gr.Textbox(show_label=False,
                                     placeholder="Enter text",
                                     lines=2)
                submitBtn = gr.Button("\n💬 Send\n", size="lg", variant="primary", min_width=180)

        with gr.Column(min_width=50, scale=2):
            with gr.Tab(label="Parameter Setting"):
                gr.Markdown("# Parameters")
                top_p = gr.Slider(
                    minimum=0,
                    maximum=1.0,
                    value=0.95,
                    step=0.05,
                    interactive=True,
                    label="Top-p",
                )
                temperature = gr.Slider(
                    minimum=0.1,
                    maximum=1.0,
                    value=0.30,
                    step=0.01,
                    interactive=True,
                    label="Temperature",
                )
                max_length_tokens = gr.Slider(
                    minimum=0,
                    maximum=4096,
                    value=1060,
                    step=4,
                    interactive=True,
                    label="Max Generation Tokens",
                )
                rep_pen = gr.Slider(
                    minimum=0,
                    maximum=5,
                    value=1.2,
                    step=0.05,
                    interactive=True,
                    label="Repetition Penalty",
                )
            clear = gr.Button("🗑️ Clear All Messages", variant='secondary')

    def user(user_message, history):
        """Log the submitted message and append it to the chat as a pending turn.

        Returns an empty string (which clears the textbox) and the history
        extended with ``[user_message, None]``; ``bot`` fills in the ``None``.
        """
        writehistory(f"USER: {user_message}")
        return "", history + [[user_message, None]]

    def bot(history, t, p, m, r):
        """Stream the model's reply to the most recent user message.

        Args:
            history: Chat turns as ``[user, assistant]`` pairs; the last pair
                is the pending turn added by ``user``.
            t: Sampling temperature (Temperature slider).
            p: Nucleus sampling threshold (Top-p slider).
            m: Maximum number of new tokens (Max Generation Tokens slider).
            r: Repetition penalty (Repetition Penalty slider).

        Yields:
            The history list after each streamed chunk, so the Chatbot
            component re-renders the growing answer.
        """
        SYSTEM_PROMPT = (
            "<|im_start|>system\n"
            "You are a helpful bot. Your answers are clear and concise.\n"
            "<|im_end|>\n"
        )
        # ChatML prompt. The system prompt is now actually sent; it used to be
        # defined but unused, leaving an empty system turn in the prompt.
        # Only the latest user message is included: earlier turns are not yet
        # passed to the model.
        prompt = (
            f"{SYSTEM_PROMPT}"
            f"<|im_start|>user\n{history[-1][0]}<|im_end|>\n"
            "<|im_start|>assistant\n"
        )
        print(f"history lenght: {len(history)}")
        if len(history) == 1:
            print("this is the first round")
        else:
            print("here we should pass more conversations")

        # Start from an empty answer and grow it chunk by chunk.
        history[-1][1] = ""
        for chunk in llm(prompt,
                         temperature=t,
                         top_p=p,
                         repetition_penalty=r,
                         max_new_tokens=int(m),  # slider values may arrive as floats
                         stop=['<|im_end|>'],
                         stream=True):
            history[-1][1] += chunk
            yield history

        # Record only the reply just generated, matching the terminal log
        # below; the file log used to receive the repr of the whole history.
        writehistory(f"temperature: {t}, top_p: {p}, maxNewTokens: {m}, repetitionPenalty: {r}\n---\nBOT: {history[-1][1]}\n\n")
        # Log in the terminal the messages
        print(f"USER: {history[-1][0]}\n---\ntemperature: {t}, top_p: {p}, maxNewTokens: {m}, repetitionPenalty: {r}\n---\nBOT: {history[-1][1]}\n\n")

    # Clicking the submitBtn first stores the user turn (unqueued, so the
    # textbox clears immediately), then streams the generation using the
    # parameters currently set on the sliders.
    submitBtn.click(user, [msg, chatbot], [msg, chatbot], queue=False).then(
        bot, [chatbot, temperature, top_p, max_length_tokens, rep_pen], chatbot
    )
    # Returning None to the Chatbot output empties the conversation.
    clear.click(lambda: None, None, chatbot, queue=False)

demo.queue()  # required to yield the streams from the text generation
demo.launch(inbrowser=True)