Sean-Case
committed on
Commit • 994ad90
1 Parent(s): 275393f
Upgraded large model to Mistral OpenOrca 7B Q4. More checks for empty questions.
Files changed:
- app.py (+12 -14)
- chatfuncs/chatfuncs.py (+4 -4)
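The main change swaps the ctransformers-loaded GGUF from Orca Mini 3B to a 4-bit (Q4_K_M) quantisation of Mistral 7B OpenOrca. A minimal, standalone sketch of the new load call, using only the repository and file names that appear in the app.py diff below (the prompt string is just a placeholder, not the app's real prompt):

# Minimal sketch of the upgraded model load (ctransformers), not the app's exact code.
from ctransformers import AutoModelForCausalLM

# gpu_layers=0 keeps everything on CPU; raise it only if a GPU is available.
model = AutoModelForCausalLM.from_pretrained(
    'TheBloke/Mistral-7B-OpenOrca-GGUF',
    model_type='llama',
    model_file='mistral-7b-openorca.Q4_K_M.gguf',
    gpu_layers=0,
)

print(model("What is the Lambeth 2030 borough plan about?", max_new_tokens=64))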
app.py CHANGED
@@ -79,7 +79,7 @@ def load_model(model_type, gpu_layers, gpu_config=None, cpu_config=None, torch_d
     if torch_device is None:
        torch_device = chatf.torch_device

-    if model_type == "Orca
+    if model_type == "Mistral Open Orca (larger, slow)":

        gpu_config.update_gpu(gpu_layers)
        cpu_config.update_gpu(gpu_layers)

@@ -90,16 +90,14 @@ def load_model(model_type, gpu_layers, gpu_config=None, cpu_config=None, torch_d
    print(vars(cpu_config))

    try:
-        #model = AutoModelForCausalLM.from_pretrained('
-        model = AutoModelForCausalLM.from_pretrained('Aryanne/Orca-Mini-3B-gguf', model_type='llama', model_file='q5_0-orca-mini-3b.gguf', **vars(gpu_config)) # **asdict(CtransRunConfig_cpu())
+        #model = AutoModelForCausalLM.from_pretrained('Aryanne/Orca-Mini-3B-gguf', model_type='llama', model_file='q5_0-orca-mini-3b.gguf', **vars(gpu_config)) # **asdict(CtransRunConfig_cpu())
        #model = AutoModelForCausalLM.from_pretrained('Aryanne/Wizard-Orca-3B-gguf', model_type='llama', model_file='q4_1-wizard-orca-3b.gguf', **vars(gpu_config)) # **asdict(CtransRunConfig_cpu())
-
+        model = AutoModelForCausalLM.from_pretrained('TheBloke/Mistral-7B-OpenOrca-GGUF', model_type='llama', model_file='mistral-7b-openorca.Q4_K_M.gguf', **vars(gpu_config)) # **asdict(CtransRunConfig_cpu())
+
    except:
-        #model = AutoModelForCausalLM.from_pretrained('
-        model = AutoModelForCausalLM.from_pretrained('Aryanne/Orca-Mini-3B-gguf', model_type='llama', model_file='q5_0-orca-mini-3b.gguf', **vars(cpu_config)) #**asdict(CtransRunConfig_gpu())
+        #model = AutoModelForCausalLM.from_pretrained('Aryanne/Orca-Mini-3B-gguf', model_type='llama', model_file='q5_0-orca-mini-3b.gguf', **vars(cpu_config)) #**asdict(CtransRunConfig_gpu())
        #model = AutoModelForCausalLM.from_pretrained('Aryanne/Wizard-Orca-3B-gguf', model_type='llama', model_file='q4_1-wizard-orca-3b.gguf', **vars(cpu_config)) # **asdict(CtransRunConfig_cpu())
-
-
+        model = AutoModelForCausalLM.from_pretrained('TheBloke/Mistral-7B-OpenOrca-GGUF', model_type='llama', model_file='mistral-7b-openorca.Q4_K_M.gguf', **vars(cpu_config)) # **asdict(CtransRunConfig_cpu())

    tokenizer = []

@@ -138,7 +136,7 @@ def load_model(model_type, gpu_layers, gpu_config=None, cpu_config=None, torch_d
    return model_type, load_confirmation, model_type

# Both models are loaded on app initialisation so that users don't have to wait for the models to be downloaded
-model_type = "Orca
+model_type = "Mistral Open Orca (larger, slow)"

load_model(model_type, chatf.gpu_layers, chatf.gpu_config, chatf.cpu_config, chatf.torch_device)

@@ -181,7 +179,7 @@ with block:

    gr.Markdown("<h1><center>Lightweight PDF / web page QA bot</center></h1>")

-    gr.Markdown("Chat with PDF or web page documents. The default is a small model (Flan Alpaca), that can only answer specific questions that are answered in the text. It cannot give overall impressions of, or summarise the document. The alternative (Orca
+    gr.Markdown("Chat with PDF or web page documents. The default is a small model (Flan Alpaca) that can only answer specific questions that are answered in the text. It cannot give overall impressions of, or summarise, the document. The alternative (Mistral Open Orca (larger, slow)) can reason a little better, but is much slower (see the Advanced tab).\n\nBy default the Lambeth Borough Plan '[Lambeth 2030 : Our Future, Our Lambeth](https://www.lambeth.gov.uk/better-fairer-lambeth/projects/lambeth-2030-our-future-our-lambeth)' is loaded. If you want to talk about another document or web page, please select from the second tab. If switching topic, please click the 'Clear chat' button.\n\nCaution: This is a public app. Please ensure that the document you upload is not sensitive in any way, as other users may see it! Also, please note that LLM chatbots may give incomplete or incorrect information, so please use with care.")

    with gr.Row():
        current_source = gr.Textbox(label="Current data source(s)", value="Lambeth_2030-Our_Future_Our_Lambeth.pdf", scale = 10)

@@ -197,7 +195,7 @@ with block:

    with gr.Row():
        message = gr.Textbox(
-            label="
+            label="Enter your question here.",
            lines=1,
        )
    with gr.Row():

@@ -231,14 +229,14 @@ with block:
        ingest_embed_out = gr.Textbox(label="File/webpage preparation progress")

    with gr.Tab("Advanced features"):
-        model_choice = gr.Radio(label="Choose a chat model", value="Flan Alpaca (small, fast)", choices = ["Flan Alpaca (small, fast)", "Orca
+        model_choice = gr.Radio(label="Choose a chat model", value="Flan Alpaca (small, fast)", choices = ["Flan Alpaca (small, fast)", "Mistral Open Orca (larger, slow)"])
        with gr.Row():
-            gpu_layer_choice = gr.Slider(label="Choose number of model layers to send to GPU (WARNING: please don't modify unless you have a GPU).", value=0, minimum=0, maximum=
+            gpu_layer_choice = gr.Slider(label="Choose number of model layers to send to GPU (WARNING: please don't modify unless you have a GPU).", value=0, minimum=0, maximum=5, step = 1, visible=True)
            change_model_button = gr.Button(value="Load model", scale=0)
            load_text = gr.Text(label="Load status")

    gr.HTML(
-        "<center>This app is based on the models Flan Alpaca and Orca
+        "<center>This app is based on the models Flan Alpaca and Mistral Open Orca. It is powered by Gradio, Transformers, Ctransformers, and Langchain.</center>"
    )

    examples_set.change(fn=chatf.update_message, inputs=[examples_set], outputs=[message])
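In load_model above, vars(gpu_config) and vars(cpu_config) are splatted into from_pretrained after update_gpu(gpu_layers) sets the offload count. Those config objects are defined in chatfuncs.py and are not part of this commit, so the following is only a hypothetical illustration of the pattern they appear to follow:

# Hypothetical illustration only: the real config classes live in chatfuncs/chatfuncs.py
# and are not shown in this commit.
from dataclasses import dataclass

@dataclass
class ExampleCtransConfig:
    temperature: float = 0.1
    context_length: int = 2048
    gpu_layers: int = 0          # how many transformer layers to offload to the GPU

    def update_gpu(self, new_value: int):
        self.gpu_layers = new_value

gpu_config = ExampleCtransConfig()
gpu_config.update_gpu(5)         # mirrors gpu_config.update_gpu(gpu_layers) in app.py

# vars() yields a plain dict, which is why it can be splatted into from_pretrained:
print(vars(gpu_config))          # {'temperature': 0.1, 'context_length': 2048, 'gpu_layers': 5}
# model = AutoModelForCausalLM.from_pretrained(..., **vars(gpu_config))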
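The Advanced tab hunk adds the new model to the gr.Radio choices and caps the GPU-layer gr.Slider at 5, but the event wiring to load_model sits outside the hunks shown. A hypothetical, self-contained sketch of how such controls are typically connected in Gradio (the stub loader below is not the app's real load_model):

# Hypothetical wiring sketch; the actual app connects these controls to load_model()
# elsewhere in app.py, outside this diff.
import gradio as gr

def load_model_stub(model_type, gpu_layers):
    # Stand-in for app.py's load_model(); just returns a status string.
    return f"Loaded {model_type} with {gpu_layers} GPU layer(s)."

with gr.Blocks() as demo:
    model_choice = gr.Radio(label="Choose a chat model",
                            value="Flan Alpaca (small, fast)",
                            choices=["Flan Alpaca (small, fast)", "Mistral Open Orca (larger, slow)"])
    gpu_layer_choice = gr.Slider(label="Model layers to send to GPU",
                                 value=0, minimum=0, maximum=5, step=1)
    change_model_button = gr.Button(value="Load model")
    load_text = gr.Text(label="Load status")

    # Clicking the button passes the current radio and slider values to the loader.
    change_model_button.click(fn=load_model_stub,
                              inputs=[model_choice, gpu_layer_choice],
                              outputs=[load_text])

# demo.launch()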
chatfuncs/chatfuncs.py CHANGED
@@ -315,8 +315,8 @@ QUESTION: {question}

    if model_type == "Flan Alpaca (small, fast)":
        INSTRUCTION_PROMPT=PromptTemplate(template=instruction_prompt_template_alpaca, input_variables=['question', 'summaries'])
-    elif model_type == "Orca
-        INSTRUCTION_PROMPT=PromptTemplate(template=
+    elif model_type == "Mistral Open Orca (larger, slow)":
+        INSTRUCTION_PROMPT=PromptTemplate(template=instruction_prompt_mistral_orca, input_variables=['question', 'summaries'])

    return INSTRUCTION_PROMPT, CONTENT_PROMPT

@@ -360,7 +360,7 @@ def generate_expanded_prompt(inputs: Dict[str, str], instruction_prompt, content
def create_full_prompt(user_input, history, extracted_memory, vectorstore, embeddings, model_type):

    if not user_input.strip():
-        return history, "", ""
+        return history, "", "Respond with 'Please enter a question.' RESPONSE:"

    #if chain_agent is None:
    #     history.append((user_input, "Please click the button to submit the Huggingface API key before using the chatbot (top right)"))

@@ -434,7 +434,7 @@ def produce_streaming_answer_chatbot(history, full_prompt, model_type):
        print(f'Tokens per second: {NUM_TOKENS/time_generate}')
        print(f'Time per token: {(time_generate/NUM_TOKENS)*1000}ms')

-    elif model_type == "Orca
+    elif model_type == "Mistral Open Orca (larger, slow)":
        tokens = model.tokenize(full_prompt)

        gen_config = CtransGenGenerationConfig()
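The prompt-selection hunk now references instruction_prompt_mistral_orca, which is not defined in the hunks shown. Mistral 7B OpenOrca was trained on the ChatML format, so the template presumably looks roughly like the sketch below; the template text and variable naming here are assumptions, not the repository's actual template:

# Hedged sketch only: the real instruction_prompt_mistral_orca lives elsewhere in
# chatfuncs.py. Mistral OpenOrca expects ChatML-style <|im_start|>/<|im_end|> markers.
from langchain.prompts import PromptTemplate

example_mistral_orca_template = """<|im_start|>system
Answer the QUESTION using only the CONTENT provided.<|im_end|>
<|im_start|>user
CONTENT: {summaries}
QUESTION: {question}<|im_end|>
<|im_start|>assistant
"""

INSTRUCTION_PROMPT = PromptTemplate(template=example_mistral_orca_template,
                                    input_variables=['question', 'summaries'])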
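The "more checks for empty questions" part of the commit changes create_full_prompt so that a blank input returns a tiny stand-in prompt rather than an empty string, letting the model itself reply "Please enter a question." instead of being handed nothing to generate from. A minimal sketch of that guard in isolation (guard_empty_question is a hypothetical helper name, not from the repo):

# Minimal sketch of the empty-question guard added to create_full_prompt().
def guard_empty_question(user_input, history):
    if not user_input.strip():
        # The third element is the "full prompt" handed to the model; this mini-prompt
        # steers it to answer politely instead of failing on an empty prompt.
        return history, "", "Respond with 'Please enter a question.' RESPONSE:"
    return None  # fall through to normal prompt construction

print(guard_empty_question("   ", []))
# ([], '', "Respond with 'Please enter a question.' RESPONSE:")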