Sean-Case committed
Commit: d5a8385
Parent(s): 9aef340

Stop generation button. Better model load. Trying one source that's longer.

Files changed:
- app.py (+27, -10)
- chatfuncs/chatfuncs.py (+72, -22)
app.py CHANGED

@@ -13,7 +13,6 @@ from langchain.vectorstores import FAISS
 import gradio as gr
 
 from transformers import AutoTokenizer
-from dataclasses import asdict, dataclass
 
 # Alternative model sources
 from ctransformers import AutoModelForCausalLM

@@ -83,7 +82,7 @@ def load_model(model_type, gpu_layers, gpu_config=None, cpu_config=None, torch_d
 if model_type == "Orca Mini":
 
 gpu_config.update_gpu(gpu_layers)
-cpu_config.update_gpu(
+cpu_config.update_gpu(gpu_layers)
 
 print("Loading with", cpu_config.gpu_layers, "model layers sent to GPU.")
 

@@ -92,8 +91,13 @@ def load_model(model_type, gpu_layers, gpu_config=None, cpu_config=None, torch_d
 
 try:
 model = AutoModelForCausalLM.from_pretrained('juanjgit/orca_mini_3B-GGUF', model_type='llama', model_file='orca-mini-3b.q4_0.gguf', **vars(gpu_config)) # **asdict(CtransRunConfig_cpu())
+#model = AutoModelForCausalLM.from_pretrained('Aryanne/Sheared-LLaMA-1.3B-gguf', model_type='llama', model_file='q8_0-sheared-llama-1.3b.gguf', **vars(gpu_config)) # **asdict(CtransRunConfig_cpu())
+#model = AutoModelForCausalLM.from_pretrained('TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF', model_type='llama', model_file='tinyllama-1.1b-1t-openorca.Q8_0.gguf', **vars(gpu_config)) # **asdict(CtransRunConfig_cpu())
 except:
 model = AutoModelForCausalLM.from_pretrained('juanjgit/orca_mini_3B-GGUF', model_type='llama', model_file='orca-mini-3b.q4_0.gguf', **vars(cpu_config)) #**asdict(CtransRunConfig_gpu())
+#model = AutoModelForCausalLM.from_pretrained('Aryanne/Sheared-LLaMA-1.3B-gguf', model_type='llama', model_file='q8_0-sheared-llama-1.3b.gguf', **vars(gpu_config)) # **asdict(CtransRunConfig_cpu())
+#model = AutoModelForCausalLM.from_pretrained('TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF', model_type='llama', model_file='tinyllama-1.1b-1t-openorca.Q8_0.gguf', **vars(cpu_config)) # **asdict(CtransRunConfig_cpu())
+
 
 tokenizer = []
 

@@ -126,8 +130,10 @@ def load_model(model_type, gpu_layers, gpu_config=None, cpu_config=None, torch_d
 chatf.tokenizer = tokenizer
 chatf.model_type = model_type
 
-
-
+load_confirmation = "Finished loading model: " + model_type
+
+print(load_confirmation)
+return model_type, load_confirmation
 
 # Both models are loaded on app initialisation so that users don't have to wait for the models to be downloaded
 model_type = "Orca Mini"

@@ -173,7 +179,7 @@ with block:
 
 gr.Markdown("<h1><center>Lightweight PDF / web page QA bot</center></h1>")
 
-gr.Markdown("Chat with PDF or web page documents. The default is a small model (Flan Alpaca), that can only answer specific questions that are answered in the text. It cannot give overall impressions of, or summarise the document. The alternative (Orca Mini), can reason a little better, but is much slower (See Advanced tab).\n\nBy default the Lambeth Borough Plan '[Lambeth 2030 : Our Future, Our Lambeth](https://www.lambeth.gov.uk/better-fairer-lambeth/projects/lambeth-2030-our-future-our-lambeth)' is loaded. If you want to talk about another document or web page, please select from the second tab. If switching topic, please click the 'Clear chat' button.\n\nCaution: This is a public app.
+gr.Markdown("Chat with PDF or web page documents. The default is a small model (Flan Alpaca), that can only answer specific questions that are answered in the text. It cannot give overall impressions of, or summarise the document. The alternative (Orca Mini), can reason a little better, but is much slower (See Advanced tab).\n\nBy default the Lambeth Borough Plan '[Lambeth 2030 : Our Future, Our Lambeth](https://www.lambeth.gov.uk/better-fairer-lambeth/projects/lambeth-2030-our-future-our-lambeth)' is loaded. If you want to talk about another document or web page, please select from the second tab. If switching topic, please click the 'Clear chat' button.\n\nCaution: This is a public app. Please ensure that the document you upload is not sensitive is any way as other users may see it! Also, please note that LLM chatbots may give incomplete or incorrect information, so please use with care.")
 
 current_source = gr.Textbox(label="Current data source that is loaded into the app", value="Lambeth_2030-Our_Future_Our_Lambeth.pdf")
 

@@ -181,8 +187,8 @@ with block:
 
 with gr.Row():
 chat_height = 500
-chatbot = gr.Chatbot(height=chat_height, avatar_images=('user.jfif', 'bot.jpg'),bubble_full_width = False)
-sources = gr.HTML(value = "Source paragraphs where I looked for answers will appear here", height=chat_height)
+chatbot = gr.Chatbot(height=chat_height, avatar_images=('user.jfif', 'bot.jpg'),bubble_full_width = False, scale = 1)
+sources = gr.HTML(value = "Source paragraphs where I looked for answers will appear here", height=chat_height, scale = 2)
 
 with gr.Row():
 message = gr.Textbox(

@@ -191,7 +197,8 @@ with block:
 )
 with gr.Row():
 submit = gr.Button(value="Send message", variant="secondary", scale = 1)
-clear = gr.Button(value="Clear chat", variant="secondary", scale=0)
+clear = gr.Button(value="Clear chat", variant="secondary", scale=0)
+stop = gr.Button(value="Stop generating", variant="secondary", scale=0)
 
 examples_set = gr.Radio(label="Examples for the Lambeth Borough Plan",
 #value = "What were the five pillars of the previous borough plan?",

@@ -220,7 +227,10 @@ with block:
 
 with gr.Tab("Advanced features"):
 model_choice = gr.Radio(label="Choose a chat model", value="Flan Alpaca", choices = ["Flan Alpaca", "Orca Mini"])
-
+with gr.Row():
+gpu_layer_choice = gr.Slider(label="Choose number of model layers to send to GPU (WARNING: please don't modify unless you have a GPU).", value=0, minimum=0, maximum=6, step = 1, visible=True)
+change_model_button = gr.Button(value="Load model", scale=0)
+load_text = gr.Text(label="Load status")
 
 gr.HTML(
 "<center>This app is based on the models Flan Alpaca and Orca Mini. It powered by Gradio, Transformers, Ctransformers, and Langchain.</a></center>"

@@ -228,7 +238,11 @@ with block:
 
 examples_set.change(fn=chatf.update_message, inputs=[examples_set], outputs=[message])
 
-
+change_model_button.click(fn=chatf.turn_off_interactivity, inputs=[message, chatbot], outputs=[message, chatbot], queue=False).\
+then(fn=load_model, inputs=[model_choice, gpu_layer_choice], outputs = [model_type_state, load_text]).\
+then(lambda: chatf.restore_interactivity(), None, [message], queue=False).\
+then(chatf.clear_chat, inputs=[chat_history_state, sources, message, current_topic], outputs=[chat_history_state, sources, message, current_topic]).\
+then(lambda: None, None, chatbot, queue=False)
 
 # Load in a pdf
 load_pdf_click = load_pdf.click(ing.parse_file, inputs=[in_pdf], outputs=[ingest_text, current_source]).\

@@ -259,6 +273,9 @@ with block:
 then(chatf.add_inputs_answer_to_history,[message, chatbot, current_topic], [chat_history_state, current_topic]).\
 then(lambda: chatf.restore_interactivity(), None, [message], queue=False)
 
+# Stop box
+stop.click(fn=None, inputs=None, outputs=None, cancels=[response_click, response_enter])
+
 # Clear box
 clear.click(chatf.clear_chat, inputs=[chat_history_state, sources, message, current_topic], outputs=[chat_history_state, sources, message, current_topic])
 clear.click(lambda: None, None, chatbot, queue=False)
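Note on the stop button added above: Gradio cancels in-flight events when another event lists them in cancels=, which is why app.py keeps handles to response_click and response_enter and passes them to stop.click. Setting fn=None means the stop click does no work of its own; its only effect is the cancellation, and cancellation only applies to queued events. The sketch below shows the same pattern in isolation. It is illustrative only: fake_stream, send_event and the component names are stand-ins, not code from this commit.

# Minimal sketch of Gradio event cancellation (illustrative, not from this repo).
import time
import gradio as gr

def fake_stream(prompt):
    # Stand-in generator for a streaming model response.
    text = ""
    for word in ("a", "slow", "streamed", "answer"):
        text += word + " "
        time.sleep(0.5)
        yield text

with gr.Blocks() as demo:
    prompt = gr.Textbox(label="Prompt")
    output = gr.Textbox(label="Response")
    send = gr.Button("Send message")
    stop = gr.Button("Stop generating")

    # Keep a handle on the generation event so it can be cancelled later,
    # just as app.py keeps response_click and response_enter.
    send_event = send.click(fake_stream, inputs=prompt, outputs=output)
    stop.click(fn=None, inputs=None, outputs=None, cancels=[send_event])

# The queue must be enabled for generator outputs and for cancellation.
demo.queue().launch()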
chatfuncs/chatfuncs.py CHANGED

@@ -69,7 +69,7 @@ kw_model = pipeline("feature-extraction", model="sentence-transformers/all-MiniL
 
 if torch.cuda.is_available():
 torch_device = "cuda"
-gpu_layers =
+gpu_layers = 0
 else:
 torch_device = "cpu"
 gpu_layers = 0

@@ -82,25 +82,38 @@ print("CPU threads:", threads)
 temperature: float = 0.1
 top_k: int = 3
 top_p: float = 1
-repetition_penalty: float = 1.
+repetition_penalty: float = 1.3
 flan_alpaca_repetition_penalty: float = 1.3
+tinyllama_repetition_penalty: float = 1.5
 last_n_tokens: int = 64
-max_new_tokens: int =
+max_new_tokens: int = 512
 seed: int = 42
 reset: bool = False
 stream: bool = True
 threads: int = threads
-batch_size:int =
-context_length:int =
+batch_size:int = 256
+context_length:int = 2048
 sample = True
 
 
 class CtransInitConfig_gpu:
-def __init__(self, temperature=
+def __init__(self, temperature=temperature,
+top_k=top_k,
+top_p=top_p,
+repetition_penalty=repetition_penalty,
+last_n_tokens=last_n_tokens,
+max_new_tokens=max_new_tokens,
+seed=seed,
+reset=reset,
+stream=stream,
+threads=threads,
+batch_size=batch_size,
+context_length=context_length,
+gpu_layers=gpu_layers):
 self.temperature = temperature
 self.top_k = top_k
 self.top_p = top_p
-self.repetition_penalty = repetition_penalty
+self.repetition_penalty = repetition_penalty# repetition_penalty
 self.last_n_tokens = last_n_tokens
 self.max_new_tokens = max_new_tokens
 self.seed = seed

@@ -124,17 +137,38 @@ gpu_config = CtransInitConfig_gpu()
 cpu_config = CtransInitConfig_cpu()
 
 
-
+#@dataclass
+#class CtransGenGenerationConfig:
+# top_k: int = top_k
+# top_p: float = top_p
+# temperature: float = temperature
+# repetition_penalty: float = tinyllama_repetition_penalty
+# last_n_tokens: int = last_n_tokens
+# seed: int = seed
+# batch_size:int = batch_size
+# threads: int = threads
+# reset: bool = True
+
 class CtransGenGenerationConfig:
-
-
-
-
-
-
-
-
-
+def __init__(self, temperature=temperature,
+top_k=top_k,
+top_p=top_p,
+repetition_penalty=repetition_penalty,
+last_n_tokens=last_n_tokens,
+seed=seed,
+threads=threads,
+batch_size=batch_size,
+reset=True
+):
+self.temperature = temperature
+self.top_k = top_k
+self.top_p = top_p
+self.repetition_penalty = repetition_penalty# repetition_penalty
+self.last_n_tokens = last_n_tokens
+self.seed = seed
+self.threads = threads
+self.batch_size = batch_size
+self.reset = reset
 
 # Vectorstore funcs
 

@@ -199,6 +233,12 @@ def base_prompt_templates(model_type = "Flan Alpaca"):
 
 Response:"""
 
+instruction_prompt_template_sheared_llama = """Answer the QUESTION using information from the following CONTENT.
+CONTENT: {summaries}
+QUESTION: {question}
+
+Answer:"""
+
 instruction_prompt_template_orca = """
 ### System:
 You are an AI assistant that follows instruction extremely well. Help as much as you can.

@@ -215,7 +255,15 @@ def base_prompt_templates(model_type = "Flan Alpaca"):
 Answer the QUESTION using information from the following CONTENT. Respond with short answers that directly answer the question.
 CONTENT: {summaries}
 QUESTION: {question}\n
-
+Answer:<|im_end|>"""
+
+instruction_prompt_tinyllama_orca = """<|im_start|>system\n
+You are an AI assistant that follows instruction extremely well. Help as much as you can.
+<|im_start|>user\n
+Answer the QUESTION using information from the following CONTENT. Only quote text that directly answers the question and nothing more. If you can't find an answer to the question, respond with "Sorry, I can't find an answer to that question.".
+CONTENT: {summaries}
+QUESTION: {question}\n
+Answer:<|im_end|>"""
 
 if model_type == "Flan Alpaca":
 INSTRUCTION_PROMPT=PromptTemplate(template=instruction_prompt_template_alpaca, input_variables=['question', 'summaries'])

@@ -233,12 +281,12 @@ def generate_expanded_prompt(inputs: Dict[str, str], instruction_prompt, content
 new_question_kworded = adapt_q_from_chat_history(question, chat_history, extracted_memory) # new_question_keywords,
 
 
-docs_keep_as_doc, doc_df, docs_keep_out = hybrid_retrieval(new_question_kworded, vectorstore, embeddings, k_val = 5, out_passages =
+docs_keep_as_doc, doc_df, docs_keep_out = hybrid_retrieval(new_question_kworded, vectorstore, embeddings, k_val = 5, out_passages = 1,
 vec_score_cut_off = 1, vec_weight = 1, bm25_weight = 1, svm_weight = 1)#,
 #vectorstore=globals()["vectorstore"], embeddings=globals()["embeddings"])
 
 # Expand the found passages to the neighbouring context
-docs_keep_as_doc, doc_df = get_expanded_passages(vectorstore, docs_keep_out, width=
+docs_keep_as_doc, doc_df = get_expanded_passages(vectorstore, docs_keep_out, width=3)
 
 if docs_keep_as_doc == []:
 {"answer": "I'm sorry, I couldn't find a relevant answer to this question.", "sources":"I'm sorry, I couldn't find a relevant source for this question."}

@@ -301,7 +349,7 @@ def produce_streaming_answer_chatbot(history, full_prompt, model_type):
 streamer=streamer,
 max_new_tokens=max_new_tokens,
 do_sample=sample,
-repetition_penalty=
+repetition_penalty=repetition_penalty,
 top_p=top_p,
 temperature=temperature,
 top_k=top_k

@@ -332,13 +380,15 @@ def produce_streaming_answer_chatbot(history, full_prompt, model_type):
 elif model_type == "Orca Mini":
 tokens = model.tokenize(full_prompt)
 
+gen_config = CtransGenGenerationConfig()
+
 # Pull the generated text from the streamer, and update the model output.
 start = time.time()
 NUM_TOKENS=0
 print('-'*4+'Start Generation'+'-'*4)
 
 history[-1][1] = ""
-for new_text in model.generate(tokens, **
+for new_text in model.generate(tokens, **vars(gen_config)): #CtransGen_generate(prompt=full_prompt)#, config=CtransGenGenerationConfig()): # #top_k=top_k, temperature=temperature, repetition_penalty=repetition_penalty,
 if new_text == None: new_text = ""
 history[-1][1] += model.detokenize(new_text) #new_text
 NUM_TOKENS+=1