seanpedrickcase committed
Commit 85b6613 • 1 Parent(s): 5cdf399

CPU Flan inference is crashing, so trying to revert to previous package versions that worked

Files changed:
- app.py (+4 -4)
- chatfuncs/chatfuncs.py (+11 -175)
- requirements.txt (+5 -6)
app.py
CHANGED
@@ -113,14 +113,14 @@ def load_model(model_type, gpu_layers, gpu_config=None, cpu_config=None, torch_d
 
     if torch_device == "cuda":
         if "flan" in model_name:
-            model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16)
+            model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map="auto")#, torch_dtype=torch.float16)
         else:
-            model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16)
+            model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")#, torch_dtype=torch.float16)
     else:
         if "flan" in model_name:
-            model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.float16)
+            model = AutoModelForSeq2SeqLM.from_pretrained(model_name)#, torch_dtype=torch.float16)
         else:
-            model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16)
+            model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)#, torch_dtype=torch.float16)
 
     tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length = chatf.context_length)
 
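The change above only comments out torch_dtype=torch.float16; the rest of load_model is untouched. Below is a minimal sketch of the resulting CPU path, assuming a Flan-style seq2seq checkpoint (the model name is a placeholder, not necessarily the one app.py selects). Loading in the default float32 dtype avoids the half-precision ops that commonly misbehave or crash during CPU inference.

import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model_name = "declare-lab/flan-alpaca-base"  # placeholder Flan-style checkpoint (assumption)

tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=2048)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)  # default float32; no torch_dtype=torch.float16 on CPU

inputs = tokenizer("Summarise: the cat sat on the mat.", return_tensors="pt")
with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))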
chatfuncs/chatfuncs.py
CHANGED
@@ -99,66 +99,17 @@ context_length:int = 2048
 sample = True
 
 
-# class CtransInitConfig_gpu:
-#     def __init__(self, temperature=temperature,
-#                  top_k=top_k,
-#                  top_p=top_p,
-#                  repetition_penalty=repetition_penalty,
-#                  last_n_tokens=last_n_tokens,
-#                  max_new_tokens=max_new_tokens,
-#                  seed=seed,
-#                  reset=reset,
-#                  stream=stream,
-#                  threads=threads,
-#                  batch_size=batch_size,
-#                  context_length=context_length,
-#                  gpu_layers=gpu_layers):
-#         self.temperature = temperature
-#         self.top_k = top_k
-#         self.top_p = top_p
-#         self.repetition_penalty = repetition_penalty# repetition_penalty
-#         self.last_n_tokens = last_n_tokens
-#         self.max_new_tokens = max_new_tokens
-#         self.seed = seed
-#         self.reset = reset
-#         self.stream = stream
-#         self.threads = threads
-#         self.batch_size = batch_size
-#         self.context_length = context_length
-#         self.gpu_layers = gpu_layers
-#         # self.stop: list[str] = field(default_factory=lambda: [stop_string])
-
-#     def update_gpu(self, new_value):
-#         self.gpu_layers = new_value
-
-# class CtransInitConfig_cpu(CtransInitConfig_gpu):
-#     def __init__(self):
-#         super().__init__()
-#         self.gpu_layers = 0
-
 class CtransInitConfig_gpu:
-    def __init__(self,
-                 #top_k=top_k,
-                 #top_p=top_p,
-                 #repetition_penalty=repetition_penalty,
+    def __init__(self,
                  last_n_tokens=last_n_tokens,
-                 #max_new_tokens=max_new_tokens,
                  seed=seed,
-                 #reset=reset,
-                 #stream=stream,
                  n_threads=threads,
                  n_batch=batch_size,
                  n_ctx=4096,
                  n_gpu_layers=gpu_layers):
-
-        #self.top_k = top_k
-        #self.top_p = top_p
-        #self.repetition_penalty = repetition_penalty# repetition_penalty
+
         self.last_n_tokens = last_n_tokens
-        #self.max_new_tokens = max_new_tokens
         self.seed = seed
-        #self.reset = reset
-        #self.stream = stream
         self.n_threads = n_threads
         self.n_batch = n_batch
         self.n_ctx = n_ctx
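For context, here is a sketch (not the repository's exact loader) of how the trimmed-down CtransInitConfig fields line up with llama-cpp-python's Llama constructor. The GGUF path is a placeholder assumption, and last_n_tokens is omitted because the constructor's own name for that setting is last_n_tokens_size.

from llama_cpp import Llama
from chatfuncs.chatfuncs import CtransInitConfig_cpu

config = CtransInitConfig_cpu()  # CPU variant; its definition is not shown in this diff
llm = Llama(
    model_path="models/mistral-7b-openorca.Q4_K_M.gguf",  # placeholder path (assumption)
    seed=config.seed,
    n_threads=config.n_threads,
    n_batch=config.n_batch,
    n_ctx=config.n_ctx,
    n_gpu_layers=config.n_gpu_layers,
)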
@@ -177,51 +128,22 @@ gpu_config = CtransInitConfig_gpu()
 cpu_config = CtransInitConfig_cpu()
 
 
-# class CtransGenGenerationConfig:
-#     def __init__(self, temperature=temperature,
-#                  top_k=top_k,
-#                  top_p=top_p,
-#                  repetition_penalty=repetition_penalty,
-#                  last_n_tokens=last_n_tokens,
-#                  seed=seed,
-#                  threads=threads,
-#                  batch_size=batch_size,
-#                  reset=True
-#                  ):
-#         self.temperature = temperature
-#         self.top_k = top_k
-#         self.top_p = top_p
-#         self.repetition_penalty = repetition_penalty# repetition_penalty
-#         self.last_n_tokens = last_n_tokens
-#         self.seed = seed
-#         self.threads = threads
-#         self.batch_size = batch_size
-#         self.reset = reset
-
 class CtransGenGenerationConfig:
     def __init__(self, temperature=temperature,
                  top_k=top_k,
                  top_p=top_p,
                  repeat_penalty=repetition_penalty,
-                 #last_n_tokens=last_n_tokens,
                  seed=seed,
                  stream=stream,
                  max_tokens=max_new_tokens
-                 #threads=threads,
-                 #batch_size=batch_size,
-                 #reset=True
                  ):
         self.temperature = temperature
         self.top_k = top_k
         self.top_p = top_p
         self.repeat_penalty = repeat_penalty
-        #self.last_n_tokens = last_n_tokens
         self.seed = seed
         self.max_tokens=max_tokens
         self.stream = stream
-        #self.threads = threads
-        #self.batch_size = batch_size
-        #self.reset = reset
 
     def update_temp(self, new_value):
         self.temperature = new_value
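The surviving CtransGenGenerationConfig keeps only keyword names that llama-cpp-python's completion call understands: temperature, top_k, top_p, repeat_penalty, seed (on recent versions), stream, and max_tokens. A hedged sketch of splatting such a config into a streamed completion follows; the model path and sampling values are placeholders, and the dict simply mirrors vars(CtransGenGenerationConfig()).

from llama_cpp import Llama

llm = Llama(model_path="models/mistral-7b-openorca.Q4_K_M.gguf", n_ctx=4096)  # placeholder path (assumption)

# Mirrors vars(CtransGenGenerationConfig()), with stream forced on so we get an iterator of chunks back.
gen_params = dict(temperature=0.1, top_k=3, top_p=1.0, repeat_penalty=1.3,
                  seed=42, stream=True, max_tokens=512)

for chunk in llm("Q: What does this chatbot do? A:", **gen_params):
    print(chunk["choices"][0]["text"], end="", flush=True)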
@@ -417,93 +339,6 @@ def create_full_prompt(user_input, history, extracted_memory, vectorstore, embed
     return history, docs_content_string, instruction_prompt_out
 
 # Chat functions
-# def produce_streaming_answer_chatbot(history, full_prompt, model_type,
-#                                      temperature=temperature,
-#                                      max_new_tokens=max_new_tokens,
-#                                      sample=sample,
-#                                      repetition_penalty=repetition_penalty,
-#                                      top_p=top_p,
-#                                      top_k=top_k
-#                                      ):
-#     #print("Model type is: ", model_type)
-
-#     #if not full_prompt.strip():
-#     #     if history is None:
-#     #         history = []
-
-#     #     return history
-
-#     if model_type == "Flan Alpaca (small, fast)":
-#         # Get the model and tokenizer, and tokenize the user text.
-#         model_inputs = tokenizer(text=full_prompt, return_tensors="pt", return_attention_mask=False).to(torch_device) # return_attention_mask=False was added
-
-#         # Start generation on a separate thread, so that we don't block the UI. The text is pulled from the streamer
-#         # in the main thread. Adds timeout to the streamer to handle exceptions in the generation thread.
-#         streamer = TextIteratorStreamer(tokenizer, timeout=120., skip_prompt=True, skip_special_tokens=True)
-#         generate_kwargs = dict(
-#             model_inputs,
-#             streamer=streamer,
-#             max_new_tokens=max_new_tokens,
-#             do_sample=sample,
-#             repetition_penalty=repetition_penalty,
-#             top_p=top_p,
-#             temperature=temperature,
-#             top_k=top_k
-#         )
-
-#         print(generate_kwargs)
-
-#         t = Thread(target=model.generate, kwargs=generate_kwargs)
-#         t.start()
-
-#         # Pull the generated text from the streamer, and update the model output.
-#         start = time.time()
-#         NUM_TOKENS=0
-#         print('-'*4+'Start Generation'+'-'*4)
-
-#         history[-1][1] = ""
-#         for new_text in streamer:
-#             if new_text == None: new_text = ""
-#             history[-1][1] += new_text
-#             NUM_TOKENS+=1
-#             yield history
-
-#         time_generate = time.time() - start
-#         print('\n')
-#         print('-'*4+'End Generation'+'-'*4)
-#         print(f'Num of generated tokens: {NUM_TOKENS}')
-#         print(f'Time for complete generation: {time_generate}s')
-#         print(f'Tokens per secound: {NUM_TOKENS/time_generate}')
-#         print(f'Time per token: {(time_generate/NUM_TOKENS)*1000}ms')
-
-#     elif model_type == "Mistral Open Orca (larger, slow)":
-#         tokens = model.tokenize(full_prompt)
-
-#         gen_config = CtransGenGenerationConfig()
-#         gen_config.update_temp(temperature)
-
-#         print(vars(gen_config))
-
-#         # Pull the generated text from the streamer, and update the model output.
-#         start = time.time()
-#         NUM_TOKENS=0
-#         print('-'*4+'Start Generation'+'-'*4)
-
-#         history[-1][1] = ""
-#         for new_text in model.generate(tokens, **vars(gen_config)): #CtransGen_generate(prompt=full_prompt)#, config=CtransGenGenerationConfig()): # #top_k=top_k, temperature=temperature, repetition_penalty=repetition_penalty,
-#             if new_text == None: new_text = ""
-#             history[-1][1] += model.detokenize(new_text) #new_text
-#             NUM_TOKENS+=1
-#             yield history
-
-#         time_generate = time.time() - start
-#         print('\n')
-#         print('-'*4+'End Generation'+'-'*4)
-#         print(f'Num of generated tokens: {NUM_TOKENS}')
-#         print(f'Time for complete generation: {time_generate}s')
-#         print(f'Tokens per secound: {NUM_TOKENS/time_generate}')
-#         print(f'Time per token: {(time_generate/NUM_TOKENS)*1000}ms')
-
 
 def produce_streaming_answer_chatbot(history, full_prompt, model_type,
         temperature=temperature,
@@ -523,8 +358,8 @@ def produce_streaming_answer_chatbot(history, full_prompt, model_type,
 
     if model_type == "Flan Alpaca (small, fast)":
         # Get the model and tokenizer, and tokenize the user text.
-        model_inputs = tokenizer(text=full_prompt, return_tensors="pt", return_attention_mask=False).to(torch_device)
-
+        model_inputs = tokenizer(text=full_prompt, return_tensors="pt", return_attention_mask=False).to(torch_device)
+
         # Start generation on a separate thread, so that we don't block the UI. The text is pulled from the streamer
         # in the main thread. Adds timeout to the streamer to handle exceptions in the generation thread.
         streamer = TextIteratorStreamer(tokenizer, timeout=120., skip_prompt=True, skip_special_tokens=True)
@@ -551,10 +386,13 @@ def produce_streaming_answer_chatbot(history, full_prompt, model_type,
 
     history[-1][1] = ""
     for new_text in streamer:
-        if new_text == None: new_text = ""
-        history[-1][1] += new_text
-        NUM_TOKENS+=1
-        yield history
+        try:
+            if new_text == None: new_text = ""
+            history[-1][1] += new_text
+            NUM_TOKENS+=1
+            yield history
+        except Exception as e:
+            print(f"Error during text generation: {e}")
 
     time_generate = time.time() - start
     print('\n')
@@ -567,8 +405,6 @@ def produce_streaming_answer_chatbot(history, full_prompt, model_type,
     elif model_type == "Mistral Open Orca (larger, slow)":
         #tokens = model.tokenize(full_prompt)
 
-        temp = ""
-
         gen_config = CtransGenGenerationConfig()
         gen_config.update_temp(temperature)
 
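Taken together, the Flan branch of produce_streaming_answer_chatbot now works like the standalone sketch below: generation runs on a worker thread, tokens are pulled from a TextIteratorStreamer, and the new try/except keeps a single bad chunk from killing the stream. The checkpoint name and sampling values are placeholder assumptions.

from threading import Thread
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, TextIteratorStreamer

model_name = "declare-lab/flan-alpaca-base"  # placeholder Flan-style checkpoint (assumption)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)  # default float32, per the app.py change

model_inputs = tokenizer(text="Explain what a FAISS index is.", return_tensors="pt")
streamer = TextIteratorStreamer(tokenizer, timeout=120., skip_prompt=True, skip_special_tokens=True)

generate_kwargs = dict(model_inputs, streamer=streamer, max_new_tokens=128,
                       do_sample=True, temperature=0.1, top_p=0.95, top_k=50)  # illustrative values
Thread(target=model.generate, kwargs=generate_kwargs).start()

answer = ""
for new_text in streamer:
    try:
        if new_text is None:
            new_text = ""
        answer += new_text  # the app appends to history[-1][1] and yields instead
    except Exception as e:
        print(f"Error during text generation: {e}")
print(answer)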
requirements.txt
CHANGED
@@ -2,15 +2,14 @@ langchain
 langchain-community
 beautifulsoup4
 pandas
-transformers
+transformers==4.34.0
 llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121
-
-
-sentence_transformers
-faiss-cpu
+torch \
+    --extra-index-url https://download.pytorch.org/whl/cu121
+sentence_transformers==2.2.2
+faiss-cpu==1.7.4
 pypdf
 python-docx
-#ctransformers[cuda]
 keybert
 span_marker
 gensim
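A quick way to confirm that the reverted pins are what actually got installed (a sketch; note the distribution name for sentence_transformers is sentence-transformers):

from importlib.metadata import PackageNotFoundError, version

pins = {"transformers": "4.34.0", "sentence-transformers": "2.2.2", "faiss-cpu": "1.7.4"}
for package, expected in pins.items():
    try:
        installed = version(package)
        print(f"{package}: {installed}" + ("" if installed == expected else f" (expected {expected})"))
    except PackageNotFoundError:
        print(f"{package}: not installed")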