Commit d2ddc62 committed by Sean-Case
Parent(s): f6036ad

Attempt to switch to Orca Mini GGUF

Files changed:
- app.py (+3 -4)
- chatfuncs/chatfuncs.py (+49 -31)
- requirements.txt (+1 -1)
app.py
CHANGED
@@ -11,7 +11,6 @@ from langchain.vectorstores import FAISS
 PandasDataFrame = TypeVar('pd.core.frame.DataFrame')
 
 # Disable cuda devices if necessary
-
 #os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
 
 #from chatfuncs.chatfuncs import *
@@ -155,7 +154,7 @@ with block:
     ingest_embed_out = gr.Textbox(label="File/webpage preparation progress")
 
     gr.HTML(
-        "<center>Powered by
+        "<center>Powered by Orca Mini and Langchain</a></center>"
     )
 
     examples_set.change(fn=chatf.update_message, inputs=[examples_set], outputs=[message])
@@ -177,14 +176,14 @@ with block:
     # Click/enter to send message action
     response_click = submit.click(chatf.get_history_sources_final_input_prompt, inputs=[message, chat_history_state, current_topic, vectorstore_state, embeddings_state], outputs=[chat_history_state, sources, instruction_prompt_out], queue=False, api_name="retrieval").\
         then(chatf.turn_off_interactivity, inputs=[message, chatbot], outputs=[message, chatbot], queue=False).\
-        then(chatf.
+        then(chatf.produce_streaming_answer_chatbot_ctrans, inputs=[chatbot, instruction_prompt_out], outputs=chatbot)
     response_click.then(chatf.highlight_found_text, [chatbot, sources], [sources]).\
         then(chatf.add_inputs_answer_to_history,[message, chatbot, current_topic], [chat_history_state, current_topic]).\
         then(lambda: gr.update(interactive=True), None, [message], queue=False)
 
     response_enter = message.submit(chatf.get_history_sources_final_input_prompt, inputs=[message, chat_history_state, current_topic, vectorstore_state, embeddings_state], outputs=[chat_history_state, sources, instruction_prompt_out], queue=False).\
         then(chatf.turn_off_interactivity, inputs=[message, chatbot], outputs=[message, chatbot], queue=False).\
-        then(chatf.
+        then(chatf.produce_streaming_answer_chatbot_ctrans, [chatbot, instruction_prompt_out], chatbot)
     response_enter.then(chatf.highlight_found_text, [chatbot, sources], [sources]).\
         then(chatf.add_inputs_answer_to_history,[message, chatbot, current_topic], [chat_history_state, current_topic]).\
        then(lambda: gr.update(interactive=True), None, [message], queue=False)
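The two event chains above hand the response to the chatbot through Gradio's `.then()` chaining: the retrieval step builds the prompt, interactivity is switched off, and `produce_streaming_answer_chatbot_ctrans` yields partial chat histories into the `gr.Chatbot`. Below is a minimal, self-contained sketch of the same pattern; `add_user_message` and `echo_stream` are hypothetical stand-ins for the app's own functions, and Gradio 3.x (consistent with `gradio_client==0.2.7`) is assumed.

# Minimal sketch of the event-chaining pattern wired up above (assumptions:
# Gradio 3.x Blocks; add_user_message / echo_stream are hypothetical stand-ins
# for chatf.turn_off_interactivity / chatf.produce_streaming_answer_chatbot_ctrans).
import time
import gradio as gr

def add_user_message(message, history):
    # Clear and lock the textbox while the bot answers.
    return gr.update(value="", interactive=False), history + [[message, None]]

def echo_stream(history):
    # Generator handler: each yield pushes a partial history into gr.Chatbot,
    # which is how streamed text appears token by token in the UI.
    history[-1][1] = ""
    for ch in "This reply is streamed character by character.":
        history[-1][1] += ch
        time.sleep(0.02)
        yield history

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    message = gr.Textbox()
    message.submit(add_user_message, [message, chatbot], [message, chatbot], queue=False).\
        then(echo_stream, chatbot, chatbot).\
        then(lambda: gr.update(interactive=True), None, [message], queue=False)

demo.queue().launch()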
chatfuncs/chatfuncs.py
CHANGED
@@ -7,12 +7,13 @@ import numpy as np
 
 # Model packages
 import torch
+torch.cuda.empty_cache()
 from threading import Thread
 from transformers import AutoTokenizer, pipeline, TextIteratorStreamer
 
 # Alternative model sources
 from gpt4all import GPT4All
-from ctransformers import AutoModelForCausalLM
+from ctransformers import AutoModelForCausalLM#, AutoTokenizer
 
 from dataclasses import asdict, dataclass
 
@@ -44,7 +45,11 @@ from gensim.similarities import SparseMatrixSimilarity
 
 import gradio as gr
 
-
+if torch.cuda.is_available():
+    torch_device = "cuda"
+    gpu_layers = 1
+else: torch_device = "cpu"
+
 print("Running on device:", torch_device)
 threads = 8#torch.get_num_threads()
 print("CPU threads:", threads)
@@ -72,9 +77,27 @@ stream: bool = True
 threads: int = threads
 batch_size:int = 512
 context_length:int = 2048
-gpu_layers:int = 0
+gpu_layers:int = 0#10#gpu_layers
 sample = True
 
+@dataclass
+class GenerationConfig:
+    temperature: float = temperature
+    top_k: int = top_k
+    top_p: float = top_p
+    repetition_penalty: float = repetition_penalty
+    last_n_tokens: int = last_n_tokens
+    max_new_tokens: int = max_new_tokens
+    #seed: int = 42
+    reset: bool = reset
+    stream: bool = stream
+    threads: int = threads
+    batch_size:int = batch_size
+    context_length:int = context_length
+    gpu_layers:int = gpu_layers
+    #stop: list[str] = field(default_factory=lambda: [stop_string])
+
+
 ## Highlight text constants
 hlt_chunk_size = 20
 hlt_strat = [" ", ".", "!", "?", ":", "\n\n", "\n", ","]
@@ -87,17 +110,20 @@ ner_model = SpanMarkerModel.from_pretrained("tomaarsen/span-marker-mbert-base-mu
 # Used to pull out keywords from chat history to add to user queries behind the scenes
 kw_model = pipeline("feature-extraction", model="sentence-transformers/all-MiniLM-L6-v2")
 
+
+
 ## Chat models ##
 ctrans_llm = [] # Not leaded by default
 #ctrans_llm = AutoModelForCausalLM.from_pretrained('TheBloke/orca_mini_3B-GGML', model_type='llama', model_file='orca-mini-3b.ggmlv3.q4_0.bin')
-
+ctrans_llm = AutoModelForCausalLM.from_pretrained('juanjgit/orca_mini_3B-GGUF', model_type='llama', model_file='orca-mini-3b.q4_0.gguf', **asdict(GenerationConfig()))
 #ctrans_llm = AutoModelForCausalLM.from_pretrained('TheBloke/vicuna-13B-v1.5-16K-GGUF', model_type='llama', model_file='vicuna-13b-v1.5-16k.Q4_K_M.gguf')
 #ctrans_llm = AutoModelForCausalLM.from_pretrained('TheBloke/CodeUp-Llama-2-13B-Chat-HF-GGUF', model_type='llama', model_file='codeup-llama-2-13b-chat-hf.Q4_K_M.gguf')
 #ctrans_llm = AutoModelForCausalLM.from_pretrained('TheBloke/CodeLlama-13B-Instruct-GGUF', model_type='llama', model_file='codellama-13b-instruct.Q4_K_M.gguf')
 #ctrans_llm = AutoModelForCausalLM.from_pretrained('TheBloke/Mistral-7B-Instruct-v0.1-GGUF', model_type='mistral', model_file='mistral-7b-instruct-v0.1.Q4_K_M.gguf')
 #ctrans_llm = AutoModelForCausalLM.from_pretrained('TheBloke/Mistral-7B-OpenOrca-GGUF', model_type='mistral', model_file='mistral-7b-openorca.Q4_K_M.gguf')
 
-
+
+#ctokenizer = AutoTokenizer.from_pretrained(ctrans_llm)
 
 # Huggingface chat model
 #hf_checkpoint = 'jphme/phi-1_5_Wizard_Vicuna_uncensored'
@@ -128,7 +154,7 @@ def create_hf_model(model_name):
 
     return model, tokenizer, torch_device
 
-model, tokenizer, torch_device = create_hf_model(model_name = hf_checkpoint)
+#model, tokenizer, torch_device = create_hf_model(model_name = hf_checkpoint)
 
 # Vectorstore funcs
 
@@ -196,6 +222,17 @@ def create_prompt_templates():
 
 ### Response:"""
 
+    instruction_prompt_template_orca_input = """
+### System:
+You are an AI assistant that follows instruction extremely well. Help as much as you can.
+### User:
+Answer the QUESTION using information from the following input.
+### Input:
+{summaries}
+QUESTION: {question}
+
+### Response:"""
+
 
 
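The `instruction_prompt_template_orca_input` added above follows Orca Mini's ### System / ### User / ### Input / ### Response layout, with `{summaries}` holding the retrieved passages and `{question}` the user query. The snippet below is an illustration only (not repo code) of how such a template is filled in before generation; the sample values are invented and `str.format()` merely stands in for whatever templating the app actually uses.

# Illustration only (not from the repo): fill the Orca-style template with a
# retrieved passage and a question. Sample values are invented.
orca_input_template = """
### System:
You are an AI assistant that follows instruction extremely well. Help as much as you can.
### User:
Answer the QUESTION using information from the following input.
### Input:
{summaries}
QUESTION: {question}

### Response:"""

full_prompt = orca_input_template.format(
    summaries="Passage 1: The library is open from 9am to 5pm on weekdays.",
    question="When is the library open?",
)
print(full_prompt)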
@@ -581,9 +618,6 @@ def create_final_prompt(inputs: Dict[str, str], instruction_prompt, content_prom
     #print("The question passed to the vector search is:")
     #print(new_question_kworded)
 
-    #docs_keep_as_doc, docs_content, docs_url = find_relevant_passages(new_question_kworded, k_val = 5, out_passages = 3,
-    #                                           vec_score_cut_off = 1.3, vec_weight = 1, tfidf_weight = 0.5, svm_weight = 1)
-
     docs_keep_as_doc, doc_df, docs_keep_out = hybrid_retrieval(new_question_kworded, vectorstore, embeddings, k_val = 5, out_passages = 2,
                                                vec_score_cut_off = 1, vec_weight = 1, bm25_weight = 1, svm_weight = 1)#,
     #vectorstore=globals()["vectorstore"], embeddings=globals()["embeddings"])
@@ -868,8 +902,8 @@ def produce_streaming_answer_chatbot_ctrans(history, full_prompt):
     print("The question is: ")
     print(full_prompt)
 
-
-
+    tokens = ctrans_llm.tokenize(full_prompt)
+
     #import psutil
     #from loguru import logger
 
@@ -884,29 +918,13 @@ def produce_streaming_answer_chatbot_ctrans(history, full_prompt):
     #logger.debug(f"{cpu_count=}")
 
     # Pull the generated text from the streamer, and update the model output.
-    config = GenerationConfig(reset=True)
+    #config = GenerationConfig(reset=True)
     history[-1][1] = ""
-    for new_text in ctrans_generate(prompt=
-        if new_text == None: new_text =
-        history[-1][1] += new_text
+    for new_text in ctrans_llm.generate(tokens, top_k=top_k, temperature=temperature, repetition_penalty=repetition_penalty): #ctrans_generate(prompt=tokens, config=config):
+        if new_text == None: new_text = ""
+        history[-1][1] += ctrans_llm.detokenize(new_text) #new_text
     yield history
 
-@dataclass
-class GenerationConfig:
-    temperature: float = temperature
-    top_k: int = top_k
-    top_p: float = top_p
-    repetition_penalty: float = repetition_penalty
-    last_n_tokens: int = last_n_tokens
-    max_new_tokens: int = max_new_tokens
-    #seed: int = 42
-    reset: bool = reset
-    stream: bool = stream
-    threads: int = threads
-    batch_size:int = batch_size
-    #context_length:int = context_length
-    #gpu_layers:int = gpu_layers
-    #stop: list[str] = field(default_factory=lambda: [stop_string])
 
 def ctrans_generate(
     prompt: str,
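The rewritten `produce_streaming_answer_chatbot_ctrans` above now drives ctransformers directly: the prompt is tokenized, token ids are streamed from `generate()`, and each id is detokenized back into text. Below is a condensed, standalone sketch of that path, assuming ctransformers 0.2.x (where the loaded model exposes `tokenize()`, `generate()` and `detokenize()`); the sampling values are illustrative, not the app's configured `GenerationConfig` values.

# Condensed sketch of the new generation path (assumes ctransformers 0.2.x;
# sampling values below are illustrative, not the app's configured defaults).
from ctransformers import AutoModelForCausalLM

llm = AutoModelForCausalLM.from_pretrained(
    'juanjgit/orca_mini_3B-GGUF',
    model_type='llama',
    model_file='orca-mini-3b.q4_0.gguf',
    threads=8,            # CPU threads used during inference
    context_length=2048,  # prompt + generation budget
    gpu_layers=0,         # >0 offloads layers to GPU when the CUDA build is installed
)

prompt = "### System:\nYou are a helpful assistant.\n### User:\nSay hello.\n### Response:"
tokens = llm.tokenize(prompt)

answer = ""
# generate() yields token ids one at a time; detokenize() maps them back to text,
# which is what lets the Gradio handler stream partial answers into the chatbot.
for token_id in llm.generate(tokens, top_k=40, temperature=0.7, repetition_penalty=1.1):
    answer += llm.detokenize(token_id)
    if len(answer) > 400:  # simple cut-off so the sketch terminates quickly
        break
print(answer)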
requirements.txt
CHANGED
@@ -17,7 +17,7 @@ gradio
 gradio_client==0.2.7
 python-docx
 gpt4all
-ctransformers
+ctransformers[cuda]
 keybert
 span_marker
 gensim
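Switching the dependency to `ctransformers[cuda]` above pulls in the prebuilt CUDA libraries, but layers are only offloaded when `gpu_layers` is set above zero at load time, as in the device check added in chatfuncs.py. A small hypothetical smoke test (not part of the repo) tying the two together:

# Hypothetical smoke test (not repo code): verify the CUDA-enabled ctransformers
# install can offload layers, falling back to CPU when no GPU is visible.
import torch
from ctransformers import AutoModelForCausalLM

gpu_layers = 1 if torch.cuda.is_available() else 0  # mirrors the check added in chatfuncs.py
llm = AutoModelForCausalLM.from_pretrained(
    'juanjgit/orca_mini_3B-GGUF',
    model_type='llama',
    model_file='orca-mini-3b.q4_0.gguf',
    gpu_layers=gpu_layers,
)
print(llm("### User:\nSay hi.\n### Response:", max_new_tokens=16))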