import os

import fitz  # PyMuPDF
import gradio as gr
import pandas as pd
from huggingface_hub import InferenceClient
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.vectorstores import Chroma
file_paths = ["docs/Prospectus_2023_24_Eng_Version.pdf","docs/MANUU_UG_PROGRAMMES_PROSPECTUS_2022_23_Eng_5_April_2022.pdf"]
page_contents = []
for fname in file_paths:
with fitz.open(fname) as doc:
print("Total Pages in {} are {}".format(fname,len(doc)))
for page in doc:
text = page.get_text()
if "............" in text:
continue
#print(text)
page_contents.append(text)
#break
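
# Quick sanity check (a sketch; assumes at least one page survived the
# dotted-leader filter above). Uncomment to preview the first kept page:
# print(len(page_contents), "pages kept")
# print(page_contents[0][:300])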
embedding_model = HuggingFaceInstructEmbeddings(
    model_name="hkunlp/instructor-base",
    # model_name="jinaai/jina-embedding-b-en-v1",
    # model_name="WhereIsAI/UAE-Large-V1",
    # model_kwargs={"device": "cuda"}
    model_kwargs={"device": "cpu"},
)
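
# Optional probe (a sketch; the query text is only illustrative). embed_query
# is LangChain's standard embedding call; for instructor-base the vector
# should be 768-dimensional:
# vec = embedding_model.embed_query("hostel fee")
# print(len(vec))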
df_documents_chunks = pd.DataFrame({"doc_pages": page_contents})
df_documents_chunks["index_id"] = df_documents_chunks.index
print(df_documents_chunks)
def row_to_doc(row):
    """Wrap one page of text in a LangChain Document, keeping its row index as metadata."""
    return Document(
        metadata={"id": row["index_id"]},
        page_content=row["doc_pages"],
    )

manuuindex_df_processed_documents = df_documents_chunks.apply(row_to_doc, axis=1).to_list()
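
# Each row becomes one Document, e.g.
# Document(page_content="<full page text>", metadata={"id": 0})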
COLLECTION_NAME = "Manuu_collection"
PERSIST_DIR = "MANUU_dir4"

if os.path.exists(PERSIST_DIR):
    # Reuse the vector store persisted by a previous run.
    print("Existing collection:", COLLECTION_NAME)
    vectordb = Chroma(
        persist_directory=PERSIST_DIR,
        collection_name=COLLECTION_NAME,
        embedding_function=embedding_model,
    )
    print(f"Collection {vectordb._collection.name} has {vectordb._collection.count()} documents...")
else:
    # First run: embed every page and persist the collection to disk.
    print("New collection:", COLLECTION_NAME)
    vectordb = Chroma.from_documents(
        documents=manuuindex_df_processed_documents,
        embedding=embedding_model,
        collection_name=COLLECTION_NAME,
        persist_directory=PERSIST_DIR,
        collection_metadata=None,
    )
    vectordb.persist()  # Save the vector database as persistent files in PERSIST_DIR.
    print(f"Collection {vectordb._collection.name} has {vectordb._collection.count()} documents...")
client = InferenceClient(model="mistralai/Mixtral-8x7B-Instruct-v0.1")
def context_fn(question_text, vectordb):
    """Return the five most relevant page chunks, joined into one context string."""
    relevant_chunks = vectordb.similarity_search_with_score(
        query=question_text,
        k=5,
    )
    context_5 = "\n\n\n".join(doc.page_content for doc, _score in relevant_chunks)
    return context_5
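
# Example usage (the question is only illustrative):
# print(context_fn("What is the hostel fee?", vectordb)[:300])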
def format_prompt(message, history, context_prompt):
    """Build a Mixtral-style [INST] prompt from chat history, retrieved context, and the new message."""
    prompt = "<s>"
    for user_prompt, bot_response in history:
        prompt += f"[INST] {user_prompt}. Do not give information from outside the document context or from general knowledge. [/INST]"
        prompt += f" {bot_response}\n"
    prompt += f" CONTEXT:{context_prompt}</s> "
    prompt += f"[INST] {message} [/INST]"
    # Log every assembled prompt for later inspection.
    with open("prompts.txt", "a") as file:
        print("user_prompt", prompt, file=file)
    return prompt
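
# With empty history, the assembled prompt looks roughly like:
# <s> CONTEXT:<retrieved pages></s> [INST] <system prompt>, <question> [/INST]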
def generate_fn(
    prompt, history, system_prompt, temperature=0.9, max_new_tokens=256,
    top_p=0.95, repetition_penalty=1.0, vectordb=vectordb,
):
    temperature = float(temperature)
    if temperature < 1e-2:
        temperature = 1e-2  # Keep temperature strictly positive for sampling.
    top_p = float(top_p)

    generate_kwargs = dict(
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        do_sample=True,
        seed=42,
    )

    # Retrieve context for the current question and fold it into the prompt.
    context_5 = context_fn(question_text=prompt, vectordb=vectordb)
    formatted_prompt = format_prompt(f"{system_prompt}, {prompt}", history, context_5)

    # Stream tokens back to the Gradio chat as they arrive.
    stream = client.text_generation(
        formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False
    )
    output = ""
    for response in stream:
        output += response.token.text
        yield output
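
# generate_fn is a generator that Gradio streams; it can also be exercised by
# hand (the question and system prompt here are only illustrative):
# for partial in generate_fn("Where is MANUU?", [], "Answer only from the prospectus."):
#     pass  # `partial` holds the answer streamed so far
# print(partial)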
# Extra chat inputs, passed positionally to generate_fn after (message, history).
additional_inputs = [
    gr.Textbox(
        label="System Prompt",
        max_lines=1,
        interactive=True,
    ),
    gr.Slider(
        label="Temperature",
        value=0.7,
        minimum=0.0,
        maximum=1.0,
        step=0.05,
        interactive=True,
        info="Higher values produce more diverse outputs",
    ),
    gr.Slider(
        label="Max new tokens",
        value=256,
        minimum=0,
        maximum=2048,
        step=64,
        interactive=True,
        info="The maximum number of new tokens",
    ),
    gr.Slider(
        label="Top-p (nucleus sampling)",
        value=0.90,
        minimum=0.0,
        maximum=1.0,
        step=0.05,
        interactive=True,
        info="Higher values sample more low-probability tokens",
    ),
    gr.Slider(
        label="Repetition penalty",
        value=1.3,
        minimum=1.0,
        maximum=2.0,
        step=0.05,
        interactive=True,
        info="Penalizes repeated tokens",
    ),
]
examples=[["Where is Maulana Azad National Urdu University?", None, None, None, None, None,],
[ "When was Department of Women Education established?", None, None, None, None, None, ],
["Tell me about Department of Public Administration", None, None, None, None, None,],
["What are Reservations for SCs/STs/OBCs /Women candidates/EWS Categories?", None, None, None, None, None,],
["What is Upper Age Limit limit for Admissions", None, None, None, None, None,],
["Fetch Details of Hostel Fee* (2022-23)?", None, None, None, None, None,],
["What is Entrance Test Schedule 2023-24?", None, None, None, None, None,],
]
gr.ChatInterface(
    fn=generate_fn,
    analytics_enabled=True,
    chatbot=gr.Chatbot(
        show_label=False,
        show_share_button=False,
        show_copy_button=True,
        likeable=True,
        layout="panel",
    ),
    additional_inputs=additional_inputs,
    title="MANUU Chatbot Mixtral 46.7B Model Instructor Base Context Embeddings",
    examples=examples,
    concurrency_limit=20,
).launch()