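# MANUU Chatbot (Hugging Face Space): a retrieval-augmented chatbot over the
# MANUU prospectus PDFs. Pages are extracted with PyMuPDF, embedded with
# hkunlp/instructor-base, stored in a persistent Chroma collection, and the
# top-5 retrieved pages are injected as context into Mixtral-8x7B-Instruct
# prompts served through the Hugging Face InferenceClient and a Gradio
# ChatInterface.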
import os
import pathlib
import pandas as pd
import fitz  # PyMuPDF
from langchain.vectorstores import Chroma
# from PyPDF2 import PdfReader
# from google.colab import files
# from google.colab import userdata
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.docstore.document import Document
from huggingface_hub import InferenceClient
import gradio as gr
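
# Extract text from the prospectus PDFs with PyMuPDF, one retrieval chunk per page.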
file_paths = ["docs/Prospectus_2023_24_Eng_Version.pdf",
              "docs/MANUU_UG_PROGRAMMES_PROSPECTUS_2022_23_Eng_5_April_2022.pdf"]
page_contents = []
for fname in file_paths:
    with fitz.open(fname) as doc:
        print("Total Pages in {} are {}".format(fname, len(doc)))
        for page in doc:
            text = page.get_text()
            # Skip table-of-contents pages, recognised by their long dot leaders.
            if "............" in text:
                continue
            page_contents.append(text)
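
# Embeddings run on CPU; alternative embedding models and the CUDA option are
# kept commented out for reference.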
embedding_model = HuggingFaceInstructEmbeddings(
    model_name="hkunlp/instructor-base",
    # model_name="jinaai/jina-embedding-b-en-v1",
    # model_name="WhereIsAI/UAE-Large-V1",
    # model_kwargs={"device": "cuda"}
    model_kwargs={"device": "cpu"},
)
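
# Wrap each page in a LangChain Document, carrying its row index as metadata
# so retrieved chunks can be traced back to a page.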
df_documents_chunks = pd.DataFrame({"doc_pages": page_contents})
df_documents_chunks["index_id"] = df_documents_chunks.index
print(df_documents_chunks)

def row_to_doc(row):
    return Document(metadata={'id': row['index_id']}, page_content=row['doc_pages'])

manuuindex_df_processed_documents = df_documents_chunks.apply(row_to_doc, axis=1).to_list()
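
# Build or reload the vector store: reuse the persisted Chroma index when the
# directory already exists, otherwise embed every page and write the index out.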
COLLECTION_NAME = 'Manuu_collection'
PERSIST_DIR = 'MANUU_dir4'

if os.path.exists(PERSIST_DIR):
    print('Existing Collection : ', COLLECTION_NAME)
    vectordb = Chroma(persist_directory=PERSIST_DIR,
                      collection_name=COLLECTION_NAME,
                      embedding_function=embedding_model)
    print(f"Collection {vectordb._collection.name} has {vectordb._collection.count()} documents...")
else:
    print('New Collection : ', COLLECTION_NAME)
    vectordb = Chroma.from_documents(documents=manuuindex_df_processed_documents,
                                     embedding=embedding_model,
                                     collection_name=COLLECTION_NAME,
                                     persist_directory=PERSIST_DIR,
                                     collection_metadata=None)
    # Chroma.persist() returns None, so its result is not assigned; it just
    # writes the vector database to PERSIST_DIR.
    vectordb.persist()
    print(f"Collection {vectordb._collection.name} has {vectordb._collection.count()} documents...")
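
# Generation is remote: Mixtral-8x7B (the 46.7B-parameter model named in the
# UI title) is called through the Hugging Face Inference API, so no model
# weights are loaded in this Space.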
client = InferenceClient(model="mistralai/Mixtral-8x7B-Instruct-v0.1")
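
# Retrieve the five prospectus pages most similar to the question and join
# them into one context string; similarity_search_with_score returns
# (Document, score) pairs.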
def context_fn(question_text, vectordb):
    relevant_chunks = vectordb.similarity_search_with_score(query=question_text, k=5)
    context_5 = "\n\n\n".join([doc.page_content for doc, _score in relevant_chunks])
    return context_5
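
# Assemble a Mixtral-instruct prompt: past turns as [INST] ... [/INST] pairs,
# then the retrieved CONTEXT inside the same <s>...</s> sequence, then the new
# question. Each final prompt is also appended to prompts.txt for inspection.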
def format_prompt(message, history, context_prompt):
    prompt = "<s>"
    for user_prompt, bot_response in history:
        prompt += f"[INST] {user_prompt}. Do not Give information from outside the Document Contexts and general Information[/INST]"
        prompt += f" {bot_response}\n"
    prompt += f" CONTEXT:{context_prompt}</s> "
    prompt += f"[INST] {message} [/INST]"
    # The with-block closes the file on exit; no explicit close() is needed.
    with open('prompts.txt', 'a') as file:
        print("user_prompt", prompt, file=file)
    return prompt
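
# Streaming callback for gr.ChatInterface: clamp the sampling parameters,
# fetch the retrieval context for the new question, and yield the growing
# answer token by token.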
def generate_fn(
    prompt, history, system_prompt, temperature=0.9, max_new_tokens=256,
    top_p=0.95, repetition_penalty=1.0, vectordb=vectordb,
):
    # Keep temperature strictly positive; the inference API requires
    # temperature > 0 when do_sample=True.
    temperature = float(temperature)
    if temperature < 1e-2:
        temperature = 1e-2
    top_p = float(top_p)

    generate_kwargs = dict(
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        do_sample=True,
        seed=42,
    )

    context_5 = context_fn(question_text=prompt, vectordb=vectordb)
    formatted_prompt = format_prompt(f"{system_prompt}, {prompt}", history, context_5)

    stream = client.text_generation(formatted_prompt, **generate_kwargs,
                                    stream=True, details=True, return_full_text=False)
    output = ""
    for response in stream:
        output += response.token.text
        yield output
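
# Extra UI controls; gr.ChatInterface passes their values positionally after
# (message, history), so the order here must match generate_fn's parameters:
# system_prompt, temperature, max_new_tokens, top_p, repetition_penalty.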
additional_inputs = [
    gr.Textbox(
        label="System Prompt",
        max_lines=1,
        interactive=True,
    ),
    gr.Slider(
        label="Temperature",
        value=0.7,
        minimum=0.0,
        maximum=1.0,
        step=0.05,
        interactive=True,
        info="Higher values produce more diverse outputs",
    ),
    gr.Slider(
        label="Max new tokens",
        value=256,
        minimum=0,
        maximum=2048,
        step=64,
        interactive=True,
        info="The maximum number of new tokens",
    ),
    gr.Slider(
        label="Top-p (nucleus sampling)",
        value=0.90,
        minimum=0.0,
        maximum=1.0,
        step=0.05,
        interactive=True,
        info="Higher values sample more low-probability tokens",
    ),
    gr.Slider(
        label="Repetition penalty",
        value=1.3,
        minimum=1.0,
        maximum=2.0,
        step=0.05,
        interactive=True,
        info="Penalize repeated tokens",
    ),
]
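
# Example questions for the UI. Each row is [message, *additional_inputs];
# the trailing Nones leave the five extra controls at their defaults.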
examples = [
    ["Where is Maulana Azad National Urdu University?", None, None, None, None, None],
    ["When was Department of Women Education established?", None, None, None, None, None],
    ["Tell me about Department of Public Administration", None, None, None, None, None],
    ["What are Reservations for SCs/STs/OBCs/Women candidates/EWS Categories?", None, None, None, None, None],
    ["What is the Upper Age Limit for Admissions?", None, None, None, None, None],
    ["Fetch Details of Hostel Fee* (2022-23)?", None, None, None, None, None],
    ["What is Entrance Test Schedule 2023-24?", None, None, None, None, None],
]
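
# Wire everything into a chat UI and launch the Space.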
gr.ChatInterface(
    fn=generate_fn,
    analytics_enabled=True,
    chatbot=gr.Chatbot(show_label=False, show_share_button=False,
                       show_copy_button=True, likeable=True, layout="panel"),
    additional_inputs=additional_inputs,
    title="MANUU Chatbot Mixtral 46.7B Model Instructor Base Context Embeddings",
    examples=examples,
    concurrency_limit=20,
).launch()