import os

import fitz  # PyMuPDF, used for PDF text extraction
import gradio as gr
import pandas as pd
from huggingface_hub import InferenceClient
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.vectorstores import Chroma

# Source PDFs: the MANUU prospectuses that serve as the knowledge base.
file_paths = [
    "docs/Prospectus_2023_24_Eng_Version.pdf",
    "docs/MANUU_UG_PROGRAMMES_PROSPECTUS_2022_23_Eng_5_April_2022.pdf",
]

# Extract one text chunk per PDF page, skipping table-of-contents pages
# (identified by their long runs of dotted leaders).
page_contents = []
for fname in file_paths:
    with fitz.open(fname) as doc:
        print(f"Total pages in {fname}: {len(doc)}")
        for page in doc:
            text = page.get_text()
            if "............" in text:
                continue
            page_contents.append(text)

# Instructor-base embeddings on CPU; switch model_kwargs to
# {"device": "cuda"} if a GPU is available.
embedding_model = HuggingFaceInstructEmbeddings(
    model_name="hkunlp/instructor-base",
    # model_name="jinaai/jina-embedding-b-en-v1",
    # model_name="WhereIsAI/UAE-Large-V1",
    model_kwargs={"device": "cpu"},
)

# One row per extracted page; the row index doubles as the document id.
df_documents_chunks = pd.DataFrame({"doc_pages": page_contents})
df_documents_chunks["index_id"] = df_documents_chunks.index
print(df_documents_chunks)

def row_to_doc(row):
    return Document(metadata={"id": row["index_id"]}, page_content=row["doc_pages"])

manuuindex_df_processed_documents = df_documents_chunks.apply(row_to_doc, axis=1).to_list()

COLLECTION_NAME = "Manuu_collection"
PERSIST_DIR = "MANUU_dir4"

# Reuse the persisted Chroma collection if it already exists; otherwise build it
# from the page documents and persist it to disk.
if os.path.exists(PERSIST_DIR):
    print("Existing collection:", COLLECTION_NAME)
    vectordb = Chroma(
        persist_directory=PERSIST_DIR,
        collection_name=COLLECTION_NAME,
        embedding_function=embedding_model,
    )
else:
    print("New collection:", COLLECTION_NAME)
    vectordb = Chroma.from_documents(
        documents=manuuindex_df_processed_documents,
        embedding=embedding_model,
        collection_name=COLLECTION_NAME,
        persist_directory=PERSIST_DIR,
        collection_metadata=None,
    )
    vectordb.persist()  # save the vector database as persistent files in PERSIST_DIR
print(f"Collection {vectordb._collection.name} has {vectordb._collection.count()} documents...")
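# Optional retrieval sanity check, a minimal sketch and not part of the original
# flow: print the top-3 matches for one question so embedding quality can be
# eyeballed before wiring up the LLM. `sample_question` is a hypothetical name,
# borrowed from the chat examples further down; comment this out once verified.
sample_question = "Where is Maulana Azad National Urdu University?"
for doc, score in vectordb.similarity_search_with_score(sample_question, k=3):
    print(f"score={score:.4f} id={doc.metadata.get('id')} text={doc.page_content[:120]!r}")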
# Hosted Mixtral-8x7B-Instruct via the Hugging Face Inference API.
client = InferenceClient(model="mistralai/Mixtral-8x7B-Instruct-v0.1")

def context_fn(question_text, vectordb):
    """Return the five most similar page chunks, joined into one context string."""
    relevant_chunks = vectordb.similarity_search_with_score(query=question_text, k=5)
    return "\n\n\n".join(chunk.page_content for chunk, _score in relevant_chunks)

def format_prompt(message, history, context_prompt):
    """Build a Mixtral [INST] prompt from the chat history, retrieved context, and message."""
    prompt = ""
    for user_prompt, bot_response in history:
        prompt += (
            f"[INST] {user_prompt}. Do not give information from outside the "
            f"document context or from general knowledge. [/INST]"
        )
        prompt += f" {bot_response}\n"
    prompt += f" CONTEXT:{context_prompt} "
    prompt += f"[INST] {message} [/INST]"
    # Append each prompt to a log file for later inspection.
    with open("prompts.txt", "a") as file:
        print("user_prompt", prompt, file=file)
    return prompt

def generate_fn(prompt, history, system_prompt, temperature=0.9, max_new_tokens=256,
                top_p=0.95, repetition_penalty=1.0, vectordb=vectordb):
    """Retrieve context for the question and stream the model's answer token by token."""
    temperature = max(float(temperature), 1e-2)
    top_p = float(top_p)
    generate_kwargs = dict(
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        do_sample=True,
        seed=42,
    )
    context_5 = context_fn(question_text=prompt, vectordb=vectordb)
    formatted_prompt = format_prompt(f"{system_prompt}, {prompt}", history, context_5)
    stream = client.text_generation(
        formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False
    )
    output = ""
    for response in stream:
        output += response.token.text
        yield output

additional_inputs = [
    gr.Textbox(label="System Prompt", max_lines=1, interactive=True),
    gr.Slider(label="Temperature", value=0.7, minimum=0.0, maximum=1.0, step=0.05,
              interactive=True, info="Higher values produce more diverse outputs"),
    gr.Slider(label="Max new tokens", value=256, minimum=0, maximum=2048, step=64,
              interactive=True, info="The maximum number of new tokens"),
    gr.Slider(label="Top-p (nucleus sampling)", value=0.90, minimum=0.0, maximum=1.0,
              step=0.05, interactive=True, info="Higher values sample more low-probability tokens"),
    gr.Slider(label="Repetition penalty", value=1.3, minimum=1.0, maximum=2.0, step=0.05,
              interactive=True, info="Penalize repeated tokens"),
]

examples = [
    ["Where is Maulana Azad National Urdu University?", None, None, None, None, None],
    ["When was the Department of Women Education established?", None, None, None, None, None],
    ["Tell me about the Department of Public Administration", None, None, None, None, None],
    ["What are the reservations for SC/ST/OBC/Women/EWS categories?", None, None, None, None, None],
    ["What is the upper age limit for admissions?", None, None, None, None, None],
    ["Fetch details of the hostel fee (2022-23)?", None, None, None, None, None],
    ["What is the Entrance Test Schedule 2023-24?", None, None, None, None, None],
]

gr.ChatInterface(
    fn=generate_fn,
    analytics_enabled=True,
    chatbot=gr.Chatbot(show_label=False, show_share_button=False, show_copy_button=True,
                       likeable=True, layout="panel"),
    additional_inputs=additional_inputs,
    title="MANUU Chatbot Mixtral 46.7B Model Instructor Base Context Embeddings",
    examples=examples,
    concurrency_limit=20,
).launch()
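# Usage note (assumptions, not stated in the original script): running this file
# starts a Gradio server, by default at http://127.0.0.1:7860. The Mixtral calls go
# through the hosted Hugging Face Inference API, so a valid token must be available
# to huggingface_hub, e.g. via `huggingface-cli login` or the HF_TOKEN environment
# variable.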