import os

import fitz  # PyMuPDF, used to extract text from the prospectus PDFs
import gradio as gr
import pandas as pd
from huggingface_hub import InferenceClient
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.vectorstores import Chroma
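
# --- Step 1: Load the prospectus PDFs and extract text page by page ---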


file_paths = ["docs/Prospectus_2023_24_Eng_Version.pdf","docs/MANUU_UG_PROGRAMMES_PROSPECTUS_2022_23_Eng_5_April_2022.pdf"]

page_contents = []
for fname in file_paths:
    with fitz.open(fname) as doc:
        print("Total Pages in {} are {}".format(fname, len(doc)))
        for page in doc:
            text = page.get_text()
            # Skip table-of-contents style pages (long runs of dot leaders)
            if "............" in text:
                continue
            page_contents.append(text)
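
# --- Step 2: Embedding model (Instructor-base, running on CPU) ---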


embedding_model = HuggingFaceInstructEmbeddings(
    model_name="hkunlp/instructor-base",
    # model_name="jinaai/jina-embedding-b-en-v1",
    # model_name="WhereIsAI/UAE-Large-V1",
    # model_kwargs={"device": "cuda"}
    model_kwargs={"device": "cpu"},
)
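
# Wrap each extracted page as a LangChain Document so it can be indexed in Chroma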

df_documents_chunks = pd.DataFrame({"doc_pages":page_contents})
df_documents_chunks["index_id"] = df_documents_chunks.index
print(df_documents_chunks)

def row_to_doc(row):
    # Wrap one page's text in a Document, keeping the row index as metadata
    return Document(
        metadata={"id": row["index_id"]},
        page_content=row["doc_pages"],
    )


manuuindex_df_processed_documents = df_documents_chunks.apply(row_to_doc, axis=1).to_list()
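
# --- Step 3: Build a new persistent Chroma collection, or reload an existing one ---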


COLLECTION_NAME='Manuu_collection'
PERSIST_DIR='MANUU_dir4'


if os.path.exists(PERSIST_DIR):
    print('Existing Collection : ', COLLECTION_NAME)
    vectordb = Chroma(persist_directory=PERSIST_DIR, collection_name=COLLECTION_NAME, embedding_function=embedding_model)
    print(f"Collection {vectordb._collection.name} has {vectordb._collection.count()} documents...")
else:
    print('New Collection : ', COLLECTION_NAME)
    vectordb = Chroma.from_documents(documents=manuuindex_df_processed_documents,
                                     embedding=embedding_model,
                                     collection_name=COLLECTION_NAME,
                                     persist_directory=PERSIST_DIR,
                                     collection_metadata=None)
    vectordb.persist()  # Save the vector database as persistent files in PERSIST_DIR (persist() returns None)

    print(f"Collection {vectordb._collection.name} has {vectordb._collection.count()} documents...")


client = InferenceClient(model="mistralai/Mixtral-8x7B-Instruct-v0.1")


def context_fn(question_text, vectordb):
    # Retrieve the 5 most similar page chunks and join them into one context string
    relevant_chunks = vectordb.similarity_search_with_score(query=question_text, k=5)
    context_5 = "\n\n\n".join([chunk.page_content for chunk, score in relevant_chunks])
    return context_5
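
# Prompt construction: combine chat history, retrieved context and the new question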


def format_prompt(message, history, context_prompt):
    # Build a Mixtral-instruct style prompt from the chat history, the retrieved
    # document context, and the new user message.
    prompt = "<s>"
    for user_prompt, bot_response in history:
        prompt += f"[INST] {user_prompt} [/INST]"
        prompt += f" {bot_response}</s> "
    prompt += (
        f"[INST] CONTEXT:\n{context_prompt}\n\n"
        f"Answer only from the document context above; do not give information from outside it or from general knowledge.\n"
        f"{message} [/INST]"
    )
    # Log the formatted prompt for later inspection
    with open("prompts.txt", "a") as file:
        print("user_prompt", prompt, file=file)
    return prompt
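
# Generation: retrieve context, format the prompt and stream the model's answer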


def generate_fn(
    prompt, history, system_prompt, temperature=0.9, max_new_tokens=256,
    top_p=0.95, repetition_penalty=1.0, vectordb=vectordb,
):
    # Clamp sampling parameters to safe values
    temperature = float(temperature)
    if temperature < 1e-2:
        temperature = 1e-2
    top_p = float(top_p)

    generate_kwargs = dict(
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        do_sample=True,
        seed=42,
    )
    # Retrieve the document context for this question and build the full prompt
    context_5 = context_fn(question_text=prompt, vectordb=vectordb)
    formatted_prompt = format_prompt(f"{system_prompt}, {prompt}", history, context_5)
    # Stream tokens from the Inference API, yielding the partial answer as it grows
    stream = client.text_generation(formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False)
    output = ""
    for response in stream:
        output += response.token.text
        yield output
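

# Gradio controls exposed under "Additional inputs" in the chat UI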
additional_inputs=[
    gr.Textbox(
        label="System Prompt",
        max_lines=1,
        interactive=True,
    ),
    gr.Slider(
        label="Temperature",
        value=0.7,
        minimum=0.0,
        maximum=1.0,
        step=0.05,
        interactive=True,
        info="Higher values produce more diverse outputs",
    ),
    gr.Slider(
        label="Max new tokens",
        value=256,
        minimum=0,
        maximum=2048,
        step=64,
        interactive=True,
        info="The maximum numbers of new tokens",
    ),
    gr.Slider(
        label="Top-p (nucleus sampling)",
        value=0.90,
        minimum=0.0,
        maximum=1,
        step=0.05,
        interactive=True,
        info="Higher values sample more low-probability tokens",
    ),
    gr.Slider(
        label="Repetition penalty",
        value=1.3,
        minimum=1.0,
        maximum=2.0,
        step=0.05,
        interactive=True,
        info="Penalize repeated tokens",
    )
]
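
# Example questions shown in the chat UI; answers are grounded in the prospectus PDFs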

examples=[["Where is Maulana Azad National Urdu University?", None, None, None, None, None,],
          [ "When was Department of Women Education established?", None, None, None, None, None, ],
          ["Tell me about  Department of Public Administration", None, None, None, None, None,],
          ["What are Reservations for SCs/STs/OBCs /Women candidates/EWS Categories?", None, None, None, None, None,],
          ["What is Upper Age Limit limit for Admissions", None, None, None, None, None,],
          ["Fetch Details of Hostel Fee* (2022-23)?", None, None, None, None, None,],
          ["What is Entrance Test Schedule 2023-24?", None, None, None, None, None,],
         ]
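
# Build and launch the Gradio chat interface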

gr.ChatInterface(
    fn=generate_fn,
    analytics_enabled=True,
    chatbot=gr.Chatbot(show_label=False, show_share_button=False, show_copy_button=True, likeable=True, layout="panel"),
    additional_inputs=additional_inputs,
    title="MANUU Chatbot - Mixtral 8x7B (46.7B) with Instructor-Base Context Embeddings",
    examples=examples,
    concurrency_limit=20,
).launch()