TheDavidYoungblood committed on
Commit fb75b53
1 Parent(s): 8e70e09

99 additions of files in the repo, 99 additions of files...

Files changed (3)
  1. FAISS-index.py +18 -20
  2. RAGbot.py +67 -36
  3. requirements.txt +9 -1
FAISS-index.py CHANGED
@@ -1,27 +1,25 @@
  from datasets import Dataset, load_from_disk
  import faiss
  import numpy as np
- from transformers import RagRetriever, RagTokenizer, RagSequenceForGeneration
+ from transformers import RagTokenizer, RagSequenceForGeneration

- # Example: Create a dataset
- data = {"text": ["This is a sample text.", "Another sample text."]}
- dataset = Dataset.from_dict(data)
+ def create_and_save_faiss_index(dataset_path, index_path):
+     dataset = load_from_disk(dataset_path)
+     passages = dataset["text"]

- # Save the dataset to disk
- dataset_path = "path/to/your/dataset"
- dataset.save_to_disk(dataset_path)
+     tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq")
+     model = RagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq")

- # Create FAISS index
- passages = dataset["text"]
- tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq")
- model = RagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq")
- passage_embeddings = model.get_encoder()(
-     tokenizer(passages, return_tensors="pt", padding=True, truncation=True)
- ).last_hidden_state.mean(dim=1).detach().numpy()
+     passage_embeddings = model.get_encoder()(
+         tokenizer(passages, return_tensors="pt", padding=True, truncation=True)
+     ).last_hidden_state.mean(dim=1).detach().numpy()

- index = faiss.IndexFlatL2(passage_embeddings.shape[1])
- index.add(passage_embeddings)
-
- # Save the index to disk
- index_path = "path/to/your/index"
- faiss.write_index(index, index_path)
+     index = faiss.IndexFlatL2(passage_embeddings.shape[1])
+     index.add(passage_embeddings)
+
+     faiss.write_index(index, index_path)
+
+ if __name__ == "__main__":
+     dataset_path = "path/to/your/hf_dataset"
+     index_path = "path/to/your/hf_index"
+     create_and_save_faiss_index(dataset_path, index_path)
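A caveat in both versions of this script: `model.get_encoder()` is called with the tokenizer's `BatchEncoding` as a single positional argument, but Transformers encoders expect keyword tensors (`input_ids`, `attention_mask`), so the embedding step would raise at runtime. A minimal sketch of that step with the encoding unpacked and gradients disabled (an assumed fix, not part of this commit):

import torch

# Assumed fix: unpack the BatchEncoding into the keyword arguments the
# encoder expects, and skip gradient tracking for a pure forward pass.
inputs = tokenizer(passages, return_tensors="pt", padding=True, truncation=True)
with torch.no_grad():
    encoder_output = model.get_encoder()(**inputs)
# Mean-pool the token states into one vector per passage, as the script does.
passage_embeddings = encoder_output.last_hidden_state.mean(dim=1).numpy()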
RAGbot.py CHANGED
@@ -10,7 +10,12 @@ from langchain.document_loaders import PyPDFLoader
  from langchain.prompts import PromptTemplate
  from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
  import spaces
- from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
+ from pymilvus import connections, Collection, FieldSchema, CollectionSchema, DataType
+ from datasets import Dataset, load_from_disk
+ import faiss
+ import numpy as np
+ from pastebin_api import get_protected_content

  class RAGbot:
      def __init__(self, config_path="config.yaml"):
@@ -20,7 +25,8 @@ class RAGbot:
          self.prompt = None
          self.documents = None
          self.embeddings = None
-         self.vectordb = None
+         self.zilliz_vectordb = None
+         self.hf_vectordb = None
          self.tokenizer = None
          self.model = None
          self.pipeline = None
@@ -38,22 +44,26 @@ class RAGbot:
          self.model_embeddings = config["modelEmbeddings"]
          self.auto_tokenizer = config["autoTokenizer"]
          self.auto_model_for_causal_lm = config["autoModelForCausalLM"]
+         self.zilliz_config = config["zilliz"]
+         self.persona_paste_key = config["personaPasteKey"]
+
+     def connect_to_zilliz(self):
+         connections.connect(
+             host=self.zilliz_config["host"],
+             port=self.zilliz_config["port"],
+             user=self.zilliz_config["user"],
+             password=self.zilliz_config["password"],
+             secure=True
+         )
+         self.zilliz_vectordb = Collection(self.zilliz_config["collection"])

      def load_embeddings(self):
          self.embeddings = HuggingFaceEmbeddings(model_name=self.model_embeddings)
-         print("Embedding model loaded")
-
-     def load_vectordb(self):
-         overlap = int((self.overlap_percentage / 100) * self.chunk_size)
-         text_splitter = RecursiveCharacterTextSplitter(
-             chunk_size=self.chunk_size,
-             chunk_overlap=overlap,
-             length_function=len,
-             add_start_index=True,
-         )
-         docs = text_splitter.split_documents(self.documents)
-         self.vectordb = Chroma.from_documents(docs, self.embeddings)
-         print("Vector store created")
+
+     def load_hf_vectordb(self, dataset_path, index_path):
+         dataset = load_from_disk(dataset_path)
+         index = faiss.read_index(index_path)
+         self.hf_vectordb = (dataset, index)

      @spaces.GPU
      def load_tokenizer(self):
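The new `connect_to_zilliz` and persona paths read extra keys from config.yaml. A plausible shape for those entries, inferred from the key lookups above (all values here are placeholders, not from the commit):

# Hypothetical config.yaml additions; key names come from the code,
# values are illustrative only.
zilliz:
  host: "your-cluster.zillizcloud.com"
  port: 443
  user: "db_user"
  password: "db_password"
  collection: "rag_passages"
personaPasteKey: "your_pastebin_paste_key"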
@@ -67,20 +77,34 @@ class RAGbot:
              model_kwargs={"torch_dtype": torch.bfloat16},
              device="cuda",
          )
-         print("Model pipeline loaded")

-     def get_organic_context(self, query):
-         documents = self.vectordb.similarity_search_with_relevance_scores(query, k=self.max_chunks_in_context)
-         context = self.format_seperator.join([doc.page_content for doc, score in documents])
+     def get_organic_context(self, query, use_hf=False):
+         if use_hf:
+             dataset, index = self.hf_vectordb
+             D, I = index.search(np.array([self.embeddings.embed_query(query)]), self.max_chunks_in_context)
+             context = self.format_seperator.join([dataset[i] for i in I[0]])
+         else:
+             result = self.zilliz_vectordb.search(
+                 data=[self.embeddings.embed_query(query)],
+                 anns_field="embeddings",
+                 param={"metric_type": "IP", "params": {"nprobe": 10}},
+                 limit=self.max_chunks_in_context,
+                 expr=None,
+             )
+             context = self.format_seperator.join([hit.entity.get('text') for hit in result[0]])
+
          self.current_context = context
-         print("Context Ready")
-         print(self.current_context)
+
+     def load_persona_data(self):
+         persona_content = get_protected_content(self.persona_paste_key)
+         persona_data = yaml.safe_load(persona_content)
+         self.persona_text = persona_data["persona_text"]

      @spaces.GPU
-     def create_organic_response(self, history, query):
-         self.get_organic_context(query)
+     def create_organic_response(self, history, query, use_hf=False):
+         self.get_organic_context(query, use_hf=use_hf)
          messages = [
-             {"role": "system", "content": "From the context given below, answer the user's question\n" + self.current_context},
+             {"role": "system", "content": f"Based on the given context, answer the user's question while maintaining the persona:\n{self.persona_text}\n\nContext:\n{self.current_context}"},
              {"role": "user", "content": query},
          ]
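Two caveats in the new `use_hf` branch as committed: indexing a `datasets.Dataset` with an integer returns a dict-like row, so joining `dataset[i]` would fail, and the FAISS index from FAISS-index.py was built from RAG encoder states while the query here is embedded with `HuggingFaceEmbeddings`, so the two vector spaces (and dimensions) only line up if the same model produced both. A sketch of the row lookup (an assumed fix, not in the commit):

# dataset[int(i)] yields a row dict such as {"text": "..."}; extract the
# "text" field before joining. int() also converts numpy integer indices.
context = self.format_seperator.join(
    [dataset[int(i)]["text"] for i in I[0]]
)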
@@ -97,17 +121,15 @@ class RAGbot:
              temperature=temp,
              top_p=0.9,
          )
-         print(outputs)
          return outputs[0]["generated_text"][len(prompt):]

      def process_file(self, file):
          self.documents = PyPDFLoader(file.name).load()
          self.load_embeddings()
-         self.load_vectordb()
-         self.create_organic_pipeline()
+         self.connect_to_zilliz()

      @spaces.GPU
-     def generate_response(self, history, query, file, chunk_size, chunk_overlap_percentage, model_temperature, max_chunks_in_context):
+     def generate_response(self, history, query, file, chunk_size, chunk_overlap_percentage, model_temperature, max_chunks_in_context, use_hf_index=False, hf_dataset_path=None, hf_index_path=None):
          self.chunk_size = chunk_size
          self.overlap_percentage = chunk_overlap_percentage
          self.model_temperatue = model_temperature
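Note that `process_file` now connects to Zilliz but no longer splits or stores the uploaded PDF in any vector store, so the Zilliz search path only returns results if the collection was populated elsewhere. A sketch of what re-adding ingestion could look like, assuming a collection schema whose field names ("text", "embeddings") match the search call above (hypothetical helper, not part of this commit):

    def ingest_documents(self):
        # Hypothetical: split the loaded PDF and insert chunks plus
        # embeddings into the Zilliz collection.
        overlap = int((self.overlap_percentage / 100) * self.chunk_size)
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size, chunk_overlap=overlap, length_function=len
        )
        docs = splitter.split_documents(self.documents)
        texts = [doc.page_content for doc in docs]
        vectors = self.embeddings.embed_documents(texts)
        # Column order must match the collection schema ("text", "embeddings").
        self.zilliz_vectordb.insert([texts, vectors])
        self.zilliz_vectordb.flush()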
@@ -115,19 +137,28 @@ class RAGbot:

          if not query:
              raise gr.Error(message='Submit a question')
-         if not file:
-             raise gr.Error(message='Upload a PDF')
-         if not self.processed:
-             self.process_file(file)
-             self.processed = True
-
-         result = self.create_organic_response(history="", query=query)
+
+         if use_hf_index:
+             if not hf_dataset_path or not hf_index_path:
+                 raise gr.Error(message='Provide HuggingFace dataset and index paths')
+             self.load_hf_vectordb(hf_dataset_path, hf_index_path)
+             result = self.create_organic_response(history="", query=query, use_hf=True)
+         else:
+             if not file:
+                 raise gr.Error(message='Upload a PDF')
+             if not self.processed:
+                 self.process_file(file)
+                 self.processed = True
+             result = self.create_organic_response(history="", query=query)
+
+         self.load_persona_data()
+         result = f"{self.persona_text}\n\n{result}"
+
          for char in result:
              history[-1][-1] += char
          return history, ""

      def render_file(self, file, chunk_size, chunk_overlap_percentage, model_temperature, max_chunks_in_context):
-         print(chunk_size)
          doc = fitz.open(file.name)
          page = doc[self.page]
          self.chunk_size = chunk_size
@@ -142,4 +173,4 @@ class RAGbot:
          if not text:
              raise gr.Error('Enter text')
          history.append((text, ''))
-         return history
+         return history
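Note that the persona text is both embedded in the system prompt and prepended to the returned result, so it will appear verbatim in the chat output. For reference, a minimal sketch of driving the new HuggingFace-index path end to end (the paths, parameter values, and mutable-list history shape are all assumptions, and the tokenizer/pipeline are presumed already loaded):

# Hypothetical driver for the use_hf_index path.
bot = RAGbot(config_path="config.yaml")
bot.load_embeddings()

query = "What does the document say about FAISS?"
history = [[query, ""]]  # Gradio-style [user_text, assistant_text] pairs
history, _ = bot.generate_response(
    history=history,
    query=query,
    file=None,  # unused when use_hf_index=True
    chunk_size=512,
    chunk_overlap_percentage=10,
    model_temperature=0.7,
    max_chunks_in_context=4,
    use_hf_index=True,
    hf_dataset_path="path/to/your/hf_dataset",
    hf_index_path="path/to/your/hf_index",
)
print(history[-1][-1])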
requirements.txt CHANGED
@@ -6,7 +6,15 @@ langchain-community
  tqdm
  accelerate
  pypdf
+ faiss-cpu
  protobuf>=3.20,<5
  poetry
+ pymilvus
+ chromadb
+ gradio
+ fitz
+ PyYAML
+ datasets
+ numpy
  requests
- pymilvus
+ python-dotenv
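One packaging caveat: the PyPI package named `fitz` is unrelated to PyMuPDF, which is what actually provides the `import fitz` module used in RAGbot.py, so the new pin should most likely read:

PyMuPDF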