oceansweep committed on
Commit
aa1db93
1 Parent(s): cfb144b

Update App_Function_Libraries/RAG/ChromaDB_Library.py

Browse files
App_Function_Libraries/RAG/ChromaDB_Library.py CHANGED
@@ -1,287 +1,290 @@
1
- import configparser
2
- import logging
3
- import sqlite3
4
- from typing import List, Dict, Any
5
-
6
- import chromadb
7
- import requests
8
- from chromadb import Settings
9
-
10
- from App_Function_Libraries.Chunk_Lib import improved_chunking_process
11
- from App_Function_Libraries.DB.DB_Manager import add_media_chunk, update_fts_for_media
12
- from App_Function_Libraries.LLM_API_Calls import get_openai_embeddings
13
-
14
- #######################################################################################################################
15
- #
16
- # Functions for ChromaDB
17
-
18
- # Get ChromaDB settings
19
- # Load configuration
20
- config = configparser.ConfigParser()
21
- config.read('config.txt')
22
- chroma_db_path = config.get('Database', 'chroma_db_path', fallback='chroma_db')
23
- chroma_client = chromadb.PersistentClient(path=chroma_db_path, settings=Settings(anonymized_telemetry=False))
24
-
25
- # Get embedding settings
26
- embedding_provider = config.get('Embeddings', 'provider', fallback='openai')
27
- embedding_model = config.get('Embeddings', 'model', fallback='text-embedding-3-small')
28
- embedding_api_key = config.get('Embeddings', 'api_key', fallback='')
29
- embedding_api_url = config.get('Embeddings', 'api_url', fallback='')
30
-
31
- # Get chunking options
32
- chunk_options = {
33
- 'method': config.get('Chunking', 'method', fallback='words'),
34
- 'max_size': config.getint('Chunking', 'max_size', fallback=400),
35
- 'overlap': config.getint('Chunking', 'overlap', fallback=200),
36
- 'adaptive': config.getboolean('Chunking', 'adaptive', fallback=False),
37
- 'multi_level': config.getboolean('Chunking', 'multi_level', fallback=False),
38
- 'language': config.get('Chunking', 'language', fallback='english')
39
- }
40
-
41
-
42
- def auto_update_chroma_embeddings(media_id: int, content: str):
43
- """
44
- Automatically update ChromaDB embeddings when a new item is ingested into the SQLite database.
45
-
46
- :param media_id: The ID of the newly ingested media item
47
- :param content: The content of the newly ingested media item
48
- """
49
- collection_name = f"media_{media_id}"
50
-
51
- # Initialize or get the ChromaDB collection
52
- collection = chroma_client.get_or_create_collection(name=collection_name)
53
-
54
- # Check if embeddings already exist for this media_id
55
- existing_embeddings = collection.get(ids=[f"{media_id}_chunk_{i}" for i in range(len(content))])
56
-
57
- if existing_embeddings and len(existing_embeddings) > 0:
58
- logging.info(f"Embeddings already exist for media ID {media_id}, skipping...")
59
- else:
60
- # Process and store content if embeddings do not already exist
61
- process_and_store_content(content, collection_name, media_id)
62
- logging.info(f"Updated ChromaDB embeddings for media ID: {media_id}")
63
-
64
-
65
- # Function to process content, create chunks, embeddings, and store in ChromaDB and SQLite
66
- def process_and_store_content(content: str, collection_name: str, media_id: int):
67
- # Process the content into chunks
68
- chunks = improved_chunking_process(content, chunk_options)
69
- texts = [chunk['text'] for chunk in chunks]
70
-
71
- # Generate embeddings for each chunk
72
- embeddings = [create_embedding(text) for text in texts]
73
-
74
- # Create unique IDs for each chunk using the media_id and chunk index
75
- ids = [f"{media_id}_chunk_{i}" for i in range(len(texts))]
76
-
77
- # Store the texts, embeddings, and IDs in ChromaDB
78
- store_in_chroma(collection_name, texts, embeddings, ids)
79
-
80
- # Store the chunk metadata in SQLite
81
- for i, chunk in enumerate(chunks):
82
- add_media_chunk(media_id, chunk['text'], chunk['start'], chunk['end'], ids[i])
83
-
84
- # Update the FTS table
85
- update_fts_for_media(media_id)
86
-
87
- # Function to store documents and their embeddings in ChromaDB
88
- def store_in_chroma(collection_name: str, texts: List[str], embeddings: List[List[float]], ids: List[str]):
89
- collection = chroma_client.get_or_create_collection(name=collection_name)
90
- collection.add(
91
- documents=texts,
92
- embeddings=embeddings,
93
- ids=ids
94
- )
95
-
96
- # Function to perform vector search using ChromaDB
97
- def vector_search(collection_name: str, query: str, k: int = 10) -> List[str]:
98
- query_embedding = create_embedding(query)
99
- collection = chroma_client.get_collection(name=collection_name)
100
- results = collection.query(
101
- query_embeddings=[query_embedding],
102
- n_results=k
103
- )
104
- return results['documents'][0]
105
-
106
-
107
- def create_embedding(text: str) -> List[float]:
108
- global embedding_provider, embedding_model, embedding_api_url, embedding_api_key
109
-
110
- if embedding_provider == 'openai':
111
- return get_openai_embeddings(text, embedding_model)
112
- elif embedding_provider == 'local':
113
- response = requests.post(
114
- embedding_api_url,
115
- json={"text": text, "model": embedding_model},
116
- headers={"Authorization": f"Bearer {embedding_api_key}"}
117
- )
118
- return response.json()['embedding']
119
- elif embedding_provider == 'huggingface':
120
- from transformers import AutoTokenizer, AutoModel
121
- import torch
122
-
123
- tokenizer = AutoTokenizer.from_pretrained(embedding_model)
124
- model = AutoModel.from_pretrained(embedding_model)
125
-
126
- inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
127
- with torch.no_grad():
128
- outputs = model(**inputs)
129
-
130
- # Use the mean of the last hidden state as the sentence embedding
131
- embeddings = outputs.last_hidden_state.mean(dim=1)
132
- return embeddings[0].tolist() # Convert to list for consistency
133
- else:
134
- raise ValueError(f"Unsupported embedding provider: {embedding_provider}")
135
-
136
-
137
- def create_all_embeddings(api_choice: str, model_or_url: str) -> str:
138
- try:
139
- all_content = get_all_content_from_database()
140
-
141
- if not all_content:
142
- return "No content found in the database."
143
-
144
- texts_to_embed = []
145
- embeddings_to_store = []
146
- ids_to_store = []
147
- collection_name = "all_content_embeddings"
148
-
149
- # Initialize or get the ChromaDB collection
150
- collection = chroma_client.get_or_create_collection(name=collection_name)
151
-
152
- for content_item in all_content:
153
- media_id = content_item['id']
154
- text = content_item['content']
155
-
156
- # Check if the embedding already exists in ChromaDB
157
- embedding_exists = collection.get(ids=[f"doc_{media_id}"])
158
-
159
- if embedding_exists:
160
- logging.info(f"Embedding already exists for media ID {media_id}, skipping...")
161
- continue # Skip if embedding already exists
162
-
163
- # Create the embedding
164
- if api_choice == "openai":
165
- embedding = create_openai_embedding(text, model_or_url)
166
- else: # Llama.cpp
167
- embedding = create_llamacpp_embedding(text, model_or_url)
168
-
169
- # Collect the text, embedding, and ID for batch storage
170
- texts_to_embed.append(text)
171
- embeddings_to_store.append(embedding)
172
- ids_to_store.append(f"doc_{media_id}")
173
-
174
- # Store all new embeddings in ChromaDB
175
- if texts_to_embed and embeddings_to_store:
176
- store_in_chroma(collection_name, texts_to_embed, embeddings_to_store, ids_to_store)
177
-
178
- return "Embeddings created and stored successfully for all new content."
179
- except Exception as e:
180
- logging.error(f"Error during embedding creation: {str(e)}")
181
- return f"Error: {str(e)}"
182
-
183
-
184
- def create_openai_embedding(text: str, model: str) -> List[float]:
185
- openai_api_key = config['API']['openai_api_key']
186
- embedding = get_openai_embeddings(text, model)
187
- return embedding
188
-
189
-
190
- def create_llamacpp_embedding(text: str, api_url: str) -> List[float]:
191
- response = requests.post(
192
- api_url,
193
- json={"input": text}
194
- )
195
- if response.status_code == 200:
196
- return response.json()['embedding']
197
- else:
198
- raise Exception(f"Error from Llama.cpp API: {response.text}")
199
-
200
-
201
- def get_all_content_from_database() -> List[Dict[str, Any]]:
202
- """
203
- Retrieve all media content from the database that requires embedding.
204
-
205
- Returns:
206
- List[Dict[str, Any]]: A list of dictionaries, each containing the media ID, content, title, and other relevant fields.
207
- """
208
- try:
209
- from App_Function_Libraries.DB.DB_Manager import db
210
- with db.get_connection() as conn:
211
- cursor = conn.cursor()
212
- cursor.execute("""
213
- SELECT id, content, title, author, type
214
- FROM Media
215
- WHERE is_trash = 0 -- Exclude items marked as trash
216
- """)
217
- media_items = cursor.fetchall()
218
-
219
- # Convert the results into a list of dictionaries
220
- all_content = [
221
- {
222
- 'id': item[0],
223
- 'content': item[1],
224
- 'title': item[2],
225
- 'author': item[3],
226
- 'type': item[4]
227
- }
228
- for item in media_items
229
- ]
230
-
231
- return all_content
232
-
233
- except sqlite3.Error as e:
234
- logging.error(f"Error retrieving all content from database: {e}")
235
- from App_Function_Libraries.DB.SQLite_DB import DatabaseError
236
- raise DatabaseError(f"Error retrieving all content from database: {e}")
237
-
238
-
239
- def store_in_chroma_with_citation(collection_name: str, texts: List[str], embeddings: List[List[float]], ids: List[str], sources: List[str]):
240
- collection = chroma_client.get_or_create_collection(name=collection_name)
241
- collection.add(
242
- documents=texts,
243
- embeddings=embeddings,
244
- ids=ids,
245
- metadatas=[{'source': source} for source in sources]
246
- )
247
-
248
-
249
- def check_embedding_status(selected_item):
250
- if not selected_item:
251
- return "Please select an item", ""
252
- item_id = selected_item.split('(')[0].strip()
253
- collection = chroma_client.get_or_create_collection(name="all_content_embeddings")
254
- result = collection.get(ids=[f"doc_{item_id}"])
255
- if result['ids']:
256
- embedding = result['embeddings'][0]
257
- embedding_preview = str(embedding[:50]) # Convert first 50 elements to string
258
- return f"Embedding exists for item: {item_id}", f"Embedding preview: {embedding_preview}..."
259
- else:
260
- return f"No embedding found for item: {item_id}", ""
261
-
262
-
263
- def create_new_embedding(selected_item, api_choice, openai_model, llamacpp_url):
264
- if not selected_item:
265
- return "Please select an item"
266
- item_id = selected_item.split('(')[0].strip()
267
- items = get_all_content_from_database()
268
- item = next((item for item in items if item['title'] == item_id), None)
269
- if not item:
270
- return f"Item not found: {item_id}"
271
-
272
- try:
273
- if api_choice == "OpenAI":
274
- embedding = create_embedding(item['content'])
275
- else: # Llama.cpp
276
- embedding = create_embedding(item['content'])
277
-
278
- collection_name = "all_content_embeddings"
279
- store_in_chroma(collection_name, [item['content']], [embedding], [f"doc_{item['id']}"])
280
- return f"New embedding created and stored for item: {item_id}"
281
- except Exception as e:
282
- return f"Error creating embedding: {str(e)}"
283
-
284
-
285
- #
286
- # End of Functions for ChromaDB
 
 
 
287
  #######################################################################################################################
 
1
+ import configparser
2
+ import logging
3
+ import sqlite3
4
+ from typing import List, Dict, Any
5
+
6
+ import chromadb
7
+ import requests
8
+ from chromadb import Settings
9
+
10
+ from App_Function_Libraries.Chunk_Lib import improved_chunking_process
11
+ from App_Function_Libraries.DB.DB_Manager import add_media_chunk, update_fts_for_media
12
+ from App_Function_Libraries.LLM_API_Calls import get_openai_embeddings
13
+
14
+ #######################################################################################################################
15
+ #
16
+ # Functions for ChromaDB
17
+
18
+ # Get ChromaDB settings
19
+ # Load configuration
20
config = configparser.ConfigParser()
# ConfigParser.read() returns the list of files it managed to parse and
# silently skips missing ones, so make the "running on fallbacks" case visible.
if not config.read('config.txt'):
    logging.warning("config.txt could not be read; using fallback ChromaDB, embedding and chunking settings")

chroma_db_path = config.get('Database', 'chroma_db_path', fallback='chroma_db')
chroma_client = chromadb.PersistentClient(path=chroma_db_path, settings=Settings(anonymized_telemetry=False))

import os
# NOTE(review): protobuf is understood to read this variable when it is first
# imported; chromadb is already imported above, so confirm this assignment
# happens early enough to have the intended effect.
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"

# Get embedding settings
embedding_provider = config.get('Embeddings', 'provider', fallback='openai')
embedding_model = config.get('Embeddings', 'model', fallback='text-embedding-3-small')
embedding_api_key = config.get('Embeddings', 'api_key', fallback='')
embedding_api_url = config.get('Embeddings', 'api_url', fallback='')

# Get chunking options (passed unchanged to improved_chunking_process)
chunk_options = {
    'method': config.get('Chunking', 'method', fallback='words'),
    'max_size': config.getint('Chunking', 'max_size', fallback=400),
    'overlap': config.getint('Chunking', 'overlap', fallback=200),
    'adaptive': config.getboolean('Chunking', 'adaptive', fallback=False),
    'multi_level': config.getboolean('Chunking', 'multi_level', fallback=False),
    'language': config.get('Chunking', 'language', fallback='english')
}
43
+
44
+
45
def auto_update_chroma_embeddings(media_id: int, content: str):
    """
    Automatically update ChromaDB embeddings when a new item is ingested into the SQLite database.

    Chunks are stored under IDs of the form "<media_id>_chunk_<n>" in the
    collection "media_<media_id>", so the presence of chunk 0 is used as the
    marker that this item has already been embedded.

    :param media_id: The ID of the newly ingested media item
    :param content: The content of the newly ingested media item
    """
    collection_name = f"media_{media_id}"

    # Initialize or get the ChromaDB collection
    collection = chroma_client.get_or_create_collection(name=collection_name)

    # collection.get() returns a dict-like result that is non-empty (and thus
    # truthy) even when no ID matched, so the 'ids' entry must be inspected.
    existing = collection.get(ids=[f"{media_id}_chunk_0"])
    if existing['ids']:
        logging.info(f"Embeddings already exist for media ID {media_id}, skipping...")
        return

    # Process and store content if embeddings do not already exist
    process_and_store_content(content, collection_name, media_id)
    logging.info(f"Updated ChromaDB embeddings for media ID: {media_id}")
66
+
67
+
68
+ # Function to process content, create chunks, embeddings, and store in ChromaDB and SQLite
69
def process_and_store_content(content: str, collection_name: str, media_id: int):
    """
    Chunk the content, embed every chunk, and persist the results.

    Chunk texts and embeddings go to the given ChromaDB collection; each
    chunk's text, start and end are recorded in SQLite via add_media_chunk,
    and the FTS table is refreshed afterwards.

    :param content: Raw text of the media item
    :param collection_name: Name of the ChromaDB collection to store into
    :param media_id: ID of the media item the chunks belong to
    """
    chunks = improved_chunking_process(content, chunk_options)
    if not chunks:
        # Nothing to embed; avoid handing empty lists to ChromaDB.
        logging.warning(f"No chunks produced for media ID {media_id}, nothing to embed")
        return

    texts = [chunk['text'] for chunk in chunks]

    # Generate embeddings for each chunk
    embeddings = [create_embedding(text) for text in texts]

    # Create unique IDs for each chunk using the media_id and chunk index
    ids = [f"{media_id}_chunk_{i}" for i in range(len(texts))]

    # Store the texts, embeddings, and IDs in ChromaDB
    store_in_chroma(collection_name, texts, embeddings, ids)

    # Store the chunk metadata in SQLite, keyed by the same chunk IDs
    for chunk, chunk_id in zip(chunks, ids):
        add_media_chunk(media_id, chunk['text'], chunk['start'], chunk['end'], chunk_id)

    # Update the FTS table
    update_fts_for_media(media_id)
89
+
90
+ # Function to store documents and their embeddings in ChromaDB
91
def store_in_chroma(collection_name: str, texts: List[str], embeddings: List[List[float]], ids: List[str]):
    """
    Add documents with their embeddings to a ChromaDB collection.

    The collection is created on first use.

    :param collection_name: Target collection name
    :param texts: Document texts
    :param embeddings: One embedding vector per text
    :param ids: One ID per text
    """
    target = chroma_client.get_or_create_collection(name=collection_name)
    target.add(ids=ids, embeddings=embeddings, documents=texts)
98
+
99
+ # Function to perform vector search using ChromaDB
100
def vector_search(collection_name: str, query: str, k: int = 10) -> List[str]:
    """
    Return the documents closest to the query in the given collection.

    :param collection_name: Name of an existing ChromaDB collection
    :param query: Free-text query, embedded with create_embedding
    :param k: Number of results requested from ChromaDB
    :return: The documents returned for the single query
    """
    query_vector = create_embedding(query)
    hits = chroma_client.get_collection(name=collection_name).query(
        query_embeddings=[query_vector],
        n_results=k,
    )
    # Only one query embedding was submitted, so take its result list.
    documents_per_query = hits['documents']
    return documents_per_query[0]
108
+
109
+
110
# Loaded Hugging Face (tokenizer, model) pairs keyed by model name, so the
# weights are not reloaded for every chunk that gets embedded.
_hf_model_cache = {}


def create_embedding(text: str) -> List[float]:
    """
    Create an embedding for the text using the configured provider.

    The provider, model, URL and key come from the module-level
    embedding_* settings.

    :param text: Text to embed
    :return: The embedding vector
    :raises ValueError: If the configured provider is not supported
    :raises requests.HTTPError: If the 'local' endpoint answers with an error status
    """
    if embedding_provider == 'openai':
        return get_openai_embeddings(text, embedding_model)

    if embedding_provider == 'local':
        response = requests.post(
            embedding_api_url,
            json={"text": text, "model": embedding_model},
            headers={"Authorization": f"Bearer {embedding_api_key}"}
        )
        # Fail with the HTTP error itself rather than a confusing KeyError or
        # JSON decoding error on an error payload.
        response.raise_for_status()
        return response.json()['embedding']

    if embedding_provider == 'huggingface':
        # Imported lazily so the heavy dependencies are only needed for this provider.
        from transformers import AutoTokenizer, AutoModel
        import torch

        if embedding_model not in _hf_model_cache:
            _hf_model_cache[embedding_model] = (
                AutoTokenizer.from_pretrained(embedding_model),
                AutoModel.from_pretrained(embedding_model),
            )
        tokenizer, model = _hf_model_cache[embedding_model]

        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)

        # Use the mean of the last hidden state as the sentence embedding
        embeddings = outputs.last_hidden_state.mean(dim=1)
        return embeddings[0].tolist()  # Convert to list for consistency

    raise ValueError(f"Unsupported embedding provider: {embedding_provider}")
138
+
139
+
140
def create_all_embeddings(api_choice: str, model_or_url: str) -> str:
    """
    Embed every media item that has no entry yet in "all_content_embeddings".

    :param api_choice: "openai" (any letter case) selects OpenAI; anything
        else is sent to a Llama.cpp endpoint
    :param model_or_url: OpenAI model name, or the Llama.cpp API URL
    :return: A status message; errors are logged and reported as "Error: ..."
    """
    try:
        all_content = get_all_content_from_database()

        if not all_content:
            return "No content found in the database."

        collection_name = "all_content_embeddings"

        # Initialize or get the ChromaDB collection
        collection = chroma_client.get_or_create_collection(name=collection_name)

        # Accept "OpenAI" as well as "openai" so both spellings used by callers work.
        use_openai = str(api_choice).lower() == "openai"

        texts_to_embed = []
        embeddings_to_store = []
        ids_to_store = []

        for content_item in all_content:
            media_id = content_item['id']
            text = content_item['content']
            doc_id = f"doc_{media_id}"

            # collection.get() returns a non-empty dict even when nothing
            # matched, so only its 'ids' entry says whether the entry exists.
            if collection.get(ids=[doc_id])['ids']:
                logging.info(f"Embedding already exists for media ID {media_id}, skipping...")
                continue

            # Create the embedding
            if use_openai:
                embedding = create_openai_embedding(text, model_or_url)
            else:  # Llama.cpp
                embedding = create_llamacpp_embedding(text, model_or_url)

            # Collect the text, embedding, and ID for batch storage
            texts_to_embed.append(text)
            embeddings_to_store.append(embedding)
            ids_to_store.append(doc_id)

        # Store all new embeddings in ChromaDB in a single call
        if texts_to_embed and embeddings_to_store:
            store_in_chroma(collection_name, texts_to_embed, embeddings_to_store, ids_to_store)

        return "Embeddings created and stored successfully for all new content."
    except Exception as e:
        logging.error(f"Error during embedding creation: {str(e)}")
        return f"Error: {str(e)}"
185
+
186
+
187
def create_openai_embedding(text: str, model: str) -> List[float]:
    """
    Create an embedding for the text with the given OpenAI model.

    Delegates to get_openai_embeddings. No API key is read here: the previous
    lookup was never passed on and could raise KeyError for an unused value.

    :param text: Text to embed
    :param model: OpenAI embedding model name
    :return: The embedding vector
    """
    return get_openai_embeddings(text, model)
191
+
192
+
193
def create_llamacpp_embedding(text: str, api_url: str) -> List[float]:
    """
    Request an embedding for the text from a Llama.cpp HTTP endpoint.

    :param text: Text to embed, sent as the "input" field of the JSON body
    :param api_url: URL of the embedding endpoint
    :return: The "embedding" field of the JSON response
    :raises Exception: If the endpoint responds with a status other than 200
    """
    reply = requests.post(api_url, json={"input": text})
    if reply.status_code != 200:
        raise Exception(f"Error from Llama.cpp API: {reply.text}")
    payload = reply.json()
    return payload['embedding']
202
+
203
+
204
def get_all_content_from_database() -> List[Dict[str, Any]]:
    """
    Fetch every non-trashed row of the Media table as a dictionary.

    Returns:
        List[Dict[str, Any]]: One dictionary per row with the keys 'id',
        'content', 'title', 'author' and 'type'.

    Raises:
        DatabaseError: If the SQLite query fails.
    """
    try:
        from App_Function_Libraries.DB.DB_Manager import db
        with db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute("""
                SELECT id, content, title, author, type
                FROM Media
                WHERE is_trash = 0 -- Exclude items marked as trash
            """)
            rows = cursor.fetchall()

            # Field names in the same order as the SELECT column list
            field_names = ('id', 'content', 'title', 'author', 'type')
            records = []
            for row in rows:
                records.append(dict(zip(field_names, row)))

        return records

    except sqlite3.Error as e:
        logging.error(f"Error retrieving all content from database: {e}")
        from App_Function_Libraries.DB.SQLite_DB import DatabaseError
        raise DatabaseError(f"Error retrieving all content from database: {e}")
240
+
241
+
242
def store_in_chroma_with_citation(collection_name: str, texts: List[str], embeddings: List[List[float]], ids: List[str], sources: List[str]):
    """
    Add documents to a ChromaDB collection, tagging each with its source.

    :param collection_name: Target collection name (created if missing)
    :param texts: Document texts
    :param embeddings: One embedding vector per text
    :param ids: One ID per text
    :param sources: One source per text, stored as {'source': ...} metadata
    """
    target = chroma_client.get_or_create_collection(name=collection_name)

    citation_metadata = []
    for source in sources:
        citation_metadata.append({'source': source})

    target.add(
        ids=ids,
        metadatas=citation_metadata,
        embeddings=embeddings,
        documents=texts,
    )
250
+
251
+
252
def check_embedding_status(selected_item):
    """
    Report whether an embedding is stored for the selected item.

    The text before the first '(' in the selection is used as the item
    identifier and looked up as "doc_<identifier>" in the
    "all_content_embeddings" collection.

    :param selected_item: Selection string, e.g. from a UI dropdown
    :return: Tuple of (status message, preview of the first 50 embedding values or "")
    """
    if not selected_item:
        return "Please select an item", ""

    item_id = selected_item.split('(')[0].strip()
    collection = chroma_client.get_or_create_collection(name="all_content_embeddings")

    # Embeddings are not part of a get() result unless explicitly requested.
    result = collection.get(ids=[f"doc_{item_id}"], include=["embeddings"])
    if not result['ids']:
        return f"No embedding found for item: {item_id}", ""

    embedding = result['embeddings'][0]
    embedding_preview = str(embedding[:50])  # Convert first 50 elements to string
    return f"Embedding exists for item: {item_id}", f"Embedding preview: {embedding_preview}..."
264
+
265
+
266
def create_new_embedding(selected_item, api_choice, openai_model, llamacpp_url):
    """
    Create (or replace) the embedding of one selected item.

    :param selected_item: Selection string; the text before the first '(' identifies the item
    :param api_choice: "OpenAI" (any letter case) uses OpenAI; anything else uses Llama.cpp
    :param openai_model: OpenAI embedding model name, used when OpenAI is chosen
    :param llamacpp_url: Llama.cpp embedding endpoint, used otherwise
    :return: A status message describing the outcome
    """
    if not selected_item:
        return "Please select an item"

    item_id = selected_item.split('(')[0].strip()
    items = get_all_content_from_database()
    # The identifier is matched against the title (as before) and also against
    # the numeric ID, since check_embedding_status treats the same token as an ID.
    # NOTE(review): confirm which of the two the selection string really carries.
    item = next(
        (entry for entry in items
         if entry['title'] == item_id or str(entry['id']) == item_id),
        None,
    )
    if not item:
        return f"Item not found: {item_id}"

    try:
        # Honour the caller's API choice instead of the globally configured provider.
        if str(api_choice).lower() == "openai":
            embedding = create_openai_embedding(item['content'], openai_model)
        else:  # Llama.cpp
            embedding = create_llamacpp_embedding(item['content'], llamacpp_url)

        # upsert so that an already stored embedding is replaced by the new one
        collection = chroma_client.get_or_create_collection(name="all_content_embeddings")
        collection.upsert(
            documents=[item['content']],
            embeddings=[embedding],
            ids=[f"doc_{item['id']}"]
        )
        return f"New embedding created and stored for item: {item_id}"
    except Exception as e:
        return f"Error creating embedding: {str(e)}"
286
+
287
+
288
+ #
289
+ # End of Functions for ChromaDB
290
  #######################################################################################################################