Spaces:
Runtime error
Runtime error
Commit
•
5b7578c
1
Parent(s):
75b3ab4
Added vector db
Browse files- notebooks/04_vector_db.ipynb +241 -0
- requirements.txt +1 -0
notebooks/04_vector_db.ipynb
ADDED
@@ -0,0 +1,241 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "markdown",
|
5 |
+
"id": "6a151ade-7d86-4a2e-bfe7-462089f4e04c",
|
6 |
+
"metadata": {},
|
7 |
+
"source": [
|
8 |
+
"# Approach\n",
|
9 |
+
"There are a number of aspects of choosing a vector db that might be unique to your situation. You should think through your HW, utilization, latency requirements, scale, etc before choosing. \n",
|
10 |
+
"\n",
|
11 |
+
"Im targeting a demo (low utilization, latency can be relaxed) that will live on a huggingface space. I have a small scale that could even fit in memory. I like [Qdrant](https://qdrant.tech) for this. "
|
12 |
+
]
|
13 |
+
},
|
14 |
+
{
|
15 |
+
"cell_type": "markdown",
|
16 |
+
"id": "b1b28232-b65d-41ce-88de-fd70b93a528d",
|
17 |
+
"metadata": {},
|
18 |
+
"source": [
|
19 |
+
"# Imports"
|
20 |
+
]
|
21 |
+
},
|
22 |
+
{
|
23 |
+
"cell_type": "code",
|
24 |
+
"execution_count": 1,
|
25 |
+
"id": "88408486-566a-4791-8ef2-5ee3e6941156",
|
26 |
+
"metadata": {
|
27 |
+
"tags": []
|
28 |
+
},
|
29 |
+
"outputs": [],
|
30 |
+
"source": [
|
31 |
+
"from IPython.core.interactiveshell import InteractiveShell\n",
|
32 |
+
"InteractiveShell.ast_node_interactivity = 'all'"
|
33 |
+
]
|
34 |
+
},
|
35 |
+
{
|
36 |
+
"cell_type": "code",
|
37 |
+
"execution_count": 2,
|
38 |
+
"id": "abb5186b-ee67-4e1e-882d-3d8d5b4575d4",
|
39 |
+
"metadata": {
|
40 |
+
"tags": []
|
41 |
+
},
|
42 |
+
"outputs": [],
|
43 |
+
"source": [
|
44 |
+
"from pathlib import Path\n",
|
45 |
+
"import pickle\n",
|
46 |
+
"\n",
|
47 |
+
"from tqdm.notebook import tqdm\n",
|
48 |
+
"from haystack.schema import Document\n",
|
49 |
+
"from qdrant_haystack import QdrantDocumentStore"
|
50 |
+
]
|
51 |
+
},
|
52 |
+
{
|
53 |
+
"cell_type": "code",
|
54 |
+
"execution_count": 3,
|
55 |
+
"id": "c4b82ea2-8b30-4c2e-99f0-9a30f2f1bfb7",
|
56 |
+
"metadata": {
|
57 |
+
"tags": []
|
58 |
+
},
|
59 |
+
"outputs": [
|
60 |
+
{
|
61 |
+
"name": "stdout",
|
62 |
+
"output_type": "stream",
|
63 |
+
"text": [
|
64 |
+
"/home/ec2-user/RAGDemo\n"
|
65 |
+
]
|
66 |
+
}
|
67 |
+
],
|
68 |
+
"source": [
|
69 |
+
"proj_dir = Path.cwd().parent\n",
|
70 |
+
"print(proj_dir)"
|
71 |
+
]
|
72 |
+
},
|
73 |
+
{
|
74 |
+
"cell_type": "markdown",
|
75 |
+
"id": "76119e74-f601-436d-a253-63c5a19d1c83",
|
76 |
+
"metadata": {},
|
77 |
+
"source": [
|
78 |
+
"# Config"
|
79 |
+
]
|
80 |
+
},
|
81 |
+
{
|
82 |
+
"cell_type": "code",
|
83 |
+
"execution_count": 4,
|
84 |
+
"id": "f6f74545-54a7-4f41-9f02-96964e1417f0",
|
85 |
+
"metadata": {
|
86 |
+
"tags": []
|
87 |
+
},
|
88 |
+
"outputs": [],
|
89 |
+
"source": [
|
90 |
+
"file_in = proj_dir / 'data/processed/simple_wiki_embeddings.pkl'"
|
91 |
+
]
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"cell_type": "markdown",
|
95 |
+
"id": "d2dd0df0-4274-45b3-9ee5-0205494e4d75",
|
96 |
+
"metadata": {
|
97 |
+
"tags": []
|
98 |
+
},
|
99 |
+
"source": [
|
100 |
+
"# Setup\n",
|
101 |
+
"Read in our list of dictionaries. This is the upper end for the machine Im using. This takes ~10GB of RAM. We could easily do this in batches of ~100k and be fine in most machines. "
|
102 |
+
]
|
103 |
+
},
|
104 |
+
{
|
105 |
+
"cell_type": "code",
|
106 |
+
"execution_count": 5,
|
107 |
+
"id": "3c08e039-3686-4eca-9f87-7c469e3f19bc",
|
108 |
+
"metadata": {
|
109 |
+
"tags": []
|
110 |
+
},
|
111 |
+
"outputs": [
|
112 |
+
{
|
113 |
+
"name": "stdout",
|
114 |
+
"output_type": "stream",
|
115 |
+
"text": [
|
116 |
+
"CPU times: user 11.6 s, sys: 2.25 s, total: 13.9 s\n",
|
117 |
+
"Wall time: 18.1 s\n"
|
118 |
+
]
|
119 |
+
}
|
120 |
+
],
|
121 |
+
"source": [
|
122 |
+
"%%time\n",
|
123 |
+
"with open(file_in, 'rb') as handle:\n",
|
124 |
+
" documents = pickle.load(handle)"
|
125 |
+
]
|
126 |
+
},
|
127 |
+
{
|
128 |
+
"cell_type": "markdown",
|
129 |
+
"id": "98aec715-8d97-439e-99c0-0eff63df386b",
|
130 |
+
"metadata": {},
|
131 |
+
"source": [
|
132 |
+
"Convert the dictionaries to `Documents`"
|
133 |
+
]
|
134 |
+
},
|
135 |
+
{
|
136 |
+
"cell_type": "code",
|
137 |
+
"execution_count": 6,
|
138 |
+
"id": "4821e3c1-697d-4b69-bae3-300168755df9",
|
139 |
+
"metadata": {
|
140 |
+
"tags": []
|
141 |
+
},
|
142 |
+
"outputs": [],
|
143 |
+
"source": [
|
144 |
+
"documents = [Document.from_dict(d) for d in documents]"
|
145 |
+
]
|
146 |
+
},
|
147 |
+
{
|
148 |
+
"cell_type": "markdown",
|
149 |
+
"id": "676f644c-fb09-4d17-89ba-30c92aad8777",
|
150 |
+
"metadata": {},
|
151 |
+
"source": [
|
152 |
+
"Instantiate our `DocumentStore`. Note that Im saving this to disk, this is for portability which is good considering I want to move from this ec2 instance into a Hugging Face Space. \n",
|
153 |
+
"\n",
|
154 |
+
"Note that if you are doing this at scale, you should use a proper instance and not saving to file. You should also take a [measured ingestion](https://qdrant.tech/documentation/tutorials/bulk-upload/) approach instead of using a convenient loader. "
|
155 |
+
]
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"cell_type": "code",
|
159 |
+
"execution_count": 7,
|
160 |
+
"id": "e51b6e19-3be8-4cb0-8b65-9d6f6121f660",
|
161 |
+
"metadata": {
|
162 |
+
"tags": []
|
163 |
+
},
|
164 |
+
"outputs": [],
|
165 |
+
"source": [
|
166 |
+
"document_store = QdrantDocumentStore(\n",
|
167 |
+
" path=str(proj_dir/'Qdrant'),\n",
|
168 |
+
" index=\"RAGDemo\",\n",
|
169 |
+
" embedding_dim=768,\n",
|
170 |
+
" recreate_index=True,\n",
|
171 |
+
" hnsw_config={\"m\": 16, \"ef_construct\": 64} # Optional\n",
|
172 |
+
")"
|
173 |
+
]
|
174 |
+
},
|
175 |
+
{
|
176 |
+
"cell_type": "code",
|
177 |
+
"execution_count": 9,
|
178 |
+
"id": "55fbcd5d-922c-4e93-a37a-974ba84464ac",
|
179 |
+
"metadata": {
|
180 |
+
"tags": []
|
181 |
+
},
|
182 |
+
"outputs": [
|
183 |
+
{
|
184 |
+
"name": "stderr",
|
185 |
+
"output_type": "stream",
|
186 |
+
"text": [
|
187 |
+
"270000it [28:43, 156.68it/s] "
|
188 |
+
]
|
189 |
+
},
|
190 |
+
{
|
191 |
+
"name": "stdout",
|
192 |
+
"output_type": "stream",
|
193 |
+
"text": [
|
194 |
+
"CPU times: user 13min 23s, sys: 48.6 s, total: 14min 12s\n",
|
195 |
+
"Wall time: 28min 43s\n"
|
196 |
+
]
|
197 |
+
},
|
198 |
+
{
|
199 |
+
"name": "stderr",
|
200 |
+
"output_type": "stream",
|
201 |
+
"text": [
|
202 |
+
"\n"
|
203 |
+
]
|
204 |
+
}
|
205 |
+
],
|
206 |
+
"source": [
|
207 |
+
"%%time\n",
|
208 |
+
"document_store.write_documents(documents, batch_size=5_000)"
|
209 |
+
]
|
210 |
+
},
|
211 |
+
{
|
212 |
+
"cell_type": "code",
|
213 |
+
"execution_count": null,
|
214 |
+
"id": "9a073815-0191-48f7-890f-a4e4ecc0f9f1",
|
215 |
+
"metadata": {},
|
216 |
+
"outputs": [],
|
217 |
+
"source": []
|
218 |
+
}
|
219 |
+
],
|
220 |
+
"metadata": {
|
221 |
+
"kernelspec": {
|
222 |
+
"display_name": "Python 3 (ipykernel)",
|
223 |
+
"language": "python",
|
224 |
+
"name": "python3"
|
225 |
+
},
|
226 |
+
"language_info": {
|
227 |
+
"codemirror_mode": {
|
228 |
+
"name": "ipython",
|
229 |
+
"version": 3
|
230 |
+
},
|
231 |
+
"file_extension": ".py",
|
232 |
+
"mimetype": "text/x-python",
|
233 |
+
"name": "python",
|
234 |
+
"nbconvert_exporter": "python",
|
235 |
+
"pygments_lexer": "ipython3",
|
236 |
+
"version": "3.10.9"
|
237 |
+
}
|
238 |
+
},
|
239 |
+
"nbformat": 4,
|
240 |
+
"nbformat_minor": 5
|
241 |
+
}
|
requirements.txt
CHANGED
@@ -1,5 +1,6 @@
|
|
1 |
wikiextractor==3.0.6
|
2 |
farm-haystack[inference]==1.20.1
|
|
|
3 |
ipywidgets==8.1.1
|
4 |
tqdm==4.66.1
|
5 |
aiohttp-3.8.6
|
|
|
1 |
wikiextractor==3.0.6
|
2 |
farm-haystack[inference]==1.20.1
|
3 |
+
qdrant-haystack==1.0.11
|
4 |
ipywidgets==8.1.1
|
5 |
tqdm==4.66.1
|
6 |
aiohttp==3.8.6
|