Spaces:
Sleeping
Sleeping
feat: add init scripts
Browse files- init_data.py +39 -0
- translated-content +1 -0
init_data.py
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from qdrant_client import QdrantClient
|
2 |
+
from qdrant_client.http.models import Distance, VectorParams
|
3 |
+
from qdrant_client.http.models import PointStruct
|
4 |
+
import tqdm
|
5 |
+
import glob
|
6 |
+
import model
|
7 |
+
import re
|
8 |
+
|
9 |
+
# Matches optional leading whitespace, a ``---`` fence, one or more line
# breaks, and then a ``title:`` key (case-insensitive). Group 1 captures the
# rest of that line, stopping early if a ``---`` sequence appears in it.
TITLE_PATTERN = re.compile(r'\s*---[\n\r]+title:(((?!---).)+)', re.M | re.I)


def extract_title(text, fallback):
    """Return the front-matter title of *text*, or *fallback* if none is found.

    The title is only recognised when ``title:`` is the first key directly
    after the opening ``---`` fence at the start of the document. Surrounding
    whitespace is stripped from the captured value.

    Args:
        text: Full contents of a markdown document.
        fallback: Value returned when no title can be extracted.

    Returns:
        The stripped title string, or *fallback* unchanged.
    """
    found = TITLE_PATTERN.match(text)
    return found.group(1).strip() if found else fallback


def main():
    """Rebuild the ``mdn-docs`` Qdrant collection from the zh-cn markdown files.

    Every ``*.md`` file under ``translated-content/files/zh-cn`` is read,
    embedded with ``model.encode`` and upserted as one point whose payload
    holds the document title and its full text.
    """
    client = QdrantClient("127.0.0.1", port=6333)
    collection_name = "mdn-docs"
    # NOTE(review): recreate_collection replaces any existing collection of
    # this name, so previously indexed points are lost on every run. Recent
    # qdrant-client releases appear to deprecate it -- confirm against the
    # pinned client version before upgrading.
    client.recreate_collection(
        collection_name=collection_name,
        # presumably 768 is the output dimension of model.encode -- confirm
        vectors_config=VectorParams(size=768, distance=Distance.COSINE),
    )

    # glob returns paths in arbitrary order; sorting keeps the mapping from
    # file to point id stable from one run to the next.
    files = sorted(
        glob.glob("translated-content/files/zh-cn/**/*.md", recursive=True)
    )
    print(len(files))

    # Point ids start at 1, one per file, in sorted path order.
    for point_id, path in enumerate(tqdm.tqdm(files), start=1):
        # tqdm.write keeps the per-file log line from corrupting the
        # progress bar; it is emitted before the read so a decode failure
        # can be traced to the offending file.
        tqdm.tqdm.write(f"file {path}")

        # Hold the file open only for the read, not for the embedding and
        # the network round trip that follow.
        with open(path, 'r', encoding='utf-8') as f:
            text = f.read()

        # Fall back to the file path when no front-matter title is present.
        title = extract_title(text, path)

        vector = model.encode(text)
        client.upsert(
            collection_name=collection_name,
            wait=True,
            points=[
                PointStruct(
                    id=point_id,
                    vector=vector,
                    payload={"title": title, "text": text},
                ),
            ],
        )


if __name__ == '__main__':
    main()
|
39 |
+
|
translated-content
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
Subproject commit 79462bd3fd2533e3b71a117d1c98fafb8d4ca0e2
|