Spaces:
Sleeping
Sleeping
File size: 1,251 Bytes
03f8b69 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 |
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
from qdrant_client.http.models import PointStruct
import tqdm
import glob
import model
import re
# Front-matter title matcher, compiled once rather than on every file.
# It only matches when `title:` is the first key directly after the opening
# `---` line; the capture runs to the end of that line, or up to a literal
# `---` if one appears inside the title.
_TITLE_PATTERN = re.compile(r'\s*---[\n\r]+title:(((?!---).)+)', re.M|re.I)


def extract_title(text, fallback):
    """Return the front-matter title of a Markdown document.

    Args:
        text: Full contents of the Markdown file.
        fallback: Value returned when no leading ``title:`` entry is found
            (the script passes the file path).

    Returns:
        The title with surrounding whitespace stripped, or ``fallback``.
    """
    found = _TITLE_PATTERN.match(text)
    if found:
        return found.group(1).strip()
    return fallback


def main():
    """Rebuild the ``mdn-docs`` collection from the zh-cn Markdown files.

    Every ``*.md`` file under ``translated-content/files/zh-cn`` is read,
    encoded with ``model.encode`` and upserted as one point whose payload
    holds the document title and its full text.
    """
    client = QdrantClient("127.0.0.1", port=6333)
    collection_name = "mdn-docs"
    # NOTE(review): per the qdrant-client docs, recreate_collection drops any
    # existing collection with this name first, so each run starts from an
    # empty index. Newer client releases reportedly deprecate it in favour of
    # collection_exists + create_collection -- confirm against the pinned
    # client version before changing.
    client.recreate_collection(
        collection_name=collection_name,
        # NOTE(review): size=768 presumably matches the length of the vectors
        # returned by model.encode -- verify against the model module.
        vectors_config=VectorParams(size=768, distance=Distance.COSINE),
    )
    # Sorted so that the sequential point ids are reproducible between runs;
    # glob.glob itself makes no ordering guarantee.
    files = sorted(glob.glob("translated-content/files/zh-cn/**/*.md", recursive=True))
    print(len(files))
    # Point ids are the 1-based position of the file in the sorted list.
    for point_id, file in enumerate(tqdm.tqdm(files), start=1):
        print('file', file)
        # Close the file as soon as it is read; encoding and the upsert
        # below do not need the handle.
        with open(file, 'r', encoding='utf-8') as f:
            text = f.read()
        title = extract_title(text, fallback=file)
        vector = model.encode(text)
        client.upsert(
            collection_name=collection_name,
            wait=True,
            points=[
                PointStruct(id=point_id, vector=vector, payload={"title": title, "text": text }),
            ],
        )


if __name__ == '__main__':
    main()
|