Spaces:
Runtime error
Runtime error
feat: add init scripts
Browse files- init_data.py +39 -0
- translated-content +1 -0
init_data.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from qdrant_client import QdrantClient
|
| 2 |
+
from qdrant_client.http.models import Distance, VectorParams
|
| 3 |
+
from qdrant_client.http.models import PointStruct
|
| 4 |
+
import tqdm
|
| 5 |
+
import glob
|
| 6 |
+
import model
|
| 7 |
+
import re
|
| 8 |
+
|
| 9 |
+
# Front-matter title matcher. Anchored at the start of the document (used with
# ``.match``): optional leading whitespace, a ``---`` fence, line break(s), then
# a ``title:`` key. The capture runs to the end of that line (``.`` does not
# cross newlines) or stops early just before a literal ``---``.
# Compiled once here so the loop in ``main`` does not re-specify it per file.
TITLE_PATTERN = re.compile(r'\s*---[\n\r]+title:(((?!---).)+)', re.M | re.I)


def extract_title(text, fallback):
    """Return the front-matter title of *text*, or *fallback* if none is found.

    Args:
        text: Full contents of a markdown document.
        fallback: Value returned when *text* does not begin with a ``---``
            fence immediately followed by a ``title:`` line (the caller
            passes the file path).

    Returns:
        The captured title with surrounding whitespace stripped, or
        *fallback* unchanged when the pattern does not match.
    """
    found = TITLE_PATTERN.match(text)
    if found:
        return found.group(1).strip()
    return fallback


def main():
    """Rebuild the ``mdn-docs`` Qdrant collection from the zh-cn markdown files.

    Connects to Qdrant on 127.0.0.1:6333, (re)creates the collection, then for
    every ``*.md`` file under ``translated-content/files/zh-cn`` encodes the
    whole file with ``model.encode`` and upserts one point whose payload holds
    the extracted title and the full text.
    """
    client = QdrantClient("127.0.0.1", port=6333)
    collection_name = "mdn-docs"
    # NOTE(review): size=768 presumably matches the dimensionality of the
    # vectors returned by ``model.encode`` -- confirm against the model module.
    client.recreate_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=768, distance=Distance.COSINE),
    )

    files = glob.glob("translated-content/files/zh-cn/**/*.md", recursive=True)
    print(len(files))

    # Point ids are 1-based positions in ``files``; enumerate replaces the
    # hand-maintained counter.
    for point_id, file in enumerate(tqdm.tqdm(files), start=1):
        # tqdm.write prints the same line as print() would, but without
        # breaking the progress bar rendering.
        tqdm.tqdm.write(f'file {file}')

        # Only the read needs the handle; close it before the slow
        # encode/upsert steps.
        with open(file, 'r', encoding='utf-8') as f:
            text = f.read()

        title = extract_title(text, file)
        vector = model.encode(text)
        client.upsert(
            collection_name=collection_name,
            wait=True,
            points=[
                PointStruct(
                    id=point_id,
                    vector=vector,
                    payload={"title": title, "text": text},
                ),
            ],
        )


if __name__ == '__main__':
    main()
|
| 39 |
+
|
translated-content
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
Subproject commit 79462bd3fd2533e3b71a117d1c98fafb8d4ca0e2
|