from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
from qdrant_client.http.models import PointStruct
import tqdm
import glob
import model
import re

# Front-matter title matcher: optional leading whitespace, a `---` line, then a
# `title:` key as the very first entry. The capture stops at the end of the
# line or at the next `---`, whichever comes first. Compiled once because it
# is applied to every file.
_TITLE_RE = re.compile(r'\s*---[\n\r]+title:(((?!---).)+)', re.M | re.I)


def extract_title(text, fallback):
    """Return the front-matter title of a markdown document.

    Args:
        text: Full contents of the markdown file.
        fallback: Value returned when no leading ``title:`` entry is found
            (the script passes the file path).

    Returns:
        The stripped title string, or ``fallback`` if the document does not
        start with a front-matter block whose first key is ``title``.
    """
    found = _TITLE_RE.match(text)
    if found is None:
        return fallback
    return found.group(1).strip()


def main():
    """Rebuild the ``mdn-docs`` Qdrant collection from the zh-CN markdown files.

    Every ``*.md`` file under ``translated-content/files/zh-cn`` is read,
    embedded with ``model.encode`` and upserted as one point whose payload
    holds the title and the full text. Point ids are 1-based positions in the
    globbed file list.
    """
    client = QdrantClient("127.0.0.1", port=6333)
    collection_name = "mdn-docs"
    # NOTE(review): recreate_collection discards any existing collection of
    # this name; newer qdrant-client releases reportedly deprecate it in
    # favour of delete + create -- confirm against the pinned client version.
    client.recreate_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=768, distance=Distance.COSINE),
    )

    files = glob.glob("translated-content/files/zh-cn/**/*.md", recursive=True)
    print(len(files))

    for point_id, path in enumerate(tqdm.tqdm(files), start=1):
        # tqdm.write keeps the progress bar intact; a bare print() would
        # interleave with the bar and garble it.
        tqdm.tqdm.write(f"file {path}")

        # Only the read needs the handle; release it before the slow
        # encode/upsert steps.
        with open(path, 'r', encoding='utf-8') as f:
            text = f.read()

        title = extract_title(text, path)

        # Assumes model.encode returns a 768-dimensional vector matching the
        # collection config above -- TODO confirm against the model module.
        vector = model.encode(text)
        client.upsert(
            collection_name=collection_name,
            wait=True,
            points=[
                PointStruct(
                    id=point_id,
                    vector=vector,
                    payload={"title": title, "text": text},
                ),
            ],
        )


if __name__ == '__main__':
    main()