from pathlib import Path
from typing import List

from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_openai import OpenAIEmbeddings

import configs

# Shared embedding model; requires OPENAI_API_KEY to be set in the environment.
embeddings_model = OpenAIEmbeddings()


def process_documents(doc_storage_path: str):
    """Split every .txt file under doc_storage_path and index the chunks in Chroma."""
    print("doc preprocessing...")
    doc_directory = Path(doc_storage_path)
    docs: List[Document] = []
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=configs.CHUNK_SIZE, chunk_overlap=configs.CHUNK_OVERLAP
    )
    # Open (or create) the persistent Chroma collection once, up front.
    doc_search = Chroma(
        persist_directory=configs.STORE_FILE, embedding_function=embeddings_model
    )
    for file_path in doc_directory.glob("*.txt"):
        loader = TextLoader(str(file_path))
        documents = loader.load()
        # Accumulate chunks across files instead of overwriting them each iteration.
        docs.extend(text_splitter.split_documents(documents))
    if docs:
        # Embed and add all chunks to the existing collection in one call,
        # rather than rebuilding the store per file with from_documents().
        doc_search.add_documents(docs)
    doc_search.persist()
    print("doc preprocessing end.")
    return doc_search


def format_docs(docs):
    return "\n\n".join([d.page_content for d in docs])