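"""Knowledge-graph chunker.

Splits a document into sections with the naive parser, builds
entity/relationship graph chunks from them, and appends the raw sections
as plain-text chunks so both views are indexed together.
"""
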
import re

from graphrag.index import build_knowledge_graph_chunks
from rag.app import naive
from rag.nlp import rag_tokenizer, tokenize_chunks


def chunk(filename, binary, tenant_id, from_page=0, to_page=100000,
          lang="Chinese", callback=None, **kwargs):
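    """Chunk a document for knowledge-graph indexing.

    The file is split into sections, an entity/relationship graph is built
    from them, and the raw sections are appended as plain-text chunks so
    both the graph and the original text end up in the index.
    """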
    parser_config = kwargs.get(
        "parser_config", {
            "chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": True})
    eng = lang.lower() == "english"

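    # Layout recognition is always enabled here so that section extraction
    # sees the document's structure, regardless of the caller's parser_config.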
    parser_config["layout_recognize"] = True
    sections = naive.chunk(filename, binary, from_page=from_page, to_page=to_page, section_only=True,
                           parser_config=parser_config, callback=callback)
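    # Extract entities/relations from the sections into graph chunks; the
    # entity types can be overridden via parser_config["entity_types"].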
    chunks = build_knowledge_graph_chunks(
        tenant_id, sections, callback,
        parser_config.get("entity_types", ["organization", "person", "location", "event", "time"]))
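    # Tag every graph chunk with its source document name.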
    for c in chunks:
        c["docnm_kwd"] = filename

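    # Base fields shared by the plain-text chunks: document name, tokenized
    # title (extension stripped), and a marker separating text from graph.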
    doc = {
        "docnm_kwd": filename,
        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)),
        "knowledge_graph_kwd": "text"
    }
    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
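    # Keep the raw sections searchable as ordinary text chunks too.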
    chunks.extend(tokenize_chunks(sections, doc, eng))

    return chunks
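

if __name__ == "__main__":
    # Minimal smoke-test sketch, not part of the module's API. It assumes a
    # readable file path and a valid tenant id are passed on the command
    # line, and that a no-op progress callback is acceptable.
    import sys

    def dummy(prog=None, msg=""):
        pass

    chunk(sys.argv[1], None, sys.argv[2], callback=dummy)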