|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import logging |
|
from email import policy |
|
from email.parser import BytesParser |
|
from rag.app.naive import chunk as naive_chunk |
|
import re |
|
from rag.nlp import rag_tokenizer, naive_merge, tokenize_chunks |
|
from deepdoc.parser import HtmlParser, TxtParser |
|
from timeit import default_timer as timer |
|
import io |
|
|
|
|
|
def chunk(
    filename,
    binary=None,
    from_page=0,
    to_page=100000,
    lang="Chinese",
    callback=None,
    **kwargs,
):
    """
    Chunk an .eml e-mail into indexable text pieces. Only eml is supported.

    Headers plus text/plain and text/html bodies are merged into
    token-bounded chunks; attachments are delegated to the naive chunker.

    Args:
        filename: Path of the .eml file; also the source of title tokens.
        binary: Raw message bytes; when falsy the file is read from disk.
        from_page, to_page: Accepted for interface parity; unused for eml.
        lang: "English" (case-insensitive) enables English tokenization.
        callback: Progress callback forwarded to the attachment chunker.
        **kwargs: Extra options; "parser_config" may override chunk size
            and delimiters.

    Returns:
        List of tokenized chunk dicts: body chunks followed by chunks
        produced from each attachment.
    """
    eng = lang.lower() == "english"
    parser_config = kwargs.get(
        "parser_config",
        {"chunk_token_num": 128, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"},
    )
    doc = {
        "docnm_kwd": filename,
        # Strip the file extension before tokenizing the title.
        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)),
    }
    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
    main_res = []
    attachment_res = []

    if binary:
        msg = BytesParser(policy=policy.default).parse(io.BytesIO(binary))
    else:
        # Context manager closes the handle; the original leaked a bare open().
        with open(filename, "rb") as fp:
            msg = BytesParser(policy=policy.default).parse(fp)

    text_txt, html_txt = [], []

    # Every header line ("From: ...", "Subject: ...") becomes plain text.
    for header, value in msg.items():
        text_txt.append(f"{header}: {value}")

    def _add_content(part, content_type):
        # Recursively collect text/plain and text/html payloads.
        # get_content_charset() can be None when no charset header exists;
        # fall back to utf-8 and replace undecodable bytes rather than crash.
        if content_type == "text/plain":
            text_txt.append(
                part.get_payload(decode=True).decode(
                    part.get_content_charset() or "utf-8", errors="replace"
                )
            )
        elif content_type == "text/html":
            html_txt.append(
                part.get_payload(decode=True).decode(
                    part.get_content_charset() or "utf-8", errors="replace"
                )
            )
        elif "multipart" in content_type and part.is_multipart():
            for sub in part.iter_parts():
                _add_content(sub, sub.get_content_type())

    _add_content(msg, msg.get_content_type())

    sections = TxtParser.parser_txt("\n".join(text_txt)) + [
        (line, "") for line in HtmlParser.parser_txt("\n".join(html_txt)) if line
    ]

    st = timer()
    chunks = naive_merge(
        sections,
        int(parser_config.get("chunk_token_num", 128)),
        parser_config.get("delimiter", "\n!?。;!?"),
    )
    main_res.extend(tokenize_chunks(chunks, doc, eng, None))
    logging.debug("naive_merge({}): {}".format(filename, timer() - st))

    # Best-effort pass over attachments: each is chunked with the generic
    # (naive) parser; a single bad attachment must not fail the whole e-mail.
    for part in msg.iter_attachments():
        content_disposition = part.get("Content-Disposition")
        if not content_disposition:
            continue
        dispositions = content_disposition.strip().split(";")
        if dispositions[0].lower() != "attachment":
            continue
        # Use a distinct name: the original rebound the `filename` parameter.
        att_name = part.get_filename()
        payload = part.get_payload(decode=True)
        try:
            attachment_res.extend(
                naive_chunk(att_name, payload, callback=callback, **kwargs)
            )
        except Exception:
            # Keep the best-effort semantics, but log instead of silently
            # swallowing the failure (the original used `except: pass`).
            logging.exception(
                "Failed to chunk attachment %s of %s", att_name, filename
            )

    return main_res + attachment_res
|
|
|
|
|
if __name__ == "__main__":
    import sys

    def _noop_callback(prog=None, msg=""):
        """Progress callback that silently discards all updates."""

    chunk(sys.argv[1], callback=_noop_callback)
|
|