|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from rag.nlp import find_codec |
|
import readability |
|
import html_text |
|
import chardet |
|
|
|
|
|
def get_encoding(file):
    """Guess the character encoding of a file using chardet.

    The file is opened in binary mode, read in full, and the bytes are
    handed to ``chardet.detect``.

    Args:
        file: Path of the file to inspect.

    Returns:
        The ``"encoding"`` entry of chardet's detection result.
    """
    with open(file, "rb") as raw:
        detection = chardet.detect(raw.read())
    return detection["encoding"]
|
|
|
|
|
class RAGFlowHtmlParser:
    """Callable that turns an HTML document into a list of text lines."""

    def __call__(self, fnm, binary=None):
        """Load HTML from raw bytes or from a file and parse it.

        Args:
            fnm: Path of the HTML file; only read when ``binary`` is falsy.
            binary: Optional raw bytes of the document. When truthy it is
                decoded with the codec chosen by ``find_codec`` (undecodable
                bytes are ignored) and ``fnm`` is not opened.

        Returns:
            The list of lines produced by ``parser_txt``.
        """
        if not binary:
            # No in-memory payload: read the file using the detected encoding.
            with open(fnm, "r", encoding=get_encoding(fnm)) as handle:
                html = handle.read()
            return self.parser_txt(html)

        codec = find_codec(binary)
        return self.parser_txt(binary.decode(codec, errors="ignore"))

    @classmethod
    def parser_txt(cls, txt):
        """Extract the title and main text of an HTML string as lines.

        Args:
            txt: The HTML document as a ``str``.

        Returns:
            A list of strings: the title reported by ``readability.Document``
            followed by the text extracted from its partial-HTML summary,
            split on newlines.

        Raises:
            TypeError: If ``txt`` is not a ``str``.
        """
        if not isinstance(txt, str):
            raise TypeError("txt type should be str!")

        document = readability.Document(txt)
        heading = document.title()
        body = html_text.extract_text(document.summary(html_partial=True))
        # Title goes first, then the body text, one entry per line.
        return f"{heading}\n{body}".split("\n")
|
|