File size: 5,609 Bytes
cdba7f7 072f9dd ee82924 072f9dd e31db28 41c7a59 d54aa01 cfd6ece b085dec 51482f3 072f9dd cdba7f7 072f9dd e7e8c6b 072f9dd 51482f3 072f9dd 51482f3 79ada0b 51482f3 072f9dd cdba7f7 e7e8c6b 072f9dd 54ec234 072f9dd b83edb4 279ca43 072f9dd cdba7f7 b83edb4 3cefaa0 79ada0b 51482f3 072f9dd e6acaf6 072f9dd 79ada0b 072f9dd 79ada0b a8294f2 072f9dd cfd6ece 072f9dd cfd6ece 072f9dd ae35e13 e6acaf6 072f9dd e6acaf6 b085dec 6224edc 79ada0b b085dec 6224edc e6acaf6 072f9dd 79ada0b d54aa01 b5b25b4 072f9dd 79ada0b 072f9dd 407b252 072f9dd e6acaf6 ee82924 79ada0b ee82924 072f9dd 79ada0b 072f9dd 51482f3 072f9dd 51482f3 072f9dd 1daa4bd 79ada0b b085dec 79ada0b 072f9dd 79ada0b 279ca43 51482f3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 |
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import copy
from tika import parser
import re
from io import BytesIO
from docx import Document
from api.db import ParserType
from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \
make_colon_as_title, add_positions, tokenize_chunks, find_codec
from rag.nlp import rag_tokenizer
from deepdoc.parser import PdfParser, DocxParser, PlainParser
from rag.settings import cron_logger
class Docx(DocxParser):
    """Extract non-empty paragraph texts from a .docx file, limited to a page window."""

    def __init__(self):
        # No setup is performed here; the parent initializer is not invoked.
        pass

    def __clean(self, line):
        """Replace ideographic spaces (U+3000) with ASCII spaces and trim the line."""
        return line.replace("\u3000", " ").strip()

    def __call__(self, filename, binary=None, from_page=0, to_page=100000):
        """
        Return the cleaned text of every non-blank paragraph whose page index
        lies in ``[from_page, to_page)``.

        The document is loaded from ``binary`` (raw bytes) when it is given,
        otherwise from the path ``filename``. The page index is advanced by
        scanning each run's XML for page-break markers.
        """
        if binary:
            self.doc = Document(BytesIO(binary))
        else:
            self.doc = Document(filename)

        page_no = 0
        collected = []
        for para in self.doc.paragraphs:
            if page_no > to_page:
                break
            if para.text.strip() and from_page <= page_no < to_page:
                collected.append(self.__clean(para.text))
            # The paragraph is attributed to the page it starts on; breaks
            # found inside its runs only affect the paragraphs that follow.
            for run in para.runs:
                run_xml = run._element.xml
                rendered_break = 'lastRenderedPageBreak' in run_xml
                explicit_break = 'w:br' in run_xml and 'type="page"' in run_xml
                # A run counts as at most one page break, even if both
                # markers are present.
                if rendered_break or explicit_break:
                    page_no += 1
        return [text for text in collected if text]
class Pdf(PdfParser):
    """PDF parser for the "laws" parser type: OCR, layout recognition, then a vertical merge."""

    def __init__(self):
        # Set before calling the parent initializer.
        # NOTE(review): the attribute spelling ("model_speciess") is kept
        # as-is; presumably the parent class looks it up by this exact
        # name - verify against PdfParser before renaming.
        self.model_speciess = ParserType.LAWS.value
        super().__init__()

    def __call__(self, filename, binary=None, from_page=0,
                 to_page=100000, zoomin=3, callback=None):
        """
        Run OCR and layout analysis on a PDF and return its text boxes.

        Args:
            filename: Path of the PDF; used when ``binary`` is falsy.
            binary: Raw PDF bytes; takes precedence over ``filename``.
            from_page: First page passed to the image/OCR step.
            to_page: Page bound passed to the image/OCR step.
            zoomin: Zoom factor forwarded to the OCR, layout and tagging steps.
            callback: Optional progress reporter called as
                ``callback(prog, msg)`` or ``callback(msg=...)``. When
                omitted, progress reports are discarded.

        Returns:
            A tuple ``(sections, None)`` where ``sections`` is a list of
            ``(text, position_tag)`` pairs, one per entry in ``self.boxes``.
        """
        from timeit import default_timer as timer

        if callback is None:
            # The callback is invoked unconditionally below, so the default
            # of None must be replaced by a no-op to avoid a TypeError.
            def callback(prog=None, msg=""):
                pass

        callback(msg="OCR is running...")
        self.__images__(
            filename if not binary else binary,
            zoomin,
            from_page,
            to_page,
            callback
        )
        callback(msg="OCR finished")

        start = timer()
        self._layouts_rec(zoomin)
        callback(0.67, "Layout analysis finished")
        # Average layout time per page; the +0.1 avoids division by zero
        # when no pages were loaded. The format string previously had no
        # placeholder, so the measured value never reached the log.
        cron_logger.info("layouts: {}".format(
            (timer() - start) / (self.total_page + 0.1)))
        self._naive_vertical_merge()

        callback(0.8, "Text extraction finished")

        return [(b["text"], self._line_tag(b, zoomin))
                for b in self.boxes], None
def chunk(filename, binary=None, from_page=0, to_page=100000,
          lang="Chinese", callback=None, **kwargs):
    """
    Split a document into hierarchically merged, tokenized chunks.

    Supported file formats are doc, docx, pdf, txt.

    Args:
        filename: Name (or path) of the document; its extension selects the
            parser and its stem is tokenized as the title.
        binary: Raw file content. When falsy, the file is read from
            ``filename`` where the parser supports it.
        from_page: First page to parse (forwarded to the PDF parser only).
        to_page: Page bound (forwarded to the PDF parser only).
        lang: Document language; ``"english"`` (case-insensitive) enables
            English handling downstream.
        callback: Optional progress reporter called as
            ``callback(prog, msg)``. When omitted, progress is discarded.
        **kwargs: May contain ``parser_config``; its ``layout_recognize``
            flag (default ``True``) chooses between ``Pdf`` and
            ``PlainParser`` for PDF input.

    Returns:
        The result of ``tokenize_chunks`` over the merged sections.

    Raises:
        NotImplementedError: If the file extension is not supported.
    """
    if callback is None:
        # The callback is invoked unconditionally below, so the default of
        # None must be replaced by a no-op to avoid a TypeError.
        def callback(prog=None, msg=""):
            pass

    doc = {
        "docnm_kwd": filename,
        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
    }
    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
    pdf_parser = None
    sections = []
    if re.search(r"\.docx$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        sections.extend(Docx()(filename, binary))
        callback(0.8, "Finish parsing.")

    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
        pdf_parser = Pdf() if kwargs.get(
            "parser_config", {}).get(
            "layout_recognize", True) else PlainParser()
        for txt, poss in pdf_parser(filename if not binary else binary,
                                    from_page=from_page, to_page=to_page, callback=callback)[0]:
            # Position tags are appended to the text of each section.
            sections.append(txt + poss)

    elif re.search(r"\.txt$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        if binary:
            encoding = find_codec(binary)
            txt = binary.decode(encoding, errors="ignore")
        else:
            with open(filename, "r") as f:
                txt = f.read()
        sections = [l for l in txt.split("\n") if l]
        callback(0.8, "Finish parsing.")

    elif re.search(r"\.doc$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        if not binary:
            # Without in-memory content, read the file from disk rather
            # than handing Tika an empty buffer.
            with open(filename, "rb") as f:
                binary = f.read()
        doc_parsed = parser.from_buffer(BytesIO(binary))
        # Tika reports no extractable text as None; treat that as empty.
        content = doc_parsed.get('content') or ""
        sections = [l for l in content.split('\n') if l]
        callback(0.8, "Finish parsing.")

    else:
        raise NotImplementedError(
            "file type not supported yet(doc, docx, pdf, txt supported)")

    # is it English
    eng = lang.lower() == "english"  # is_english(sections)
    # Remove 'Contents' part
    remove_contents_table(sections, eng)

    make_colon_as_title(sections)
    bull = bullets_category(sections)
    chunks = hierarchical_merge(bull, sections, 5)
    if not chunks:
        callback(0.99, "No chunk parsed out.")

    return tokenize_chunks(["\n".join(ck)
                            for ck in chunks], doc, eng, pdf_parser)
if __name__ == "__main__":
    # Command-line entry point: chunk the file named by the first argument,
    # discarding all progress reports.
    import sys

    def dummy(prog=None, msg=""):
        """Progress callback that ignores every report."""
        return None

    input_path = sys.argv[1]
    chunk(input_path, callback=dummy)
|