import logging
import re
import csv
from copy import deepcopy
from io import BytesIO
from timeit import default_timer as timer

from openpyxl import load_workbook

from deepdoc.parser.utils import get_text
from rag.nlp import is_english, random_choices, qbullets_category, add_positions, has_qbullet, docx_question_level
from rag.nlp import rag_tokenizer, tokenize_table, concat_img
from deepdoc.parser import PdfParser, ExcelParser, DocxParser
from docx import Document
from PIL import Image
from markdown import markdown


class Excel(ExcelParser):
    def __call__(self, fnm, binary=None, callback=None):
        if not binary:
            wb = load_workbook(fnm)
        else:
            wb = load_workbook(BytesIO(binary))
        total = 0
        for sheetname in wb.sheetnames:
            total += len(list(wb[sheetname].rows))

        res, fails = [], []
        for sheetname in wb.sheetnames:
            ws = wb[sheetname]
            rows = list(ws.rows)
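            # Take the first two non-empty cells of each row as the
            # (question, answer) pair; rows missing either one are
            # recorded as failures.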
            for i, r in enumerate(rows):
                q, a = "", ""
                for cell in r:
                    if not cell.value:
                        continue
                    if not q:
                        q = str(cell.value)
                    elif not a:
                        a = str(cell.value)
                    else:
                        break
                if q and a:
                    res.append((q, a))
                else:
                    fails.append(str(i + 1))
                if len(res) % 999 == 0:
                    callback(len(res) * 0.6 / total,
                             "Extract pairs: {}. ".format(len(res)) +
                             (f"{len(fails)} failure(s), line: {','.join(fails[:3])}..." if fails else ""))

        callback(0.6, "Extract pairs: {}. ".format(len(res)) +
                 (f"{len(fails)} failure(s), line: {','.join(fails[:3])}..." if fails else ""))
        self.is_english = is_english(
            [rmPrefix(q) for q, _ in random_choices(res, k=30) if len(q) > 1])
        return res


class Pdf(PdfParser):
    def __call__(self, filename, binary=None, from_page=0,
                 to_page=100000, zoomin=3, callback=None):
        start = timer()
        callback(msg="OCR started")
        self.__images__(
            filename if not binary else binary,
            zoomin,
            from_page,
            to_page,
            callback
        )
        callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
        logging.debug("OCR({}~{}): {:.2f}s".format(from_page, to_page, timer() - start))
        start = timer()
        self._layouts_rec(zoomin, drop=False)
        callback(0.63, "Layout analysis ({:.2f}s)".format(timer() - start))

        start = timer()
        self._table_transformer_job(zoomin)
        callback(0.65, "Table analysis ({:.2f}s)".format(timer() - start))

        start = timer()
        self._text_merge()
        callback(0.67, "Text merged ({:.2f}s)".format(timer() - start))
        tbls = self._extract_table_figure(True, zoomin, True, True)

        logging.debug("layouts: {}".format(timer() - start))
        sections = [b["text"] for b in self.boxes]
        bull_x0_list = []
        q_bull, reg = qbullets_category(sections)
        if q_bull == -1:
            raise ValueError("Unable to recognize Q&A structure.")
        qai_list = []
        last_q, last_a, last_tag = '', '', ''
        last_index = -1
        last_box = {'text': ''}
        last_bull = None

        def sort_key(element):
            tbls_pn = element[1][0][0]
            tbls_top = element[1][0][3]
            return tbls_pn, tbls_top

        tbls.sort(key=sort_key)
        tbl_index = 0
        last_pn, last_bottom = 0, 0
        tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag, tbl_text = 1, 0, 0, 0, 0, '@@0\t0\t0\t0\t0##', ''
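        # Walk the OCR boxes in reading order; whenever a table's
        # (page, top) position precedes the current line, splice its text
        # and position tag into the answer being accumulated.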
        for box in self.boxes:
            section, line_tag = box['text'], self._line_tag(box, zoomin)
            has_bull, index = has_qbullet(reg, box, last_box, last_index, last_bull, bull_x0_list)
            last_box, last_index, last_bull = box, index, has_bull
            line_pn = float(line_tag.lstrip('@@').split('\t')[0])
            line_top = float(line_tag.rstrip('##').split('\t')[3])
            tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag, tbl_text = self.get_tbls_info(tbls, tbl_index)
            if not has_bull:
                if not last_q:
                    if tbl_pn < line_pn or (tbl_pn == line_pn and tbl_top <= line_top):
                        tbl_index += 1
                    continue
                else:
                    sum_tag = line_tag
                    sum_section = section
                    while ((tbl_pn == last_pn and tbl_top >= last_bottom) or (tbl_pn > last_pn)) \
                            and ((tbl_pn == line_pn and tbl_top <= line_top) or (tbl_pn < line_pn)):
                        sum_tag = f'{tbl_tag}{sum_tag}'
                        sum_section = f'{tbl_text}{sum_section}'
                        tbl_index += 1
                        tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag, tbl_text = self.get_tbls_info(tbls, tbl_index)
                    last_a = f'{last_a}{sum_section}'
                    last_tag = f'{last_tag}{sum_tag}'
            else:
                if last_q:
                    while ((tbl_pn == last_pn and tbl_top >= last_bottom) or (tbl_pn > last_pn)) \
                            and ((tbl_pn == line_pn and tbl_top <= line_top) or (tbl_pn < line_pn)):
                        last_tag = f'{last_tag}{tbl_tag}'
                        last_a = f'{last_a}{tbl_text}'
                        tbl_index += 1
                        tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag, tbl_text = self.get_tbls_info(tbls, tbl_index)
                    image, poss = self.crop(last_tag, need_position=True)
                    qai_list.append((last_q, last_a, image, poss))
                    last_q, last_a, last_tag = '', '', ''
                last_q = has_bull.group()
                _, end = has_bull.span()
                last_a = section[end:]
                last_tag = line_tag
            last_bottom = float(line_tag.rstrip('##').split('\t')[4])
            last_pn = line_pn
        if last_q:
            qai_list.append((last_q, last_a, *self.crop(last_tag, need_position=True)))
        return qai_list, tbls

    def get_tbls_info(self, tbls, tbl_index):
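        # Return the next table's page number, bbox, position tag (same
        # "@@page\tleft\tright\ttop\tbottom##" format as _line_tag) and its
        # joined text; a sentinel tuple is returned once all tables are used.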
        if tbl_index >= len(tbls):
            return 1, 0, 0, 0, 0, '@@0\t0\t0\t0\t0##', ''
        tbl_pn = tbls[tbl_index][1][0][0] + 1
        tbl_left = tbls[tbl_index][1][0][1]
        tbl_right = tbls[tbl_index][1][0][2]
        tbl_top = tbls[tbl_index][1][0][3]
        tbl_bottom = tbls[tbl_index][1][0][4]
        tbl_tag = "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
            .format(tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom)
        tbl_text = ''.join(tbls[tbl_index][0][1])
        return tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag, tbl_text


class Docx(DocxParser):
    def __init__(self):
        pass

    def get_picture(self, document, paragraph):
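        # Extract the first inline image of a paragraph by resolving the
        # drawing's relationship id to the embedded image part.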
        img = paragraph._element.xpath('.//pic:pic')
        if not img:
            return None
        img = img[0]
        embed = img.xpath('.//a:blip/@r:embed')[0]
        related_part = document.part.related_parts[embed]
        image = related_part.image
        image = Image.open(BytesIO(image.blob)).convert('RGB')
        return image

    def __call__(self, filename, binary=None, from_page=0, to_page=100000, callback=None):
        self.doc = Document(
            filename) if not binary else Document(BytesIO(binary))
        pn = 0
        last_answer, last_image = "", None
        question_stack, level_stack = [], []
        qai_list = []
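        # Heading paragraphs (levels 1-6) are kept on a stack so nested
        # headings join into one question; non-heading paragraphs and their
        # images accumulate into the current answer.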
        for p in self.doc.paragraphs:
            if pn > to_page:
                break
            question_level, p_text = 0, ''
            if from_page <= pn < to_page and p.text.strip():
                question_level, p_text = docx_question_level(p)
            if not question_level or question_level > 6:
                last_answer = f'{last_answer}\n{p_text}'
                current_image = self.get_picture(self.doc, p)
                last_image = concat_img(last_image, current_image)
            else:
                if last_answer or last_image:
                    sum_question = '\n'.join(question_stack)
                    if sum_question:
                        qai_list.append((sum_question, last_answer, last_image))
                    last_answer, last_image = '', None

                i = question_level
                while question_stack and i <= level_stack[-1]:
                    question_stack.pop()
                    level_stack.pop()
                question_stack.append(p_text)
                level_stack.append(question_level)
            for run in p.runs:
                if 'lastRenderedPageBreak' in run._element.xml:
                    pn += 1
                    continue
                if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
                    pn += 1
        if last_answer:
            sum_question = '\n'.join(question_stack)
            if sum_question:
                qai_list.append((sum_question, last_answer, last_image))

        tbls = []
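        # python-docx repeats a merged cell's text once per spanned column;
        # collapse runs of identical adjacent cells into one <td colspan=...>.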
        for tb in self.doc.tables:
            html = "<table>"
            for r in tb.rows:
                html += "<tr>"
                i = 0
                while i < len(r.cells):
                    span = 1
                    c = r.cells[i]
                    for j in range(i + 1, len(r.cells)):
                        if c.text == r.cells[j].text:
                            span += 1
                            i = j
                        else:
                            break
                    i += 1
                    html += f"<td>{c.text}</td>" if span == 1 else f"<td colspan='{span}'>{c.text}</td>"
                html += "</tr>"
            html += "</table>"
            tbls.append(((None, html), ""))
        return qai_list, tbls


def rmPrefix(txt):
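    # Strip a leading "Question:"/"Answer:"-style prefix (English or
    # Chinese) so the prefix itself is not tokenized and indexed.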
    return re.sub(
        r"^(问题|答案|回答|user|assistant|Q|A|Question|Answer|问|答)[\t:: ]+", "", txt.strip(), flags=re.IGNORECASE)


def beAdocPdf(d, q, a, eng, image, poss):
    qprefix = "Question: " if eng else "问题:"
    aprefix = "Answer: " if eng else "回答:"
    d["content_with_weight"] = "\t".join(
        [qprefix + rmPrefix(q), aprefix + rmPrefix(a)])
    d["content_ltks"] = rag_tokenizer.tokenize(q)
    d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
    d["image"] = image
    add_positions(d, poss)
    return d


def beAdocDocx(d, q, a, eng, image, row_num=-1):
    qprefix = "Question: " if eng else "问题:"
    aprefix = "Answer: " if eng else "回答:"
    d["content_with_weight"] = "\t".join(
        [qprefix + rmPrefix(q), aprefix + rmPrefix(a)])
    d["content_ltks"] = rag_tokenizer.tokenize(q)
    d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
    d["image"] = image
    if row_num >= 0:
        d["top_int"] = [row_num]
    return d


def beAdoc(d, q, a, eng, row_num=-1):
    qprefix = "Question: " if eng else "问题:"
    aprefix = "Answer: " if eng else "回答:"
    d["content_with_weight"] = "\t".join(
        [qprefix + rmPrefix(q), aprefix + rmPrefix(a)])
    d["content_ltks"] = rag_tokenizer.tokenize(q)
    d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
    if row_num >= 0:
        d["top_int"] = [row_num]
    return d


def mdQuestionLevel(s):
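    # A markdown heading's level is its count of leading '#' characters;
    # returns (level, heading text without the '#' prefix).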
    match = re.match(r'#*', s)
    return (len(match.group(0)), s.lstrip('#').lstrip()) if match else (0, s)


def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
    """
    Excel and csv(txt) format files are supported.
    If the file is in Excel format, it should contain two columns with no
    header: a question column followed by an answer column. Multiple sheets
    are fine as long as the columns are composed correctly.

    If it is in csv format, it should be UTF-8 encoded, with TAB used as the
    delimiter to separate question and answer.

    All malformed lines are ignored.
    Every Q&A pair is treated as a chunk.

    Pdf, markdown and docx files are also supported; their questions are
    recognized from bullet/heading structure.
    """
    eng = lang.lower() == "english"
    res = []
    doc = {
        "docnm_kwd": filename,
        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
    }
    if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        excel_parser = Excel()
        for ii, (q, a) in enumerate(excel_parser(filename, binary, callback)):
            res.append(beAdoc(deepcopy(doc), q, a, eng, ii))
        return res

    elif re.search(r"\.(txt)$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        txt = get_text(filename, binary)
        lines = txt.split("\n")
        comma, tab = 0, 0
        for line in lines:
            if len(line.split(",")) == 2:
                comma += 1
            if len(line.split("\t")) == 2:
                tab += 1
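        # Prefer TAB when at least as many lines split cleanly on TAB as
        # on comma; otherwise fall back to comma.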
        delimiter = "\t" if tab >= comma else ","

        fails = []
        question, answer = "", ""
        i = 0
        while i < len(lines):
            arr = lines[i].split(delimiter)
            if len(arr) != 2:
                if question:
                    answer += "\n" + lines[i]
                else:
                    fails.append(str(i + 1))
            else:
                if question and answer:
                    res.append(beAdoc(deepcopy(doc), question, answer, eng, i))
                question, answer = arr
            i += 1
            if len(res) % 999 == 0:
                callback(len(res) * 0.6 / len(lines),
                         "Extract Q&A: {}. ".format(len(res)) +
                         (f"{len(fails)} failure(s), line: {','.join(fails[:3])}..." if fails else ""))

        if question:
            res.append(beAdoc(deepcopy(doc), question, answer, eng, len(lines)))

        callback(0.6, "Extract Q&A: {}. ".format(len(res)) +
                 (f"{len(fails)} failure(s), line: {','.join(fails[:3])}..." if fails else ""))

        return res

    elif re.search(r"\.(csv)$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        txt = get_text(filename, binary)
        lines = txt.split("\n")
        delimiter = "\t" if any("\t" in line for line in lines) else ","

        fails = []
        question, answer = "", ""
        res = []
        reader = csv.reader(lines, delimiter=delimiter)

        for i, row in enumerate(reader):
            if len(row) != 2:
                if question:
                    answer += "\n" + lines[i]
                else:
                    fails.append(str(i + 1))
            else:
                if question and answer:
                    res.append(beAdoc(deepcopy(doc), question, answer, eng, i))
                question, answer = row
            if len(res) % 999 == 0:
                callback(len(res) * 0.6 / len(lines),
                         "Extract Q&A: {}. ".format(len(res)) +
                         (f"{len(fails)} failure(s), line: {','.join(fails[:3])}..." if fails else ""))

        if question:
            res.append(beAdoc(deepcopy(doc), question, answer, eng, len(lines)))

        callback(0.6, "Extract Q&A: {}. ".format(len(res)) +
                 (f"{len(fails)} failure(s), line: {','.join(fails[:3])}..." if fails else ""))
        return res

    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        pdf_parser = Pdf()
        qai_list, tbls = pdf_parser(filename if not binary else binary,
                                    from_page=0, to_page=10000, callback=callback)
        for q, a, image, poss in qai_list:
            res.append(beAdocPdf(deepcopy(doc), q, a, eng, image, poss))
        return res

    elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        txt = get_text(filename, binary)
        lines = txt.split("\n")
        last_answer = ""
        question_stack, level_stack = [], []
        code_block = False
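        # Track fenced code blocks so that '#' inside them is not mistaken
        # for a heading.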
        for index, line in enumerate(lines):
            if line.strip().startswith('```'):
                code_block = not code_block
            question_level, question = 0, ''
            if not code_block:
                question_level, question = mdQuestionLevel(line)

            if not question_level or question_level > 6:
                last_answer = f'{last_answer}\n{line}'
            else:
                if last_answer.strip():
                    sum_question = '\n'.join(question_stack)
                    if sum_question:
                        res.append(beAdoc(deepcopy(doc), sum_question, markdown(last_answer, extensions=['markdown.extensions.tables']), eng, index))
                    last_answer = ''

                i = question_level
                while question_stack and i <= level_stack[-1]:
                    question_stack.pop()
                    level_stack.pop()
                question_stack.append(question)
                level_stack.append(question_level)
        if last_answer.strip():
            sum_question = '\n'.join(question_stack)
            if sum_question:
                res.append(beAdoc(deepcopy(doc), sum_question, markdown(last_answer, extensions=['markdown.extensions.tables']), eng, index))
        return res

    elif re.search(r"\.docx$", filename, re.IGNORECASE):
        docx_parser = Docx()
        qai_list, tbls = docx_parser(filename, binary,
                                     from_page=0, to_page=10000, callback=callback)
        res = tokenize_table(tbls, doc, eng)
        for i, (q, a, image) in enumerate(qai_list):
            res.append(beAdocDocx(deepcopy(doc), q, a, eng, image, i))
        return res

    raise NotImplementedError(
        "Excel, csv(txt), pdf, markdown and docx format files are supported.")


if __name__ == "__main__":
    import sys

    def dummy(prog=None, msg=""):
        pass

    chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)