| """Loader that loads image files.""" | |
| from typing import List | |
| from langchain.document_loaders.unstructured import UnstructuredFileLoader | |
| from paddleocr import PaddleOCR | |
| import os | |
| import fitz | |
| import nltk | |
| from configs.model_config import NLTK_DATA_PATH | |
| nltk.data.path = [NLTK_DATA_PATH] + nltk.data.path | |
class UnstructuredPaddlePDFLoader(UnstructuredFileLoader):
    """Loader that turns a PDF into unstructured elements, OCR-ing embedded images.

    The text layer of every page is extracted with PyMuPDF (``fitz``); every
    image embedded in a page is additionally run through PaddleOCR. All text is
    collected in a ``.txt`` file next to the PDF, which is then partitioned
    with ``unstructured``.
    """

    def _get_elements(self) -> List:
        """Extract the PDF's text to a file and partition it.

        Returns:
            The elements produced by ``partition_text`` for the generated
            text file, using ``self.unstructured_kwargs`` as extra options.
        """

        def pdf_ocr_txt(filepath, dir_path="tmp_files"):
            """Write page text and OCR-ed image text of ``filepath`` to a text file.

            Args:
                filepath: Path of the PDF to process.
                dir_path: Name of the sub-directory (created beside the PDF)
                    that receives the text file and the temporary image.

            Returns:
                Path of the written ``<pdf file name>.txt`` file.
            """
            full_dir_path = os.path.join(os.path.dirname(filepath), dir_path)
            # exist_ok avoids the check-then-create race of exists()/makedirs().
            os.makedirs(full_dir_path, exist_ok=True)
            ocr = PaddleOCR(use_angle_cls=True, lang="ch", use_gpu=False, show_log=False)
            txt_file_path = os.path.join(full_dir_path, f"{os.path.split(filepath)[-1]}.txt")
            # Scratch file reused for every embedded image handed to the OCR engine.
            img_name = os.path.join(full_dir_path, 'tmp.png')
            try:
                # The context manager closes the document even when a page fails.
                with fitz.open(filepath) as doc, \
                        open(txt_file_path, 'w', encoding='utf-8') as fout:
                    for page_index in range(doc.page_count):
                        page = doc[page_index]
                        fout.write(page.get_text(""))
                        fout.write("\n")
                        for img in page.get_images():
                            # img[0] is the xref of the embedded image.
                            pix = fitz.Pixmap(doc, img[0])
                            if pix.n - pix.alpha >= 4:
                                # Four or more colour components (e.g. CMYK):
                                # convert to RGB before saving as PNG.
                                pix = fitz.Pixmap(fitz.csRGB, pix)
                            pix.save(img_name)
                            result = ocr.ocr(img_name)
                            # The OCR result (or one of its entries) can be
                            # None/empty when no text is detected; skip those
                            # instead of failing while iterating over None.
                            ocr_result = [
                                box[1][0]
                                for line in (result or [])
                                if line
                                for box in line
                            ]
                            if ocr_result:
                                fout.write("\n".join(ocr_result))
                                # Terminate the OCR text so it does not fuse
                                # with whatever is written next.
                                fout.write("\n")
            finally:
                # Remove the scratch image even when extraction or OCR raised.
                if os.path.exists(img_name):
                    os.remove(img_name)
            return txt_file_path

        txt_file_path = pdf_ocr_txt(self.file_path)
        # Imported lazily so that `unstructured` is only required when loading.
        from unstructured.partition.text import partition_text
        return partition_text(filename=txt_file_path, **self.unstructured_kwargs)
if __name__ == "__main__":
    # Manual smoke test: load the sample PDF and print every resulting document.
    import sys

    # Directory one level above the one containing this file.
    parent_dir = os.path.dirname(os.path.dirname(__file__))
    sys.path.append(parent_dir)

    sample_pdf = os.path.join(
        parent_dir, "knowledge_base", "samples", "content", "test.pdf"
    )
    pdf_loader = UnstructuredPaddlePDFLoader(sample_pdf, mode="elements")
    for document in pdf_loader.load():
        print(document)