Spaces:
Sleeping
Sleeping
| from typing import List, Union, Optional | |
| import json | |
| from indexify_extractor_sdk import Content, Extractor, Feature | |
| from pydantic import BaseModel, Field | |
| from .utils.tt_module import get_tables | |
| import fitz | |
| import tempfile | |
class PDFExtractorConfig(BaseModel):
    """Options selecting which kinds of output the PDF extractor produces."""

    # Output kinds to emit. The extractor checks this list for the strings
    # "text", "image" and "table"; by default all three are enabled.
    # A factory is used so each config instance gets its own list.
    output_types: List[str] = Field(
        default_factory=lambda: ["text", "image", "table"],
    )
class PDFExtractor(Extractor):
    """Extract page text, embedded images, and tables from a PDF document."""

    name = "tensorlake/pdf-extractor"
    description = "PDF Extractor for Texts, Images & Tables"
    system_dependencies = ["poppler-utils"]
    input_mime_types = ["application/pdf"]

    def __init__(self):
        super().__init__()

    def extract(self, content: Content, params: PDFExtractorConfig) -> List[Union[Feature, Content]]:
        """Split a PDF into text, image, and table ``Content`` items.

        Args:
            content: Input whose ``data`` attribute holds the raw PDF bytes.
            params: Configuration; ``params.output_types`` selects which of
                "text", "image" and "table" are produced.

        Returns:
            A list of ``Content`` objects, each carrying a metadata feature
            with its ``type`` and 1-based ``page`` number. Per-page text and
            images come first (in page order), followed by tables.
        """
        pdf_bytes = content.data
        wanted = params.output_types
        contents = []

        # Open the document straight from memory. The previous implementation
        # wrote the bytes to a NamedTemporaryFile(delete=False) that was never
        # removed, leaking one file per call.
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
        try:
            for page_number, page in enumerate(doc, start=1):
                if "text" in wanted:
                    feature = Feature.metadata(value={"type": "text", "page": page_number})
                    contents.append(Content.from_text(page.get_text(), features=[feature]))

                if "image" in wanted:
                    for img in page.get_images():
                        xref = img[0]  # first entry of the image tuple is its xref
                        pix = fitz.Pixmap(doc, xref)
                        # Convert anything that is not already gray or RGB to RGB
                        # before serializing.
                        # NOTE(review): pix.colorspace may be None for some images
                        # (e.g. masks), which would raise here - confirm whether
                        # such images need to be skipped.
                        if pix.colorspace.name not in (fitz.csGRAY.name, fitz.csRGB.name):
                            pix = fitz.Pixmap(fitz.csRGB, pix)
                        feature = Feature.metadata(value={"type": "image", "page": page_number})
                        contents.append(
                            Content(content_type="image/png", data=pix.tobytes(), features=[feature])
                        )
        finally:
            # Release the document even if a page fails to process.
            doc.close()

        if "table" in wanted:
            # Tables are extracted once for the whole document. The result is
            # iterated as a mapping from page identifier to JSON-serializable
            # table data. Distinct loop names keep the `content` parameter and
            # the page objects above from being shadowed.
            for table_page, table in get_tables(pdf_bytes).items():
                feature = Feature.metadata(value={"type": "table", "page": int(table_page)})
                contents.append(
                    Content(content_type="application/json", data=json.dumps(table), features=[feature])
                )

        return contents