File size: 866 Bytes
3478195
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import re
from docx import Document
from helpers import get_doc_blocks

def get_ikz_pdf(pdf_blocks):
    """Collect every long digit run found in the given PDF text blocks.

    Each block is scanned for runs of 32 to 40 consecutive digits
    (presumably IKZ codes -- confirm the expected length with the caller).

    Args:
        pdf_blocks: Iterable of strings, one per extracted PDF text block.

    Returns:
        A set of the matched digit strings; empty if nothing matched or
        ``pdf_blocks`` is empty.
    """
    # Raw string: "\d" in a plain literal is an invalid escape sequence and
    # triggers a SyntaxWarning on recent Python versions.
    ikz_pattern = re.compile(r"\d{32,40}")

    ikz_pdf = set()
    for block in pdf_blocks:
        ikz_pdf.update(ikz_pattern.findall(block))
    return ikz_pdf


def get_ikz_doc(doc):
    """Collect IKZ-like codes from a document's text blocks and its tables.

    Two formats are recognised:
      * a run of 36 consecutive digits;
      * two digits followed by five hyphen-separated groups of 3 to 20
        digits each.

    The text blocks come from ``get_doc_blocks(doc)`` (assumed to yield
    strings -- verify against the helper). Every paragraph of every cell in
    ``doc.tables`` is scanned as well.

    Args:
        doc: Document object exposing ``tables``; each table exposes
            ``rows`` -> ``cells`` -> ``paragraphs`` -> ``text``.

    Returns:
        A set of all matched strings from both sources.
    """
    # Raw strings: "\d" in a plain literal is an invalid escape sequence and
    # triggers a SyntaxWarning on recent Python versions.
    ikz_patterns = [
        re.compile(r"\d{36}"),
        re.compile(r"(?:\d{2})(?:-\d{3,20}){5}"),
    ]

    def _find_codes(text):
        """Return every match of any IKZ pattern in *text*."""
        return [
            match
            for pattern in ikz_patterns
            for match in pattern.findall(text)
        ]

    ikz_docx = set()

    for docpara in get_doc_blocks(doc):
        ikz_docx.update(_find_codes(docpara))

    # Table cell text is scanned explicitly in addition to the text blocks.
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                for para in cell.paragraphs:
                    ikz_docx.update(_find_codes(para.text))

    return ikz_docx