File size: 2,617 Bytes
5368a96
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import csv
import re

from PyPDF2 import PdfReader


def parse_pdf(pdf_path: str, *, expected_count: int = 100):
    """Extract indicator records from a PDF and check that none are missing.

    The text of every page is printed to stdout and then scanned with a
    regular expression; each match becomes one row.

    Args:
        pdf_path: Path of the PDF file handed to ``PdfReader``.
        expected_count: Highest indicator index that must be present; the
            indices ``1..expected_count`` are all required. Defaults to 100.

    Returns:
        A list of rows, one per regex match, in page order. Each row is a
        list of seven stripped strings:
        ``[index, category, subcategory, characteristic, definition, notes,
        references]``.

    Raises:
        AssertionError: If any index in ``1..expected_count`` was not found.
    """
    # Capture groups, in order:
    # 1. A number followed by a period (the index).
    # 2. Word characters, spaces, and hyphens followed by an arrow (the category).
    # 3. Word characters, spaces, and hyphens followed by an arrow (the subcategory).
    # 4. Word characters, spaces, and hyphens (the characteristic).
    # 5. Text following "•Definition :" up to the "•Notes :" line. Because of
    #    re.DOTALL this may span several lines (the definition).
    # 6. Text following "•Notes :" until "•References :" (the notes).
    # 7. Text following "•References :" until a newline followed by a number
    #    and a period, or the end of the text being scanned (the references).
    # Compiled once here because the pattern is the same for every page.
    pattern = re.compile(
        r"(\d+)\.\s+([\w\s-]+)→([\w\s-]+)→([\w\s-]+)\n•Definition :\s(.*?)\n•Notes :([\s\S]*?)•References :([\s\S]*?)(?=(\n\d+\.)|\Z)",
        re.DOTALL,
    )

    reader = PdfReader(pdf_path)
    extracted_data = []

    # NOTE(review): each page is matched on its own, so an entry whose text
    # continues onto the following page is cut off at the page end or not
    # matched at all. Confirm the PDF never splits an entry across pages.
    for page in reader.pages:
        text = page.extract_text()
        print(text)

        for match in pattern.finditer(text):
            # Group 8 only exists inside the lookahead and is not part of the row.
            extracted_data.append(
                [field.strip() for field in match.group(1, 2, 3, 4, 5, 6, 7)]
            )

    # Validate by index rather than by row count: a count of exactly
    # expected_count can still hide gaps when some entries matched twice.
    # The check raises explicitly instead of using `assert`, which is removed
    # under `python -O`; AssertionError is kept so existing callers that catch
    # it keep working.
    found_indices = {int(row[0]) for row in extracted_data}
    missing_indices = [
        i for i in range(1, expected_count + 1) if i not in found_indices
    ]
    if missing_indices:
        raise AssertionError(f"Missing indices: {missing_indices}")

    return extracted_data


# Parse the indicator PDF; each returned row becomes one CSV record.
data = parse_pdf(pdf_path="fmti_indicators.pdf")

# The file is opened in "w+" rather than "w" because it is read back below:
# readlines() on a write-only handle raises io.UnsupportedOperation.
# newline="" lets the csv module control line endings itself.
with open("fmti_indicators.csv", "w+", newline="", encoding="utf-8") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(
        [
            "Index",
            "Category",
            "Subcategory",
            "Characteristic",
            "Definition",
            "Notes",
            "References",
        ]
    )
    writer.writerows(data)

    # Rewind (seek also flushes pending writes) and read back what was written.
    csv_file.seek(0)
    lines = csv_file.readlines()