Spaces:
Runtime error
Runtime error
File size: 2,617 Bytes
5368a96 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 |
import csv
import re
from PyPDF2 import PdfReader
def parse_pdf(pdf_path: str):
    """Extract indicator records from the PDF at *pdf_path*.

    The text of every page is scanned independently for entries laid out as::

        <index>. <category>→<subcategory>→<characteristic>
        •Definition : <definition>
        •Notes : <notes>
        •References : <references>

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A list of rows ``[index, category, subcategory, characteristic,
        definition, notes, references]``. Every field is a string with
        surrounding whitespace stripped; rows appear in the order found.

    Raises:
        AssertionError: If the number of rows found is not 100 and at least
            one index in 1..100 is absent from the extracted rows.
    """
    reader = PdfReader(pdf_path)

    # Regular expression pattern to capture the required fields:
    # 1. A number followed by a period (the index).
    # 2. Word characters, spaces and hyphens followed by an arrow (the category).
    # 3. Word characters, spaces and hyphens followed by an arrow (the subcategory).
    # 4. Word characters, spaces and hyphens up to a newline (the characteristic).
    # 5. Text after "•Definition :" up to the first "\n•Notes :" (the definition;
    #    re.DOTALL lets it span several lines).
    # 6. Text after "•Notes :" up to "•References :" (the notes).
    # 7. Text after "•References :" up to the next line that starts with
    #    "<number>." or the end of the page text (the references).
    pattern = r"(\d+)\.\s+([\w\s-]+)→([\w\s-]+)→([\w\s-]+)\n•Definition :\s(.*?)\n•Notes :([\s\S]*?)•References :([\s\S]*?)(?=(\n\d+\.)|\Z)"
    # Compile once instead of handing the raw pattern to `re` for every page.
    entry_re = re.compile(pattern, re.DOTALL)

    extracted_data = []
    for page in reader.pages:
        text = page.extract_text()
        # Echo the raw page text, as the original script did.
        print(text)

        for match in entry_re.finditer(text):
            index = match.group(1)
            # Groups 2-7 are free text; trim the whitespace the pattern
            # captures around them.
            fields = [match.group(n).strip() for n in range(2, 8)]
            extracted_data.append([index, *fields])

    # Explicit raise rather than `assert`: assert statements are removed when
    # Python runs with -O, which would silently disable this validation.
    # The exception type is kept as AssertionError for existing callers.
    if len(extracted_data) != 100:
        found_indices = {int(row[0]) for row in extracted_data}
        missing_indices = [i for i in range(1, 101) if i not in found_indices]
        if missing_indices:
            raise AssertionError(f"Missing indices: {missing_indices}")

    return extracted_data
# Parse the indicator PDF and dump the rows to a CSV file next to it.
data = parse_pdf(pdf_path="fmti_indicators.pdf")

# "w+" rather than "w": the file is read back below, and a write-only handle
# raises io.UnsupportedOperation on readlines().
# newline="" leaves line-ending handling to the csv module.
with open("fmti_indicators.csv", "w+", newline="", encoding="utf-8") as csv_file:
    writer = csv.writer(csv_file)
    # Header row; column order matches the rows returned by parse_pdf.
    writer.writerow(
        [
            "Index",
            "Category",
            "Subcategory",
            "Characteristic",
            "Definition",
            "Notes",
            "References",
        ]
    )
    writer.writerows(data)

    # Rewind and read back what was just written.
    csv_file.seek(0)
    lines = csv_file.readlines()
|