Spaces:
Runtime error
Runtime error
| import csv | |
| import re | |
| from PyPDF2 import PdfReader | |
# Matches one indicator entry. Capture groups, in order:
#   1. index          - digits before the first period
#   2. category       - word characters, spaces and hyphens before the first arrow
#   3. subcategory    - same character set, before the second arrow
#   4. characteristic - same character set, up to the newline before "•Definition :"
#   5. definition     - text after "•Definition :" up to "\n•Notes :"
#   6. notes          - text after "•Notes :" up to "•References :"
#   7. references     - text after "•References :" up to the next "\n<digits>."
#                       or the end of the text
# Group 8 belongs to the trailing lookahead and is not used.
# re.DOTALL lets "." in the definition group span line breaks.
_INDICATOR_PATTERN = re.compile(
    r"(\d+)\.\s+([\w\s-]+)→([\w\s-]+)→([\w\s-]+)\n•Definition :\s(.*?)\n•Notes :([\s\S]*?)•References :([\s\S]*?)(?=(\n\d+\.)|\Z)",
    re.DOTALL,
)


def parse_pdf(pdf_path: str, expected_count: int = 100) -> list:
    """Extract indicator records from the PDF at *pdf_path*.

    The text of every page is concatenated (newline-separated) and scanned
    once with ``_INDICATOR_PATTERN``.

    Args:
        pdf_path: Path of the PDF file, passed to ``PdfReader``.
        expected_count: Number of indicators the document should contain.
            Indices ``1..expected_count`` are the ones checked for presence.

    Returns:
        A list with one 7-item list of strings per matched entry, in document
        order: ``[index, category, subcategory, characteristic, definition,
        notes, references]``, each stripped of surrounding whitespace.

    Raises:
        AssertionError: If the number of matches differs from
            *expected_count* and at least one index in ``1..expected_count``
            was not found. A count mismatch with no missing index (for
            example, duplicate matches) is accepted.
    """
    reader = PdfReader(pdf_path)

    # Scan the whole document rather than one page at a time: an entry whose
    # fields straddle a page break cannot match within a single page's text,
    # and its references would otherwise be cut off at the end of the page.
    # NOTE(review): any running header/footer text at a page boundary ends up
    # inside the preceding entry's references - confirm against the real PDF.
    text = "\n".join(page.extract_text() or "" for page in reader.pages)

    extracted_data = [
        [field.strip() for field in match.group(1, 2, 3, 4, 5, 6, 7)]
        for match in _INDICATOR_PATTERN.finditer(text)
    ]

    # Raise explicitly instead of using `assert`, which is removed under
    # `python -O`; AssertionError is kept so existing handlers still apply.
    if len(extracted_data) != expected_count:
        found_indices = {int(row[0]) for row in extracted_data}
        missing_indices = [
            i for i in range(1, expected_count + 1) if i not in found_indices
        ]
        if missing_indices:
            raise AssertionError(f"Missing indices: {missing_indices}")

    return extracted_data
# Parse the indicator PDF; `data` holds one row (a list of fields) per entry.
data = parse_pdf(pdf_path="fmti_indicators.pdf")

# Mode is "w+" rather than "w" because the file is read back through this same
# handle below; a write-only handle raises io.UnsupportedOperation on
# readlines(). newline="" leaves line-ending handling to the csv writer.
with open("fmti_indicators.csv", "w+", newline="", encoding="utf-8") as csv_file:
    writer = csv.writer(csv_file)
    # Header row; column order matches the field order of each row in `data`.
    writer.writerow(
        [
            "Index",
            "Category",
            "Subcategory",
            "Characteristic",
            "Definition",
            "Notes",
            "References",
        ]
    )
    writer.writerows(data)

    # Rewind and read the freshly written CSV back as raw text lines. This
    # must happen while the handle is still open, so it stays inside `with`.
    csv_file.seek(0)
    lines = csv_file.readlines()