Spaces:

mariagrandury
/

fmti-transparency-self-assessment

Runtime error

App Files Files Community

fmti-transparency-self-assessment / pdf_parser.py

mariagrandury

feature: parse questions from official fmti indicators pdf

5368a96 about 2 years ago

raw

history blame

2.62 kB

	import csv
	import re

	from PyPDF2 import PdfReader


	def parse_pdf(pdf_path: str):
	    """Extract the indicator records from the PDF at ``pdf_path``.

	    The text of every page is scanned for entries laid out as::

	        <index>. <category>→<subcategory>→<characteristic>
	        •Definition : <definition>
	        •Notes : <notes>
	        •References : <references>

	    Args:
	        pdf_path: Path of the PDF file handed to ``PdfReader``.

	    Returns:
	        A list holding, for each match in page order, a list of seven
	        strings: index, category, subcategory, characteristic,
	        definition, notes and references.

	    Raises:
	        AssertionError: If any index from 1 to 100 was not found.
	    """
	    # Fields captured by the pattern:
	    # 1. A number followed by a period (the index).
	    # 2-4. Three runs of word characters, spaces and hyphens separated by
	    #      arrows (category, subcategory, characteristic).
	    # 5. Text after "•Definition :" up to the "•Notes :" line.
	    # 6. Text after "•Notes :" up to "•References :".
	    # 7. Text after "•References :" up to the next "<number>." at the start
	    #    of a line, or the end of the page text.
	    # The quantifiers must be lazy ("*?") so each field stops at the next
	    # marker, and the lookahead needs a real alternation ("|"), not an
	    # escaped literal pipe.
	    pattern = re.compile(
	        r"(\d+)\.\s+([\w\s-]+)→([\w\s-]+)→([\w\s-]+)\n"
	        r"•Definition :\s(.*?)\n"
	        r"•Notes :([\s\S]*?)"
	        r"•References :([\s\S]*?)"
	        r"(?=\n\d+\.|\Z)",
	        re.DOTALL,
	    )

	    reader = PdfReader(pdf_path)
	    extracted_data = []

	    # The pattern is applied to each page separately, so an entry whose
	    # text is split across two pages is not matched as a whole.
	    for page in reader.pages:
	        # Guard against a page for which no text could be extracted.
	        text = page.extract_text() or ""
	        for match in pattern.finditer(text):
	            extracted_data.append(
	                [match.group(field).strip() for field in range(1, 8)]
	            )

	    found_indices = {int(row[0]) for row in extracted_data}
	    missing_indices = [i for i in range(1, 101) if i not in found_indices]
	    if missing_indices:
	        # Raised explicitly (rather than via ``assert``) so the check is
	        # not stripped when Python runs with -O.
	        raise AssertionError(f"Missing indices: {missing_indices}")

	    return extracted_data


	# Parse the indicators PDF and dump the rows to a CSV file.
	data = parse_pdf(pdf_path="fmti_indicators.pdf")

	# Mode "w+" instead of "w": the rows are read back below, and a handle
	# opened write-only raises io.UnsupportedOperation on readlines().
	with open("fmti_indicators.csv", "w+", newline="", encoding="utf-8") as csv_file:
	    writer = csv.writer(csv_file)
	    writer.writerow(
	        [
	            "Index",
	            "Category",
	            "Subcategory",
	            "Characteristic",
	            "Definition",
	            "Notes",
	            "References",
	        ]
	    )
	    writer.writerows(data)

	    # Rewind while the handle is still open and read back what was written.
	    csv_file.seek(0)
	    lines = csv_file.readlines()