# Spaces: Runtime error (status banner captured along with the source; not part of the script)
import json

import PyPDF2
# Path of the medical book PDF that the script reads; passed to
# extract_text() below. Relative, so it is resolved against the current
# working directory.
pdf_path = "gale_of_medicin.pdf"
def extract_text(pdf_path: str) -> str:
    """Return the text of every page of a PDF as one string.

    Pages are read in document order and each page's text is followed by a
    single newline.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The concatenated page texts; an empty string if the PDF has no pages.

    Raises:
        OSError: If the file cannot be opened (propagated from ``open``).
    """
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        # Extract while the file is still open: the reader pulls page data
        # from the underlying stream. Joining once avoids rebuilding the
        # growing string on every page.
        # Defensive: if a page's extract_text() yields None, contribute an
        # empty string instead of raising TypeError on concatenation.
        return "".join(
            (page.extract_text() or "") + "\n" for page in reader.pages
        )
# Extract the book text and wrap it in a one-record dataset: the entire
# extracted text is stored as the response to the fixed prompt
# "Medical Query".
text_data = extract_text(pdf_path)
dataset = [{"prompt": "Medical Query", "response": text_data}]

# Write UTF-8 with ensure_ascii=False so non-ASCII characters are stored
# verbatim rather than as \uXXXX escapes.
with open("medical_dataset.json", "w", encoding="utf-8") as f:
    json.dump(dataset, f, ensure_ascii=False, indent=4)

# NOTE(review): the leading "β" looks like a mis-decoded status symbol —
# confirm the intended character before changing this message.
print("β Extracted text saved as `medical_dataset.json`")