import json

import PyPDF2

# Path of the medical book PDF to convert.
# NOTE(review): the file name is spelled "medicin" — confirm it matches the file on disk.
pdf_path = "gale_of_medicin.pdf"


def extract_text(pdf_path):
    """Return the text of every page in the PDF at *pdf_path* as one string.

    The file is opened in binary mode and read with ``PyPDF2.PdfReader``.
    The text of each page is followed by a single newline, so a PDF with
    no pages yields an empty string.

    Args:
        pdf_path: Path to the PDF file to read.

    Returns:
        The concatenated page texts, each terminated by ``"\n"``.

    Raises:
        OSError: If the file cannot be opened (e.g. ``FileNotFoundError``).
    """
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        # `or ""` guards against a page whose extraction yields None
        # (presumably pages without a text layer on some PyPDF2 versions),
        # which would otherwise raise TypeError on concatenation.
        # The join must happen while the file is still open, because the
        # pages are read through the open file handle.
        return "".join(
            (page.extract_text() or "") + "\n" for page in reader.pages
        )


# Extract the text and save it as a single-record JSON dataset.
text_data = extract_text(pdf_path)
dataset = [{"prompt": "Medical Query", "response": text_data}]

with open("medical_dataset.json", "w", encoding="utf-8") as f:
    # ensure_ascii=False keeps non-ASCII characters readable in the output.
    json.dump(dataset, f, ensure_ascii=False, indent=4)

print("✅ Extracted text saved as `medical_dataset.json`")