File size: 635 Bytes
6ab3b20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
import PyPDF2
import json

# Location of the source PDF (the medical book) that this script reads.
pdf_path: str = "gale_of_medicin.pdf"

def extract_text(pdf_path):
    """Return the concatenated text of every page in the PDF at *pdf_path*.

    The file is opened in binary mode and parsed with ``PyPDF2.PdfReader``.
    Each page's extracted text is followed by a single newline, so the
    result for N pages contains N page segments, each newline-terminated.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        One string holding the text of all pages, in page order.

    Raises:
        OSError: If the file cannot be opened (propagated from ``open``).
    """
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        # Extraction must happen while the file is still open: the pages are
        # read through the open handle. A page that yields no text (None or
        # an empty string) contributes only its newline instead of raising
        # TypeError on ``None + "\n"``.
        return "".join(
            f"{page.extract_text() or ''}\n" for page in reader.pages
        )

# Extract the PDF text and persist it as a single-record JSON dataset.
text_data = extract_text(pdf_path)

# One record: a fixed placeholder prompt paired with the full extracted text.
dataset = [{"prompt": "Medical Query", "response": text_data}]

# ensure_ascii=False writes non-ASCII characters as-is (the file is UTF-8)
# rather than as \uXXXX escapes.
with open("medical_dataset.json", "w", encoding="utf-8") as f:
    json.dump(dataset, f, ensure_ascii=False, indent=4)

# The status marker was mojibake ("βœ…"); restored to the intended check mark.
print("✅ Extracted text saved as `medical_dataset.json`")