medical_chatbot / extract_text.py
nadeen-elsayed's picture
Create extract_text.py
6ab3b20 verified
raw
history blame contribute delete
635 Bytes
import PyPDF2
import json
# Path to the medical book PDF that this script reads (nothing is loaded here).
pdf_path = "gale_of_medicin.pdf"
def extract_text(pdf_path):
    """Return the concatenated text of every page in a PDF file.

    Args:
        pdf_path: Path to the PDF file to read.

    Returns:
        One string containing the extracted text of each page in order,
        with a newline appended after every page.
    """
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        # Extraction must happen while the file is still open, so the list
        # is built inside the ``with`` block.
        # Defensive: if a page yields no text (None or empty), contribute
        # only the newline instead of failing on ``None + "\n"``.
        page_texts = [
            (page.extract_text() or "") + "\n" for page in reader.pages
        ]
    # A single join avoids repeated string re-allocation for large books.
    return "".join(page_texts)
# Extract the PDF text and save it as a one-record JSON dataset.
text_data = extract_text(pdf_path)
# The whole extracted text becomes the response of a single placeholder prompt.
dataset = [{"prompt": "Medical Query", "response": text_data}]
# ensure_ascii=False writes non-ASCII characters as-is into the UTF-8 file
# instead of \uXXXX escapes.
with open("medical_dataset.json", "w", encoding="utf-8") as f:
    json.dump(dataset, f, ensure_ascii=False, indent=4)
# The leading check mark was mis-decoded (mojibake) in the original message.
print("✅ Extracted text saved as `medical_dataset.json`")