nadeen-elsayed committed on
Commit
6ab3b20
·
verified ·
1 Parent(s): 05558cb

Create extract_text.py

Browse files
Files changed (1) hide show
  1. extract_text.py +22 -0
extract_text.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import PyPDF2
2
+ import json
3
+
4
+ # Path to the medical book PDF to extract text from
5
+ pdf_path = "gale_of_medicin.pdf"
6
+
7
def extract_text(pdf_path):
    """Return the text of every page in the PDF at *pdf_path*.

    Each page's text is followed by a newline, so the result ends with a
    newline whenever the document has at least one page; a document with
    no pages yields an empty string.

    Args:
        pdf_path: Path of the PDF file; it is opened in binary mode.

    Returns:
        The page texts concatenated into a single string.
    """
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        # Join once instead of growing a string with ``+=`` per page.
        # ``or ""`` keeps a page whose extraction yields nothing (``None``
        # or an empty string) from raising on the concatenation.
        # The join is eager, so all pages are read while the file is open.
        return "".join(
            (page.extract_text() or "") + "\n" for page in reader.pages
        )
14
+
15
# Extract the book's text and save it as a one-record JSON dataset.
text_data = extract_text(pdf_path)

# The entire extracted text becomes the response of a single fixed prompt.
dataset = [{"prompt": "Medical Query", "response": text_data}]
# ensure_ascii=False writes non-ASCII characters as-is, so the file is
# opened with an explicit UTF-8 encoding.
with open("medical_dataset.json", "w", encoding="utf-8") as f:
    json.dump(dataset, f, ensure_ascii=False, indent=4)

# The check mark was mis-encoded in the message; restored to the real glyph.
print("✅ Extracted text saved as `medical_dataset.json`")