Spaces:
Runtime error
Runtime error
Create extract_text.py
Browse files- extract_text.py +22 -0
extract_text.py
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import PyPDF2
|
2 |
+
import json
|
3 |
+
|
4 |
+
# Path of the medical book PDF to extract text from
|
5 |
+
pdf_path = "gale_of_medicin.pdf"
|
6 |
+
|
7 |
+
def extract_text(pdf_path: str) -> str:
    """Return the text of every page in the PDF at *pdf_path*.

    The file is opened in binary mode and read with ``PyPDF2.PdfReader``.
    Pages are processed in the order the reader yields them, and each
    page's extracted text is followed by a single newline character.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The page texts concatenated into one string; an empty string when
        the reader yields no pages.
    """
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        # Build the result with one join rather than ``text += ...`` per
        # page: repeated concatenation can copy the accumulated string on
        # every iteration, which is quadratic for a book-sized PDF.
        # The join is fully evaluated here, while the file is still open.
        return "".join(page.extract_text() + "\n" for page in reader.pages)
|
14 |
+
|
15 |
+
# Extract the text of the PDF and save it as a JSON dataset.
text_data = extract_text(pdf_path)

# The dataset is a single prompt/response record whose response is the
# complete extracted text.
dataset = [{"prompt": "Medical Query", "response": text_data}]
with open("medical_dataset.json", "w", encoding="utf-8") as f:
    # ensure_ascii=False keeps non-ASCII characters as-is in the output
    # instead of \uXXXX escapes; the file is opened as UTF-8 to match.
    json.dump(dataset, f, ensure_ascii=False, indent=4)

# The message must be one single-line string literal; the check-mark
# emoji had been mis-decoded into "β" plus a line break, which split the
# literal across two lines.
print("✅ Extracted text saved as `medical_dataset.json`")
|