ShebMichel committed on
Commit
e251c7d
·
verified ·
1 Parent(s): 472db8f

Upload exam_data_scrapper.py

Browse files
Files changed (1) hide show
  1. exam_data_scrapper.py +90 -0
exam_data_scrapper.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!pip install python-docx
2
+ #!pip install PyPDF2 --upgrade
3
+
4
+
5
+ import os
6
+ import json
7
+ from PyPDF2 import PdfReader
8
+ from docx import Document
9
+
10
def extract_from_pdf(pdf_path):
    """Extract the text of every page of a PDF file.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages concatenated in page order, with no
        separator inserted between pages.
    """
    with open(pdf_path, "rb") as pdf_file:
        reader = PdfReader(pdf_file)
        # Iterate the pages directly and join once, rather than indexing by
        # position and growing a string with ``+=``.  The ``or ""`` is a
        # defensive guard so a page that yields no text cannot break the join.
        return "".join(page.extract_text() or "" for page in reader.pages)
19
+
20
def extract_from_json(json_path):
    """Load and return the parsed contents of a JSON file.

    Args:
        json_path: Path of the JSON file to read.

    Returns:
        The decoded JSON document (whatever ``json.load`` produces for it).

    Raises:
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Read as UTF-8 explicitly: without it ``open`` falls back to the locale
    # encoding (e.g. a Windows code page), which corrupts non-ASCII text.
    with open(json_path, "r", encoding="utf-8") as json_file:
        return json.load(json_file)
25
+
26
def extract_from_word(word_path):
    """Return the text of a Word (.docx) document, one paragraph per line.

    Args:
        word_path: Path of the .docx file to read.

    Returns:
        The text of each paragraph in document order, every paragraph
        (including the last) followed by a newline.
    """
    document = Document(word_path)
    return "".join(f"{paragraph.text}\n" for paragraph in document.paragraphs)
33
+
34
def extract_data(file_path):
    """Extract data from a file, choosing the reader from its extension.

    Args:
        file_path: Path of the file to read. Supported extensions are
            ``.pdf``, ``.json`` and ``.docx``, matched case-insensitively.

    Returns:
        Whatever the matching reader returns: extracted text for PDF and
        Word files, the parsed document for JSON files.

    Raises:
        ValueError: If the extension is not one of the supported ones.
    """
    _, file_extension = os.path.splitext(file_path)
    # Match on the lower-cased extension so "EXAM.PDF" or "exam.Json" are
    # accepted; the original spelling is kept for the error message.
    normalized = file_extension.lower()

    if normalized == ".pdf":
        return extract_from_pdf(file_path)
    if normalized == ".json":
        return extract_from_json(file_path)
    if normalized == ".docx":
        return extract_from_word(file_path)
    raise ValueError("Unsupported file extension: " + file_extension)
46
+
47
def create_data_dictionary(files):
    """Build a mapping from each file path to the data extracted from it.

    Args:
        files: Iterable of file paths to process.

    Returns:
        A dict keyed by file path. A file whose extraction raises
        ``ValueError`` (for example an unsupported extension) has the error
        printed and is left out of the result; other exceptions propagate.
    """
    extracted = {}
    for source_path in files:
        try:
            content = extract_data(source_path)
        except ValueError as error:
            # Best effort: report the problem and carry on with the rest.
            print(error)
        else:
            extracted[source_path] = content
    return extracted
57
+
58
# Usage example: load the JSON export of the exam and print its contents.
path = r'C:\Users\00110138\OneDrive - The University of Western Australia\Project\KaggleX FellowshipProgram\code\Exam_Data'
# Build the candidate paths with os.path.join instead of gluing a hard-coded
# "/" onto the directory string.
files = [
    os.path.join(path, "Geology_Geophysics_Exam.pdf"),
    os.path.join(path, "Geology_Geophysics_Exam.json"),
    os.path.join(path, "Geology_Geophysics_Exam.docx"),
]
exam_data = [files[1]]  # only the JSON file is processed below
data_dict = create_data_dictionary(exam_data)

# Field names found in the exam JSON.
school_data = ['university', 'department', 'course_code', 'course_title', 'date', 'duration', 'instructor']
qcm_data = ['question', 'options', 'answer']
short_data = ['question', 'answer']

# Look the loaded exam up once rather than re-indexing data_dict every time.
exam = data_dict[exam_data[0]]
multiple_choice_questions = exam['multiple_choice_questions']
short_answer_questions = exam['short_answer_questions']
long_answer_questions = exam['long_answer_questions']

for s_data in school_data:
    print(f" {s_data}: {exam['header'][s_data]}")
print("***************'school data'************************")

for idx, qcm in enumerate(multiple_choice_questions):
    print(f" Index is: {idx} and 'Question': {qcm['question']}")
    print(f" Index is: {idx} and 'Options': {qcm['options']}")
    print(f" Index is: {idx} and 'Answer': {qcm['answer']}")
print("***************'multiple_choice_questions'************************")

for idx, qcm in enumerate(short_answer_questions):
    print(f" Index is: {idx} and 'Question': {qcm['question']}")
    print(f" Index is: {idx} and 'Answer': {qcm['answer']}")
print("***************' END short_answer_questions'************************")

print("***************' START long_answer_questions'************************")
for idx, qcm in enumerate(long_answer_questions):
    print(f" Index is: {idx} and 'Question': {qcm['question']}")
    print(f" Index is: {idx} and 'Answer': {qcm['answer']}")
print("***************' END long_answer_questions'************************")