Spaces:

ShebMichel
/

GeoScience_Exam_Marker

Sleeping

App Files Files Community

ShebMichel committed on Nov 1, 2024

Commit

e251c7d

verified ·

1 Parent(s): 472db8f

Upload exam_data_scrapper.py

Browse files

Files changed (1) hide show

exam_data_scrapper.py +90 -0

exam_data_scrapper.py ADDED Viewed

	@@ -0,0 +1,90 @@

+#!pip install python-docx
+#!pip install PyPDF2 --upgrade
+import os
+import json
+from PyPDF2 import PdfReader
+from docx import Document
def extract_from_pdf(pdf_path):
    """Extract the text of every page in a PDF file.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages concatenated in page order, with no separator
        inserted between pages.
    """
    with open(pdf_path, "rb") as pdf_file:
        reader = PdfReader(pdf_file)
        # Iterate the pages directly and join once, rather than indexing with
        # range(len(...)) and growing a string with += on every page.
        return "".join(page.extract_text() for page in reader.pages)
def extract_from_json(json_path):
    """Load and return the parsed contents of a JSON file.

    Args:
        json_path: Path of the JSON file to read.

    Returns:
        The decoded JSON value, exactly as produced by ``json.load``.

    Raises:
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8: without an encoding argument open() uses the
    # platform locale, which makes non-ASCII text load differently (or fail)
    # depending on the machine the script runs on.
    with open(json_path, "r", encoding="utf-8") as json_file:
        return json.load(json_file)
def extract_from_word(word_path):
    """Extract the paragraph text of a Word (.docx) file.

    Args:
        word_path: Path of the .docx file to read.

    Returns:
        The text of every paragraph in ``doc.paragraphs``, in order, each
        followed by a newline character.
    """
    doc = Document(word_path)
    # Build the result with a single join instead of repeated += on a string.
    return "".join(para.text + "\n" for para in doc.paragraphs)
def extract_data(file_path):
    """Extract data from a file, choosing the reader from its extension.

    Supported extensions are ``.pdf``, ``.json`` and ``.docx``; they are
    matched case-insensitively.

    Args:
        file_path: Path of the file to read.

    Returns:
        Whatever the matching reader returns: extracted text for PDF and
        Word files, the parsed JSON value for JSON files.

    Raises:
        ValueError: If the extension is not one of the supported ones.
    """
    _, file_extension = os.path.splitext(file_path)
    # Compare on the lower-cased extension so "Exam.PDF" or "exam.Json" are
    # dispatched like their lower-case spellings instead of being rejected.
    normalized = file_extension.lower()
    if normalized == ".pdf":
        return extract_from_pdf(file_path)
    if normalized == ".json":
        return extract_from_json(file_path)
    if normalized == ".docx":
        return extract_from_word(file_path)
    # Report the extension exactly as it appeared in the path.
    raise ValueError("Unsupported file extension: " + file_extension)
def create_data_dictionary(files):
    """Build a mapping from each file path to the data extracted from it.

    Args:
        files: Iterable of file paths to process.

    Returns:
        A dict keyed by file path. A file whose extraction raises
        ``ValueError`` has the error printed and is left out of the result.
    """
    extracted = {}
    for source in files:
        try:
            content = extract_data(source)
        except ValueError as err:
            # Report the problem and carry on with the remaining files.
            print(err)
        else:
            extracted[source] = content
    return extracted
# Usage example: load the JSON version of the exam and print it section by
# section. The directory below is machine-specific.
path      = r'C:\Users\00110138\OneDrive - The University of Western Australia\Project\KaggleX FellowshipProgram\code\Exam_Data'
files     = [f"{path}/Geology_Geophysics_Exam{ext}" for ext in (".pdf", ".json", ".docx")]
exam_data = [files[1]]  # only the JSON file is processed
data_dict = create_data_dictionary(exam_data)

# Keys read from each part of the exam.
school_data   = ['university','department','course_code','course_title','date','duration','instructor']
qcm_data      = ['question','options', 'answer']
short_data    = ['question','answer']

# Everything below reads from the single exam that was loaded.
exam = data_dict[exam_data[0]]
multiple_choice_questions = exam['multiple_choice_questions']
short_answer_questions    = exam['short_answer_questions']
long_answer_questions     = exam['long_answer_questions']

header = exam['header']
for s_data in school_data:
    print(f" {s_data}: {header[s_data]}")
print("***************'school data'************************")

# The printed labels are the capitalized key names ('Question', 'Options',
# 'Answer'), so the key lists above drive the output directly.
for idx, qcm in enumerate(multiple_choice_questions):
    for key in qcm_data:
        print(f" Index is: {idx} and '{key.capitalize()}': {qcm[key]}")
print("***************'multiple_choice_questions'************************")

for idx, qcm in enumerate(short_answer_questions):
    for key in short_data:
        print(f" Index is: {idx} and '{key.capitalize()}': {qcm[key]}")
print("***************' END short_answer_questions'************************")

print("***************' START long_answer_questions'************************")
for idx, qcm in enumerate(long_answer_questions):
    for key in short_data:
        print(f" Index is: {idx} and '{key.capitalize()}': {qcm[key]}")
print("***************' END long_answer_questions'************************")