File size: 1,945 Bytes
dd80d25 da9fcee 4caba26 193c7ed decddfb 0f3972e b2010da 0f3972e b2010da f3dd2c8 4cd6fae 9d7ed12 0fccc5c 4cd6fae c945f59 7f80b30 03fc65f 4caba26 32b4458 1fede0d 32b4458 1fede0d 32b4458 1fede0d 03fc65f 4caba26 a3ac33b d89899f a3ac33b d89899f d4ca6c7 d89899f 193c7ed |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 |
import os
import pandas as pd
from docx import Document
from pptx import Presentation
def get_questions(file_path, level):
    """Load questions of one level from a JSON Lines file.

    The file is parsed with ``pandas.read_json(..., lines=True)`` and only
    rows whose ``"Level"`` column equals *level* are kept.

    Args:
        file_path: Path to the JSON Lines file.
        level: Value compared (``==``) against the ``"Level"`` column.

    Returns:
        A list with one ``[Level, Question, file_name, Final answer]`` list
        per matching row, in file order.
    """
    frame = pd.read_json(file_path, lines=True)
    matching = frame[frame["Level"] == level]
    return [
        [
            record["Level"],
            record["Question"],
            record["file_name"],
            record["Final answer"],
        ]
        for _, record in matching.iterrows()
    ]
def is_ext(file_path, ext):
    """Return True when *file_path* ends with the extension *ext*.

    The comparison is case-insensitive. *ext* is matched against the suffix
    produced by ``os.path.splitext``, so it must include the leading dot
    (e.g. ``".pdf"``).
    """
    _, suffix = os.path.splitext(file_path)
    return suffix.lower() == ext.lower()
def read_file(file_path):
    """Read a tabular data file and return its contents as a JSON string.

    The loader is chosen from the (case-insensitive) file extension:

    * ``.csv``            -> ``pandas.read_csv``
    * ``.xls`` / ``.xlsx`` -> ``pandas.read_excel``
    * ``.json``           -> ``pandas.read_json``
    * ``.jsonl``          -> ``pandas.read_json(..., lines=True)``

    Args:
        file_path: Path to the file to load.

    Returns:
        ``DataFrame.to_json()`` of the loaded data, or ``""`` when the
        extension is not one of the supported ones (the file is not opened
        in that case).
    """
    ext = os.path.splitext(file_path)[1].lower()

    if ext == ".csv":
        df = pd.read_csv(file_path)
    elif ext in (".xls", ".xlsx"):
        df = pd.read_excel(file_path)
    elif ext == ".json":
        df = pd.read_json(file_path)
    elif ext == ".jsonl":
        # JSON Lines holds one record per line; without lines=True pandas
        # tries to parse the whole file as a single JSON document and fails
        # on any file with more than one record.
        df = pd.read_json(file_path, lines=True)
    else:
        return ""

    return df.to_json()
def read_docx(file_path):
    """Extract the text of a .docx file, keeping paragraphs and tables in order.

    The document body is walked element by element so that paragraphs and
    tables appear in the output in the same order as in the document.

    * Each non-empty paragraph contributes ``paragraph.text + "\\n"``.
    * Each table row contributes its stripped cell texts joined by ``" | "``
      followed by ``"\\n"``.

    Args:
        file_path: Path passed to ``docx.Document``.

    Returns:
        All collected pieces joined with ``"\\n"`` (so consecutive pieces are
        separated by a blank line).
    """
    doc = Document(file_path)

    # Index the wrapper objects by their underlying XML element once, instead
    # of rescanning doc.paragraphs / doc.tables for every body element (each
    # access to those properties rebuilds the full list).
    paragraph_by_element = {p._element: p for p in doc.paragraphs}
    table_by_element = {t._element: t for t in doc.tables}

    text = []
    for block in doc.element.body:
        tag = block.tag
        if tag.endswith("p"):
            # A tag may end in "p" without being a top-level paragraph; such
            # elements have no entry in the index and are skipped.
            paragraph = paragraph_by_element.get(block)
            if paragraph is not None:
                content = paragraph.text
                if content:
                    text.append(content + "\n")
        elif tag.endswith("tbl"):
            table = table_by_element.get(block)
            if table is not None:
                for row in table.rows:
                    cells = [cell.text.strip() for cell in row.cells]
                    text.append(" | ".join(cells) + "\n")
    return "\n".join(text)
def read_pptx(file_path):
    """Extract the text of every slide in a .pptx file.

    For each slide, the ``text`` of every shape that exposes a ``text``
    attribute is collected and joined with ``"\\n"``.

    Args:
        file_path: Path passed to ``pptx.Presentation``.

    Returns:
        The per-slide texts joined with ``"\\n\\n"``; a slide without any
        text-bearing shape contributes an empty string.
    """
    deck = Presentation(file_path)
    per_slide = [
        "\n".join(
            shape.text for shape in slide.shapes if hasattr(shape, "text")
        )
        for slide in deck.slides
    ]
    return "\n\n".join(per_slide)