File size: 2,180 Bytes
cfe90dc da9fcee 4caba26 193c7ed decddfb 0f3972e cfe90dc c532c90 7284d40 b2010da f3dd2c8 4cd6fae 9d7ed12 221fc7f 4cd6fae c945f59 7f80b30 be7c8ff 03fc65f 4caba26 32b4458 1fede0d 32b4458 fc997be 32b4458 1fede0d 03fc65f 4caba26 be7c8ff 4caba26 a3ac33b 8acf5f0 a3ac33b acc8cb5 d4ca6c7 d89899f 193c7ed be7c8ff 193c7ed |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 |
import datasets, os
import pandas as pd
from docx import Document
from pptx import Presentation
def get_questions(file_path, level):
    """Load GAIA validation questions, optionally restricted to one level.

    Args:
        file_path: Not used; the data is always fetched through
            ``datasets.load_dataset``. Kept so existing callers keep working.
        level: Difficulty level to keep. A value of 0 or less returns every
            question.

    Returns:
        A list with one ``[question, level, final_answer, file_name]`` list
        per selected row.
    """
    rows = datasets.load_dataset(
        "gaia-benchmark/GAIA", "2023_all", token=True
    )["validation"]

    # The split is a ``datasets.Dataset``, not a pandas DataFrame: it has no
    # ``iterrows()`` and cannot be filtered with a boolean mask, so walk its
    # rows (plain dicts) directly.
    # Compare levels as strings so the filter works whether the "Level"
    # column is stored as an int or as text.
    wanted = str(level)
    return [
        [row["Question"], row["Level"], row["Final answer"], row["file_name"]]
        for row in rows
        if level <= 0 or str(row["Level"]) == wanted
    ]
def is_ext(file_path, ext):
    """Tell whether *file_path* ends with the extension *ext*.

    The comparison ignores case. *ext* must include the leading dot
    (for example ``".pdf"``); a path without an extension only matches ``""``.
    """
    _, suffix = os.path.splitext(file_path)
    return suffix.lower() == ext.lower()
def read_file_json(file_path):
    """Load a tabular file with pandas and return it serialized as JSON.

    The reader is chosen from the file extension (case-insensitive):
    ``.csv``, ``.xls``/``.xlsx``, ``.json`` and ``.jsonl`` are supported.

    Args:
        file_path: Path of the file to read.

    Returns:
        The ``DataFrame.to_json()`` string for a supported file, or ``""``
        when the extension is not one of the supported ones.
    """
    ext = os.path.splitext(file_path)[1].lower()
    if ext == ".csv":
        df = pd.read_csv(file_path)
    elif ext in (".xls", ".xlsx"):
        df = pd.read_excel(file_path)
    elif ext == ".json":
        df = pd.read_json(file_path)
    elif ext == ".jsonl":
        # JSON Lines stores one record per line. Without lines=True pandas
        # parses the file as a single document and raises on the second line.
        df = pd.read_json(file_path, lines=True)
    else:
        return ""
    return df.to_json()
def read_docx_text(file_path):
    """Extract the text of a .docx file, keeping paragraphs and tables in order.

    The direct children of the document body are visited in sequence, so
    paragraphs and tables appear in the output in the order they have in
    the file:

    * a paragraph whose style name starts with "Heading" is wrapped in
      double asterisks and surrounded by newline characters;
    * any other paragraph is emitted unchanged, and empty ones are skipped;
    * each table row becomes one line, with its stripped cell texts joined
      by " | ".

    Args:
        file_path: Path of the .docx file.

    Returns:
        All collected pieces joined with newline characters.
    """
    doc = Document(file_path)

    # Index the wrappers by their underlying XML element once. Looking each
    # body child up here avoids rescanning doc.paragraphs / doc.tables for
    # every element, which made the original loop quadratic.
    paragraph_for = {para._element: para for para in doc.paragraphs}
    table_for = {tbl._element: tbl for tbl in doc.tables}

    pieces = []
    for block in doc.element.body:
        paragraph = paragraph_for.get(block)
        if paragraph is not None:
            if paragraph.style.name.startswith("Heading"):
                pieces.append("\n**" + paragraph.text + "**\n")
            elif paragraph.text:
                pieces.append(paragraph.text)
            continue

        table = table_for.get(block)
        if table is not None:
            for row in table.rows:
                pieces.append(" | ".join(cell.text.strip() for cell in row.cells))
        # Any other body child (for example section properties) has no text.
    return "\n".join(pieces)
def read_pptx_text(file_path):
    """Extract the text of a .pptx file, one chunk per slide.

    For every slide, the ``text`` of each shape that has a ``text``
    attribute is collected and joined with newlines. The per-slide chunks
    are then joined with a blank line between them.

    Args:
        file_path: Path of the .pptx file.

    Returns:
        The combined text of all slides as a single string.
    """
    deck = Presentation(file_path)
    per_slide = [
        "\n".join(
            shape.text for shape in slide.shapes if hasattr(shape, "text")
        )
        for slide in deck.slides
    ]
    return "\n\n".join(per_slide)