File size: 2,180 Bytes
cfe90dc
da9fcee
4caba26
193c7ed
decddfb
0f3972e
cfe90dc
 
c532c90
7284d40
 
 
b2010da
f3dd2c8
4cd6fae
9d7ed12
221fc7f
4cd6fae
c945f59
 
7f80b30
 
 
be7c8ff
03fc65f
 
4caba26
 
32b4458
1fede0d
32b4458
fc997be
32b4458
1fede0d
03fc65f
4caba26
 
be7c8ff
4caba26
 
 
 
a3ac33b
 
 
8acf5f0
 
 
 
 
a3ac33b
 
 
 
 
 
 
acc8cb5
d4ca6c7
d89899f
193c7ed
be7c8ff
193c7ed
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import datasets, os
import pandas as pd
from docx import Document
from pptx import Presentation

def get_questions(file_path, level):
    """Load GAIA validation questions, optionally restricted to one level.

    Args:
        file_path: Unused. Kept in the signature so existing callers keep
            working; the questions are always loaded via ``datasets``.
        level: Difficulty level to keep. Any value that is not greater
            than 0 keeps every question.

    Returns:
        A list with one ``[question, level, final_answer, file_name]``
        entry per selected row.
    """
    # token=True asks `datasets` to authenticate with the locally stored
    # Hugging Face token.
    split = datasets.load_dataset("gaia-benchmark/GAIA", "2023_all", token=True)["validation"]

    keep_all = not level > 0
    # Compare as strings so the filter works whether the dataset stores
    # "Level" as an int or as a str.
    wanted = str(level)

    # `split` is a datasets.Dataset, not a DataFrame: it offers neither
    # boolean-mask indexing nor iterrows(), so iterate its rows (dicts).
    return [
        [row["Question"], row["Level"], row["Final answer"], row["file_name"]]
        for row in split
        if keep_all or str(row["Level"]) == wanted
    ]

def is_ext(file_path, ext):
    """Return True when *file_path* ends with extension *ext*, ignoring case.

    *ext* is compared against the final suffix including its leading dot
    (for example ``".pdf"``).
    """
    _, suffix = os.path.splitext(file_path)
    expected = ext.lower()
    return suffix.lower() == expected
    
def read_file_json(file_path):
    """Load a tabular file with pandas and return it serialized as JSON.

    The loader is chosen from the file extension (case-insensitive):
    ``.csv``, ``.xls``/``.xlsx``, ``.json`` and ``.jsonl`` are supported.

    Args:
        file_path: Path of the file to read.

    Returns:
        The ``DataFrame.to_json()`` string for the loaded data, or ``""``
        when the extension is not one of the supported ones.
    """
    ext = os.path.splitext(file_path)[1].lower()

    if ext == ".csv":
        df = pd.read_csv(file_path)
    elif ext in (".xls", ".xlsx"):
        df = pd.read_excel(file_path)
    elif ext == ".json":
        df = pd.read_json(file_path)
    elif ext == ".jsonl":
        # JSON Lines stores one record per line; without lines=True pandas
        # rejects any file holding more than one record.
        df = pd.read_json(file_path, lines=True)
    else:
        return ""

    return df.to_json()

def read_docx_text(file_path):
    """Extract the text of a .docx file, keeping body order.

    Top-level paragraphs and tables are emitted in the order they occur in
    the document body. Paragraphs whose style name starts with "Heading"
    are wrapped in ``**`` markers and padded with a newline on each side;
    other paragraphs are included only when they have text. Every table
    row becomes one line made of its stripped cell texts joined by " | ".

    Args:
        file_path: Path (or file-like object) accepted by ``Document``.

    Returns:
        All collected pieces joined with newlines.
    """
    doc = Document(file_path)

    # Index the wrapper objects by their underlying XML element once, so
    # each body child is resolved with a dict lookup instead of rescanning
    # doc.paragraphs / doc.tables for every element.
    paragraph_by_element = {p._element: p for p in doc.paragraphs}
    table_by_element = {t._element: t for t in doc.tables}

    text = []

    for block in doc.element.body:
        paragraph = paragraph_by_element.get(block)
        if paragraph is not None:
            if paragraph.style.name.startswith("Heading"):
                text.append("\n**" + paragraph.text + "**\n")
            elif paragraph.text:
                text.append(paragraph.text)
            continue

        table = table_by_element.get(block)
        if table is not None:
            for row in table.rows:
                text.append(" | ".join(cell.text.strip() for cell in row.cells))

    return "\n".join(text)

def read_pptx_text(file_path):
    """Extract the text of a .pptx file, one chunk per slide.

    For each slide, the ``text`` of every shape that exposes such an
    attribute is collected and joined with newlines; the per-slide chunks
    are then joined with a blank line between them.

    Args:
        file_path: Path (or file-like object) accepted by ``Presentation``.

    Returns:
        The combined text of all slides as a single string.
    """
    deck = Presentation(file_path)

    per_slide = [
        "\n".join(
            shape.text
            for shape in slide.shapes
            # Shapes without a text attribute are skipped.
            if hasattr(shape, "text")
        )
        for slide in deck.slides
    ]

    return "\n\n".join(per_slide)