bstraehle committed on
Commit
4caba26
·
verified ·
1 Parent(s): 67a608a

Update util.py

Browse files
Files changed (1) hide show
  1. util.py +31 -3
util.py CHANGED
@@ -1,5 +1,7 @@
1
  import os
2
  import pandas as pd
 
 
3
 
4
  def get_questions(file_path, level):
5
  df = pd.read_json(file_path, lines=True)
@@ -13,10 +15,10 @@ def get_questions(file_path, level):
13
  return result
14
 
15
  def read_file(file_path):
16
- df = None
17
-
18
  ext = os.path.splitext(file_path)[1].lower()
19
 
 
 
20
  if ext == ".csv":
21
  df = pd.read_csv(file_path)
22
  elif ext in (".xls", ".xlsx"):
@@ -24,4 +26,30 @@ def read_file(file_path):
24
  elif ext in (".json", ".jsonl"):
25
  df = pd.read_json(file_path)
26
 
27
- return "" if df is None else df.to_json()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  import pandas as pd
3
+ from docx import Document
4
+ from pptx import Presentation
5
 
6
  def get_questions(file_path, level):
7
  df = pd.read_json(file_path, lines=True)
 
15
  return result
16
 
17
  def read_file(file_path):
 
 
18
  ext = os.path.splitext(file_path)[1].lower()
19
 
20
+ df = None
21
+
22
  if ext == ".csv":
23
  df = pd.read_csv(file_path)
24
  elif ext in (".xls", ".xlsx"):
 
26
  elif ext in (".json", ".jsonl"):
27
  df = pd.read_json(file_path)
28
 
29
+ return "" if df is None else df.to_json()
30
+
31
def read_docx(file_path):
    """Return the paragraph text of a Word document as one string.

    The file at ``file_path`` is opened with ``Document``; the ``text`` of
    each entry in ``paragraphs`` is collected in order and the pieces are
    joined with a single newline.
    """
    paragraphs = Document(file_path).paragraphs
    return "\n".join(paragraph.text for paragraph in paragraphs)
40
+
41
def read_pptx(file_path):
    """Return the text of a presentation, grouped slide by slide.

    The file at ``file_path`` is opened with ``Presentation``. For every
    slide, the ``text`` of each shape that exposes a ``text`` attribute is
    joined with a newline; the per-slide strings are then joined with a
    blank line between them.
    """
    deck = Presentation(file_path)
    per_slide = [
        # Shapes without a ``text`` attribute are skipped.
        "\n".join(
            shape.text for shape in slide.shapes if hasattr(shape, "text")
        )
        for slide in deck.slides
    ]
    return "\n\n".join(per_slide)