notabaka committed on
Commit
958bbd7
·
1 Parent(s): c208ca1
Files changed (1) hide show
  1. app.py +10 -5
app.py CHANGED
@@ -38,11 +38,16 @@ def extract_text(doc):
38
  return doc.read().decode('utf-8')
39
 
40
  if doc.name.endswith(".pdf"):
41
- raw = doc.read().decode('latin-1')
42
-
43
- with pdfplumber.open(io.BytesIO(raw)) as pdf:
44
- pages = [page.extract_text() for page in pdf.pages]
45
- return "\n".join(pages)
 
 
 
 
 
46
  if doc.name.endswith('.docx'):
47
  raw_text = doc.read()
48
  return docx2txt.process(raw_text)
 
38
  return doc.read().decode('utf-8')
39
 
40
  if doc.name.endswith(".pdf"):
41
+ raw = doc.read()
42
+
43
+ # Remove null bytes without decoding
44
+ raw = raw.replace(b'\x00', b'')
45
+
46
+ pdf = pdfplumber.open(BytesIO(raw))
47
+ pages = [page.extract_text() for page in pdf.pages]
48
+ return "\n".join(pages)
49
+
50
+
51
  if doc.name.endswith('.docx'):
52
  raw_text = doc.read()
53
  return docx2txt.process(raw_text)