Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -27,12 +27,11 @@ def xml2text(xml):
|
|
27 |
|
28 |
def clean_text(content):
|
29 |
"""Cleans text content based on the 'clean' parameter."""
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
content = content.strip()
|
36 |
return content
|
37 |
|
38 |
|
@@ -66,7 +65,7 @@ def extract_text_from_docx(docx_data, clean=True):
|
|
66 |
text += xml2text(zipf.read(fname))
|
67 |
|
68 |
zipf.close()
|
69 |
-
if clean
|
70 |
text = clean_text(text)
|
71 |
return text, len(text)
|
72 |
|
@@ -100,7 +99,7 @@ def read_document(file, clean=True):
|
|
100 |
for cell in row:
|
101 |
if cell.value is not None:
|
102 |
content += str(cell.value) + ' '
|
103 |
-
if clean
|
104 |
content = clean_text(content)
|
105 |
return content, len(content)
|
106 |
except Exception as e:
|
|
|
27 |
|
28 |
def clean_text(content):
|
29 |
"""Cleans text content based on the 'clean' parameter."""
|
30 |
+
content = content.replace('\n', ' ')
|
31 |
+
content = content.replace('\r', ' ')
|
32 |
+
content = content.replace('\t', ' ')
|
33 |
+
content = content.replace('  ', ' ') # Replace double spaces with single
|
34 |
+
content = content.strip()
|
|
|
35 |
return content
|
36 |
|
37 |
|
|
|
65 |
text += xml2text(zipf.read(fname))
|
66 |
|
67 |
zipf.close()
|
68 |
+
if clean:
|
69 |
text = clean_text(text)
|
70 |
return text, len(text)
|
71 |
|
|
|
99 |
for cell in row:
|
100 |
if cell.value is not None:
|
101 |
content += str(cell.value) + ' '
|
102 |
+
if clean:
|
103 |
content = clean_text(content)
|
104 |
return content, len(content)
|
105 |
except Exception as e:
|