Spaces:
Sleeping
Sleeping
Lucas ARRIESSE
committed on
Commit
·
256eefa
1
Parent(s):
4edd44f
Hopefully fix task hang issues
Browse files- api/docs.py +42 -43
api/docs.py
CHANGED
@@ -41,7 +41,7 @@ KREUZBERG_CONFIG: ExtractionConfig = ExtractionConfig(
|
|
41 |
force_ocr=False, ocr_backend=None, extract_tables=True)
|
42 |
|
43 |
# Unfortunately needs to be kept to 1, as libreoffice isn't built to support parallel instances
|
44 |
-
LO_CONVERSION_MUTEX = asyncio.
|
45 |
|
46 |
# Supported file types for text extraction and their MIME type
|
47 |
FORMAT_MIME_TYPES = {
|
@@ -65,54 +65,53 @@ async def convert_file_type(contents: io.BytesIO, filename: str, input_ext: str,
|
|
65 |
filter: The conversion filter to use.
|
66 |
"""
|
67 |
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
)
|
96 |
|
97 |
-
|
98 |
|
99 |
-
|
100 |
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
|
109 |
-
|
110 |
|
111 |
-
|
112 |
-
|
113 |
|
114 |
-
|
115 |
-
|
116 |
|
117 |
|
118 |
async def extract_text_contents(filename: str, ext: str, bytes: io.BytesIO) -> list[str]:
|
|
|
41 |
force_ocr=False, ocr_backend=None, extract_tables=True)
|
42 |
|
43 |
# Unfortunately needs to be kept to 1, as libreoffice isn't built to support parallel instances
|
44 |
+
LO_CONVERSION_MUTEX = asyncio.Lock()
|
45 |
|
46 |
# Supported file types for text extraction and their MIME type
|
47 |
FORMAT_MIME_TYPES = {
|
|
|
65 |
filter: The conversion filter to use.
|
66 |
"""
|
67 |
|
68 |
+
async with LO_CONVERSION_MUTEX:
|
69 |
+
with tempfile.TemporaryDirectory() as tmpdir:
|
70 |
+
dir_path = Path(tmpdir)
|
71 |
+
input_file_path = dir_path / f"{filename}.{input_ext}"
|
72 |
+
output_file_path = dir_path / f"{filename}.{output_ext}"
|
73 |
+
|
74 |
+
# write the memory contents to the input file
|
75 |
+
with open(input_file_path, "wb") as in_file:
|
76 |
+
in_file.write(contents.read())
|
77 |
+
|
78 |
+
out_bytes = io.BytesIO()
|
79 |
+
|
80 |
+
# construct the command
|
81 |
+
command = [
|
82 |
+
"libreoffice",
|
83 |
+
"--headless",
|
84 |
+
"--convert-to", f"{output_ext}:{filter}" if filter else output_ext,
|
85 |
+
"--outdir", tmpdir,
|
86 |
+
str(input_file_path) # Ensure path is a string for subprocess
|
87 |
+
]
|
88 |
+
|
89 |
+
# convert using libreoffice asynchronously
|
90 |
+
process = await asyncio.create_subprocess_exec(
|
91 |
+
*command,
|
92 |
+
stdout=asyncio.subprocess.PIPE,
|
93 |
+
stderr=asyncio.subprocess.PIPE
|
94 |
+
)
|
|
|
95 |
|
96 |
+
stdout, stderr = await process.communicate()
|
97 |
|
98 |
+
exit_code = await process.wait()
|
99 |
|
100 |
+
if exit_code != 0 and not output_file_path.exists():
|
101 |
+
raise subprocess.CalledProcessError(
|
102 |
+
exit_code,
|
103 |
+
command,
|
104 |
+
output=stdout,
|
105 |
+
stderr=stderr
|
106 |
+
)
|
107 |
|
108 |
+
# LO_CONVERSION_MUTEX.release()
|
109 |
|
110 |
+
with open(output_file_path, mode="rb") as out:
|
111 |
+
out_bytes.write(out.read())
|
112 |
|
113 |
+
out_bytes.seek(0)
|
114 |
+
return out_bytes
|
115 |
|
116 |
|
117 |
async def extract_text_contents(filename: str, ext: str, bytes: io.BytesIO) -> list[str]:
|