Lucas ARRIESSE committed on
Commit
256eefa
·
1 Parent(s): 4edd44f

Hopefully fix task hang issues

Browse files
Files changed (1) hide show
  1. api/docs.py +42 -43
api/docs.py CHANGED
@@ -41,7 +41,7 @@ KREUZBERG_CONFIG: ExtractionConfig = ExtractionConfig(
41
  force_ocr=False, ocr_backend=None, extract_tables=True)
42
 
43
  # Unfortunately needs to be kept to 1, as libreoffice isn't built to support parallel instances
44
- LO_CONVERSION_MUTEX = asyncio.Semaphore(1)
45
 
46
  # Supported file types for text extraction and their MIME type
47
  FORMAT_MIME_TYPES = {
@@ -65,54 +65,53 @@ async def convert_file_type(contents: io.BytesIO, filename: str, input_ext: str,
65
  filter: The conversion filter to use.
66
  """
67
 
68
- await LO_CONVERSION_MUTEX.acquire()
69
-
70
- with tempfile.TemporaryDirectory() as tmpdir:
71
- dir_path = Path(tmpdir)
72
- input_file_path = dir_path / f"{filename}.{input_ext}"
73
- output_file_path = dir_path / f"{filename}.{output_ext}"
74
-
75
- # write the memory contents to the input file
76
- with open(input_file_path, "wb") as in_file:
77
- in_file.write(contents.read())
78
-
79
- out_bytes = io.BytesIO()
80
-
81
- # construct the command
82
- command = [
83
- "libreoffice",
84
- "--headless",
85
- "--convert-to", f"{output_ext}:{filter}" if filter else output_ext,
86
- "--outdir", tmpdir,
87
- str(input_file_path) # Ensure path is a string for subprocess
88
- ]
89
-
90
- # convert using libreoffice asynchronously
91
- process = await asyncio.create_subprocess_exec(
92
- *command,
93
- stdout=asyncio.subprocess.PIPE,
94
- stderr=asyncio.subprocess.PIPE
95
- )
96
 
97
- stdout, stderr = await process.communicate()
98
 
99
- exit_code = await process.wait()
100
 
101
- if exit_code != 0 and not output_file_path.exists():
102
- raise subprocess.CalledProcessError(
103
- exit_code,
104
- command,
105
- output=stdout,
106
- stderr=stderr
107
- )
108
 
109
- LO_CONVERSION_MUTEX.release()
110
 
111
- with open(output_file_path, mode="rb") as out:
112
- out_bytes.write(out.read())
113
 
114
- out_bytes.seek(0)
115
- return out_bytes
116
 
117
 
118
  async def extract_text_contents(filename: str, ext: str, bytes: io.BytesIO) -> list[str]:
 
41
  force_ocr=False, ocr_backend=None, extract_tables=True)
42
 
43
  # Unfortunately needs to be kept to 1, as libreoffice isn't built to support parallel instances
44
+ LO_CONVERSION_MUTEX = asyncio.Lock()
45
 
46
  # Supported file types for text extraction and their MIME type
47
  FORMAT_MIME_TYPES = {
 
65
  filter: The conversion filter to use.
66
  """
67
 
68
+ async with LO_CONVERSION_MUTEX:
69
+ with tempfile.TemporaryDirectory() as tmpdir:
70
+ dir_path = Path(tmpdir)
71
+ input_file_path = dir_path / f"{filename}.{input_ext}"
72
+ output_file_path = dir_path / f"{filename}.{output_ext}"
73
+
74
+ # write the memory contents to the input file
75
+ with open(input_file_path, "wb") as in_file:
76
+ in_file.write(contents.read())
77
+
78
+ out_bytes = io.BytesIO()
79
+
80
+ # construct the command
81
+ command = [
82
+ "libreoffice",
83
+ "--headless",
84
+ "--convert-to", f"{output_ext}:{filter}" if filter else output_ext,
85
+ "--outdir", tmpdir,
86
+ str(input_file_path) # Ensure path is a string for subprocess
87
+ ]
88
+
89
+ # convert using libreoffice asynchronously
90
+ process = await asyncio.create_subprocess_exec(
91
+ *command,
92
+ stdout=asyncio.subprocess.PIPE,
93
+ stderr=asyncio.subprocess.PIPE
94
+ )
 
95
 
96
+ stdout, stderr = await process.communicate()
97
 
98
+ exit_code = await process.wait()
99
 
100
+ if exit_code != 0 and not output_file_path.exists():
101
+ raise subprocess.CalledProcessError(
102
+ exit_code,
103
+ command,
104
+ output=stdout,
105
+ stderr=stderr
106
+ )
107
 
108
+ # LO_CONVERSION_MUTEX.release()
109
 
110
+ with open(output_file_path, mode="rb") as out:
111
+ out_bytes.write(out.read())
112
 
113
+ out_bytes.seek(0)
114
+ return out_bytes
115
 
116
 
117
  async def extract_text_contents(filename: str, ext: str, bytes: io.BytesIO) -> list[str]: