convert_test

Sleeping

App Files Community

Bentham committed on Dec 20, 2024

Commit

c262799

verified ·

1 Parent(s): 4accb2f

Update main.py

Browse files

Files changed (1) hide show

main.py +26 -9

main.py CHANGED Viewed

@@ -60,8 +60,7 @@ FORMAT_MAP = {
     '.commonmark': 'commonmark',
     '.cm': 'commonmark',
     '.wiki': 'mediawiki',
-    '.opml': 'opml',
-    '.ppt': 'pptx'
 }
 ALLOWED_EXTENSIONS_FOR_ACCESSIBILITY = list(FORMAT_MAP.keys()) + ['.doc', '.ppt', '.pptx']
@@ -296,8 +295,6 @@ def pdf_to_html(input_filename: str) -> str:
     return str(soup)
 def convert_with_pandoc(input_filename: str, input_format: str) -> str:
-    # On force l'utilisation de --self-contained pour les docx, afin d'inclure les images.
-    # Pour les autres formats, on garde la logique de fallback.
     if input_format == 'docx':
         try:
             output = pypandoc.convert_file(
@@ -309,11 +306,33 @@ def convert_with_pandoc(input_filename: str, input_format: str) -> str:
             )
             return output
         except RuntimeError as e:
-            # Si on ne peut pas faire --self-contained pour docx, on lève une erreur car sinon pas d'images.
             logging.error(f"Pandoc a rencontré une erreur avec --self-contained sur un docx : {str(e)}")
             raise RuntimeError("Impossible de convertir le docx avec --self-contained. Les images ne peuvent pas être traitées.")
     else:
-        # Comportement inchangé pour les autres formats
         try:
             output = pypandoc.convert_file(
                 input_filename,
@@ -334,7 +353,6 @@ def convert_with_pandoc(input_filename: str, input_format: str) -> str:
             )
             return output
 def text_to_html(text: str) -> str:
     lines = text.split('\n')
     html_lines = ['<p>' + line.strip() + '</p>' for line in lines if line.strip()]
@@ -786,9 +804,8 @@ async def convert_file_to_txt(
         elif ext == '.pptx':
             html_content = convert_pptx_to_html(input_filename)
         elif ext == '.ppt':
-            input_format = get_pandoc_format(ext)
             try:
-                html_content = convert_with_pandoc(input_filename, input_format)
             except Exception as e:
                 logging.error(f"Erreur lors de la conversion de .ppt avec pypandoc: {e}")
                 raise HTTPException(status_code=500, detail=f"Erreur lors de la conversion du fichier .ppt: {e}")

     '.commonmark': 'commonmark',
     '.cm': 'commonmark',
     '.wiki': 'mediawiki',
+    '.opml': 'opml'
 }
 ALLOWED_EXTENSIONS_FOR_ACCESSIBILITY = list(FORMAT_MAP.keys()) + ['.doc', '.ppt', '.pptx']
     return str(soup)
 def convert_with_pandoc(input_filename: str, input_format: str) -> str:
     if input_format == 'docx':
         try:
             output = pypandoc.convert_file(
             )
             return output
         except RuntimeError as e:
             logging.error(f"Pandoc a rencontré une erreur avec --self-contained sur un docx : {str(e)}")
             raise RuntimeError("Impossible de convertir le docx avec --self-contained. Les images ne peuvent pas être traitées.")
+    elif os.path.splitext(input_filename)[1].lower() == '.ppt':
+        try:
+            output = pypandoc.convert_file(
+                input_filename,
+                'html',
+                format='auto',
+                outputfile=None,
+                extra_args=['--strip-comments', '--quiet']
+            )
+            return output
+        except RuntimeError as e:
+            logging.error(f"Pandoc a rencontré une erreur avec le format 'auto' sur un ppt : {str(e)}, tentative avec 'ppt'.")
+            try:
+                output = pypandoc.convert_file(
+                    input_filename,
+                    'html',
+                    format='ppt',
+                    outputfile=None,
+                    extra_args=['--strip-comments', '--quiet']
+                )
+                return output
+            except RuntimeError as e:
+                logging.error(f"Pandoc a rencontré une erreur avec le format 'ppt' sur un ppt : {str(e)}.")
+                raise
     else:
         try:
             output = pypandoc.convert_file(
                 input_filename,
             )
             return output
 def text_to_html(text: str) -> str:
     lines = text.split('\n')
     html_lines = ['<p>' + line.strip() + '</p>' for line in lines if line.strip()]
         elif ext == '.pptx':
             html_content = convert_pptx_to_html(input_filename)
         elif ext == '.ppt':
             try:
+                html_content = convert_with_pandoc(input_filename, get_pandoc_format(ext))
             except Exception as e:
                 logging.error(f"Erreur lors de la conversion de .ppt avec pypandoc: {e}")
                 raise HTTPException(status_code=500, detail=f"Erreur lors de la conversion du fichier .ppt: {e}")