File size: 3,629 Bytes
f5cca33
 
 
 
 
 
 
 
2bbdd94
4ad26d7
f5cca33
 
 
 
 
 
 
 
 
 
 
 
 
4ad26d7
f8f2aba
 
 
 
4ad26d7
d26c63c
d2755b8
 
f8f2aba
 
d26c63c
b4b84fd
d26c63c
 
b4b84fd
d26c63c
f8f2aba
 
d2755b8
d26c63c
f8f2aba
4ad26d7
 
 
f8f2aba
4ad26d7
d26c63c
f8f2aba
 
4ad26d7
2bbdd94
d26c63c
2bbdd94
 
 
 
 
5f68413
2bbdd94
d26c63c
 
5f68413
 
 
4ad26d7
d26c63c
4ad26d7
5f68413
f8f2aba
4ad26d7
d26c63c
4ad26d7
 
d26c63c
d2755b8
4ad26d7
f8f2aba
 
 
4ad26d7
f8f2aba
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import os
import uuid
from fastapi import FastAPI, File, UploadFile, HTTPException, BackgroundTasks
from fastapi.responses import FileResponse, JSONResponse
import pypandoc
import shutil
import logging
import tempfile
import fitz  # PyMuPDF importé ici

# Configure the root logger at DEBUG level; the code in this module logs
# through the root logger (logging.debug / logging.error).
logging.basicConfig(level=logging.DEBUG)

# FastAPI application instance; routes are registered on it with decorators
# such as @app.post.
app = FastAPI()

def delete_temp_files(file_paths: list) -> None:
    """Delete temporary files, typically after the response has been sent.

    Cleanup is best-effort: a path that no longer exists is skipped, and a
    path that cannot be removed is logged without preventing the remaining
    paths from being processed.

    Args:
        file_paths: Paths of the files to remove.
    """
    for file_path in file_paths:
        try:
            # Attempt the removal directly instead of checking existence
            # first, so a file vanishing in between cannot raise.
            os.remove(file_path)
        except FileNotFoundError:
            # Already gone (never created, or removed by someone else).
            continue
        except OSError as exc:
            # Keep going so one stubborn path does not leak the others.
            logging.warning(f"Could not delete temporary file {file_path}: {exc}")
            continue
        logging.debug(f"Temporary file deleted: {file_path}")

@app.post("/convert/")
async def convert_file_to_txt(
    file: UploadFile = File(...),
    background_tasks: BackgroundTasks = BackgroundTasks()
):
    """Convert an uploaded document to plain text and return it as a download.

    PDF uploads are read page by page with PyMuPDF; every other accepted
    extension is converted with Pandoc's 'plain' writer. The upload and the
    converted text are stored in temporary files, which are removed by a
    background task after a successful response, or immediately when the
    conversion fails.

    Args:
        file: The uploaded document. Only the final path component of its
            client-supplied name is used.
        background_tasks: Task queue used to delete the temporary files after
            the response. NOTE(review): the default instance is kept for
            interface compatibility; FastAPI is expected to inject a
            per-request object based on the annotation - confirm.

    Returns:
        A FileResponse serving "<base name>.txt" on success, or a
        JSONResponse of the form {"message": ...} with status 400 for an
        unsupported extension and 500 for a conversion failure.
    """
    temp_files = []           # every temp path created so far
    cleanup_deferred = False  # True once deletion is handed to background_tasks
    try:
        # The file name is client-controlled and may be missing. Keep only the
        # last path component (treating "\" as a separator too) so it can
        # never steer the output path outside the temp directory.
        client_filename = (file.filename or "").replace("\\", "/")
        original_filename = os.path.basename(client_filename)
        base_filename, ext = os.path.splitext(original_filename)
        ext = ext.lower()

        # Allowed extensions for conversion
        allowed_extensions = [
            '.odt', '.pdf', '.docx', '.html', '.htm', '.md', '.txt', '.rtf', '.epub',
            '.tex', '.xml', '.org', '.commonmark', '.cm', '.wiki', '.opml'
        ]

        if ext not in allowed_extensions:
            raise HTTPException(status_code=400, detail=f"Unsupported file extension: {ext}")

        # Store the upload in a temporary file carrying the original extension.
        # Write through the already-open handle; re-opening the path while it
        # is still open is redundant and fails on Windows.
        with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as input_tmp_file:
            input_filename = input_tmp_file.name
            temp_files.append(input_filename)
            shutil.copyfileobj(file.file, input_tmp_file)
        logging.debug(f"Uploaded file: {input_filename}")

        # Output path: same base name plus a unique id, with a .txt extension
        unique_id = uuid.uuid4().hex
        output_filename = os.path.join(tempfile.gettempdir(), f"{base_filename}_{unique_id}.txt")
        # Registered before converting so a partially written file is cleaned up too.
        temp_files.append(output_filename)

        # PDF to text conversion using PyMuPDF
        if ext == '.pdf':
            with fitz.open(input_filename) as doc:
                text = "".join(page.get_text() for page in doc)
            with open(output_filename, "w", encoding="utf-8") as f:
                f.write(text)
            logging.debug(f"PDF conversion successful with PyMuPDF: {output_filename}")

        # Other file formats to text conversion using Pandoc
        else:
            pypandoc.convert_file(input_filename, 'plain', outputfile=output_filename)
            logging.debug(f"Conversion successful: {output_filename}")

        # Check if the .txt file exists
        if not os.path.exists(output_filename):
            logging.error(f"The file {output_filename} was not generated.")
            raise HTTPException(status_code=500, detail="Error during conversion.")

        # Return the converted file to the client, with the same base name and .txt extension
        response = FileResponse(output_filename, filename=f"{base_filename}.txt")

        # The files must outlive this function so the response can stream
        # them; schedule their deletion for after the response is sent.
        background_tasks.add_task(delete_temp_files, [input_filename, output_filename])
        cleanup_deferred = True
        return response

    except HTTPException as http_exc:
        logging.error(f"HTTP error during conversion: {str(http_exc.detail)}")
        return JSONResponse(status_code=http_exc.status_code, content={"message": http_exc.detail})
    except Exception as e:
        # logging.exception records the traceback along with the message.
        logging.exception(f"Error during conversion: {str(e)}")
        return JSONResponse(status_code=500, content={"message": f"Internal error: {str(e)}"})
    finally:
        # On any failure path nothing was scheduled, so remove the temp files
        # now instead of leaking them.
        if not cleanup_deferred:
            try:
                delete_temp_files(temp_files)
            except OSError as cleanup_exc:
                logging.warning(f"Temporary file cleanup failed: {cleanup_exc}")