# convertToTXT / main.py
# Bentham's picture
# Update main.py
# 2bbdd94 verified
import os
import uuid
from fastapi import FastAPI, File, UploadFile, HTTPException, BackgroundTasks
from fastapi.responses import FileResponse, JSONResponse
import pypandoc
import shutil
import logging
import tempfile
import fitz  # PyMuPDF is imported here
# Configure the root logger at DEBUG level so the debug messages emitted by
# the functions in this module are actually output.
logging.basicConfig(level=logging.DEBUG)
# Create the FastAPI application object; the route decorator further down
# registers its endpoint on this instance.
app = FastAPI()
def delete_temp_files(file_paths: list):
    """Delete temporary files, typically after the response has been sent.

    Removal is best-effort: a path that no longer exists is skipped silently,
    and any other OS-level failure is logged and does not prevent the
    remaining paths from being processed.

    Args:
        file_paths: Paths of the files to remove.
    """
    for file_path in file_paths:
        try:
            # Attempt the removal directly instead of checking for existence
            # first, so a file that disappears in between cannot raise.
            os.remove(file_path)
        except FileNotFoundError:
            # Already gone: nothing to clean up for this path.
            continue
        except OSError as exc:
            # Keep going so one undeletable path does not leak the others.
            logging.warning(f"Could not delete temporary file {file_path}: {exc}")
            continue
        logging.debug(f"Temporary file deleted: {file_path}")
@app.post("/convert/")
async def convert_file_to_txt(
    file: UploadFile = File(...),
    background_tasks: BackgroundTasks = BackgroundTasks()
):
    """Convert an uploaded document to plain text and return it as a download.

    PDF uploads are read with PyMuPDF (``fitz``); every other allowed
    extension is handed to Pandoc through ``pypandoc`` with the ``plain``
    output format.

    Args:
        file: The uploaded document. Its extension (compared in lower case)
            must be one of the allowed extensions listed in the body.
        background_tasks: Task queue used to delete the temporary files once
            the response has been sent.

    Returns:
        A ``FileResponse`` for the generated ``.txt`` file, offered to the
        client under the upload's base name. On failure a ``JSONResponse`` of
        the form ``{"message": ...}`` is returned instead: status 400 for a
        missing or unsupported extension, status 500 when the conversion
        fails or produces no output.
    """
    # NOTE(review): the default BackgroundTasks() instance is kept only for
    # signature compatibility; the framework is expected to inject a
    # per-request instance for this parameter - confirm.

    # Every temporary path created so far. They are removed right away if the
    # request fails, or by a background task after a successful response.
    temp_paths = []
    cleanup_scheduled = False
    try:
        # The file name is client-controlled and may be missing. Keep only its
        # last path component (treating backslashes as separators too) so it
        # can never carry directory parts.
        raw_name = (file.filename or "").replace("\\", "/")
        original_filename = os.path.basename(raw_name)
        base_filename, ext = os.path.splitext(original_filename)
        ext = ext.lower()

        # Allowed extensions for conversion
        allowed_extensions = (
            '.odt', '.pdf', '.docx', '.html', '.htm', '.md', '.txt', '.rtf', '.epub',
            '.tex', '.xml', '.org', '.commonmark', '.cm', '.wiki', '.opml',
        )
        if ext not in allowed_extensions:
            raise HTTPException(status_code=400, detail=f"Unsupported file extension: {ext}")

        # Store the upload in a temporary file that keeps the original
        # extension; write through the already-open handle rather than
        # reopening the file by name.
        with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as input_tmp_file:
            input_filename = input_tmp_file.name
            temp_paths.append(input_filename)
            shutil.copyfileobj(file.file, input_tmp_file)
        logging.debug(f"Uploaded file: {input_filename}")

        # The on-disk output name is built from a random id only; the
        # client-supplied name is used solely for the download file name.
        unique_id = uuid.uuid4().hex
        output_filename = os.path.join(tempfile.gettempdir(), f"convert_{unique_id}.txt")
        temp_paths.append(output_filename)

        if ext == '.pdf':
            # PDF to text conversion using PyMuPDF
            with fitz.open(input_filename) as doc:
                text = "".join(page.get_text() for page in doc)
            with open(output_filename, "w", encoding="utf-8") as f:
                f.write(text)
            logging.debug(f"PDF conversion successful with PyMuPDF: {output_filename}")
        else:
            # Other file formats to text conversion using Pandoc.
            # NOTE(review): no input format is passed, so pypandoc has to
            # derive it from the file extension - confirm that every allowed
            # extension is one it recognises.
            pypandoc.convert_file(input_filename, 'plain', outputfile=output_filename)
            logging.debug(f"Conversion successful: {output_filename}")

        # Check that the .txt file was actually produced
        if not os.path.exists(output_filename):
            logging.error(f"The file {output_filename} was not generated.")
            raise HTTPException(status_code=500, detail="Error during conversion.")

        # Delete the temporary files only after the response has been sent,
        # because the response reads the output file from disk.
        background_tasks.add_task(delete_temp_files, [input_filename, output_filename])
        cleanup_scheduled = True

        # Return the converted file to the client, with the same base name
        # and a .txt extension
        return FileResponse(output_filename, filename=f"{base_filename}.txt")
    except HTTPException as http_exc:
        logging.error(f"HTTP error during conversion: {str(http_exc.detail)}")
        return JSONResponse(status_code=http_exc.status_code, content={"message": http_exc.detail})
    except Exception as e:
        # Top-level boundary: report the failure to the client as JSON.
        logging.exception(f"Error during conversion: {str(e)}")
        return JSONResponse(status_code=500, content={"message": f"Internal error: {str(e)}"})
    finally:
        # On any failure path nothing will be served, so remove whatever was
        # written instead of leaving it behind in the temp directory.
        if not cleanup_scheduled:
            delete_temp_files(temp_paths)