# Spaces: Sleeping
import os | |
import uuid | |
from fastapi import FastAPI, File, UploadFile, HTTPException, BackgroundTasks | |
from fastapi.responses import FileResponse, JSONResponse | |
import pypandoc | |
import shutil | |
import logging | |
import tempfile | |
import fitz # PyMuPDF importé ici | |
# Configure root logging at import time. DEBUG is used so that the
# logging.debug() messages emitted by the functions in this module are shown.
logging.basicConfig(level=logging.DEBUG)

# The FastAPI application instance for this module.
app = FastAPI()
def delete_temp_files(file_paths: list):
    """Delete temporary files after the response has been sent.

    Cleanup is best-effort: a path that no longer exists is skipped silently,
    and a failure on one path is logged without preventing the remaining
    paths from being processed.

    Args:
        file_paths: Paths of the files to remove.
    """
    for file_path in file_paths:
        try:
            # Remove directly instead of checking existence first, so a file
            # that disappears between the check and the removal cannot raise.
            os.remove(file_path)
        except FileNotFoundError:
            # Already gone: nothing left to clean up for this path.
            continue
        except OSError as exc:
            # E.g. the path is a directory or is not removable; keep going so
            # the other temporary files are still cleaned up.
            logging.warning(f"Could not delete temporary file {file_path}: {exc}")
            continue
        logging.debug(f"Temporary file deleted: {file_path}")
async def convert_file_to_txt(
    file: UploadFile = File(...),
    background_tasks: BackgroundTasks = BackgroundTasks()
):
    """Convert an uploaded document to plain text and return it as a download.

    PDF uploads are read page by page with PyMuPDF; every other allowed
    extension is converted with Pandoc (through pypandoc) to the ``plain``
    output format.

    Args:
        file: The uploaded document. Its file name selects the converter and
            provides the base name of the returned ``.txt`` file.
        background_tasks: Receives the task that deletes the temporary input
            and output files after a successful conversion.
            NOTE(review): the default instance is shared between direct
            calls; under FastAPI a per-request instance is expected to be
            injected from the annotation - confirm.

    Returns:
        A ``FileResponse`` serving ``<base name>.txt`` on success. On failure,
        a ``JSONResponse`` with body ``{"message": ...}``: it carries the
        status of the raised ``HTTPException`` (400 for an unsupported
        extension, 500 when no output file was produced), or 500 for any
        other error.
    """
    # Every temporary file created so far, so a failed request can remove
    # them instead of leaving them behind in the temp directory.
    temp_paths = []
    succeeded = False
    try:
        # The file name is client-controlled: keep only its last path
        # component so it cannot steer the output path outside the temp
        # directory, and tolerate an upload that carries no name at all.
        original_filename = os.path.basename(file.filename or "")
        base_filename, ext = os.path.splitext(original_filename)
        ext = ext.lower()

        # Allowed extensions for conversion
        allowed_extensions = [
            '.odt', '.pdf', '.docx', '.html', '.htm', '.md', '.txt', '.rtf', '.epub',
            '.tex', '.xml', '.org', '.commonmark', '.cm', '.wiki', '.opml'
        ]
        if ext not in allowed_extensions:
            raise HTTPException(status_code=400, detail=f"Unsupported file extension: {ext}")

        # Create a temporary input file with the correct extension and copy
        # the upload through the handle that is already open (reopening the
        # file by name while it is still open is not portable).
        with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as input_tmp_file:
            input_filename = input_tmp_file.name
            temp_paths.append(input_filename)
            shutil.copyfileobj(file.file, input_tmp_file)
        logging.debug(f"Uploaded file: {input_filename}")

        # Output file: same base name plus a unique id, with .txt extension
        unique_id = uuid.uuid4().hex
        output_filename = os.path.join(tempfile.gettempdir(), f"{base_filename}_{unique_id}.txt")
        temp_paths.append(output_filename)

        # NOTE(review): both conversions below are synchronous calls made
        # from inside an async function.
        if ext == '.pdf':
            # PDF to text conversion using PyMuPDF
            with fitz.open(input_filename) as doc:
                text = "".join(page.get_text() for page in doc)
            with open(output_filename, "w", encoding="utf-8") as f:
                f.write(text)
            logging.debug(f"PDF conversion successful with PyMuPDF: {output_filename}")
        else:
            # Other file formats to text conversion using Pandoc
            pypandoc.convert_file(input_filename, 'plain', outputfile=output_filename)
            logging.debug(f"Conversion successful: {output_filename}")

        # Check that the .txt file was actually produced
        if not os.path.exists(output_filename):
            logging.error(f"The file {output_filename} was not generated.")
            raise HTTPException(status_code=500, detail="Error during conversion.")

        # Schedule deletion of the temporary files for after the response
        background_tasks.add_task(delete_temp_files, [input_filename, output_filename])

        # Return the converted file with the same base name and .txt extension
        response = FileResponse(output_filename, filename=f"{base_filename}.txt")
        succeeded = True
        return response
    except HTTPException as http_exc:
        logging.error(f"HTTP error during conversion: {str(http_exc.detail)}")
        return JSONResponse(status_code=http_exc.status_code, content={"message": http_exc.detail})
    except Exception as e:
        # logging.exception keeps the same message and adds the traceback.
        logging.exception(f"Error during conversion: {str(e)}")
        return JSONResponse(status_code=500, content={"message": f"Internal error: {str(e)}"})
    finally:
        if not succeeded:
            # No file will be served, so remove whatever was created now.
            delete_temp_files(temp_paths)