Spaces:
Sleeping
Sleeping
File size: 3,629 Bytes
f5cca33 2bbdd94 4ad26d7 f5cca33 4ad26d7 f8f2aba 4ad26d7 d26c63c d2755b8 f8f2aba d26c63c b4b84fd d26c63c b4b84fd d26c63c f8f2aba d2755b8 d26c63c f8f2aba 4ad26d7 f8f2aba 4ad26d7 d26c63c f8f2aba 4ad26d7 2bbdd94 d26c63c 2bbdd94 5f68413 2bbdd94 d26c63c 5f68413 4ad26d7 d26c63c 4ad26d7 5f68413 f8f2aba 4ad26d7 d26c63c 4ad26d7 d26c63c d2755b8 4ad26d7 f8f2aba 4ad26d7 f8f2aba |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 |
import os
import uuid
from fastapi import FastAPI, File, UploadFile, HTTPException, BackgroundTasks
from fastapi.responses import FileResponse, JSONResponse
import pypandoc
import shutil
import logging
import tempfile
import fitz # PyMuPDF importé ici
# Configure root logging at DEBUG level so the debug messages emitted by the handlers below are shown
logging.basicConfig(level=logging.DEBUG)
# Create the FastAPI application on which the route decorators below register their endpoints
app = FastAPI()
def delete_temp_files(file_paths: list) -> None:
    """Delete temporary files, typically after the response has been sent.

    Removal is best-effort: a path that no longer exists is skipped, and a
    path that cannot be removed is logged without preventing the remaining
    paths from being processed.

    Args:
        file_paths: Paths of the files to remove.
    """
    for file_path in file_paths:
        # Attempt the removal directly instead of checking for existence
        # first: the file may disappear between the check and the removal.
        try:
            os.remove(file_path)
        except FileNotFoundError:
            # Already gone; nothing to clean up for this path.
            continue
        except OSError as exc:
            # Keep going so one stubborn path does not leak all the others.
            logging.warning(f"Could not delete temporary file {file_path}: {exc}")
            continue
        logging.debug(f"Temporary file deleted: {file_path}")
@app.post("/convert/")
async def convert_file_to_txt(
    file: UploadFile = File(...),
    background_tasks: BackgroundTasks = BackgroundTasks()
):
    """Convert an uploaded document to plain text and return it as a download.

    PDF uploads are read page by page with PyMuPDF; every other allowed
    extension is converted with Pandoc's ``plain`` output format. The upload
    and the converted text are stored in temporary files that are removed
    after the response on success, or immediately when the request fails.

    Args:
        file: Uploaded document; its file extension selects the conversion path.
        background_tasks: Task queue used to delete the temporary files once
            the response has been sent.

    Returns:
        A ``FileResponse`` carrying the ``.txt`` result on success; otherwise a
        ``JSONResponse`` with a ``message`` field and status 400 (unsupported
        extension) or 500 (conversion failure).
    """
    # Allowed extensions for conversion
    allowed_extensions = {
        '.odt', '.pdf', '.docx', '.html', '.htm', '.md', '.txt', '.rtf', '.epub',
        '.tex', '.xml', '.org', '.commonmark', '.cm', '.wiki', '.opml',
    }
    # Every temporary path created so far, so failures can clean up after themselves
    temp_paths = []
    try:
        # The file name is client-controlled: keep only its last component so
        # it cannot steer the output path outside the temporary directory
        # (e.g. "../../x.md" or an absolute path). A missing name is treated
        # as an empty one and rejected below as an unsupported extension.
        original_filename = os.path.basename((file.filename or "").replace("\\", "/"))
        base_filename, ext = os.path.splitext(original_filename)
        ext = ext.lower()
        if ext not in allowed_extensions:
            raise HTTPException(status_code=400, detail=f"Unsupported file extension: {ext}")

        # Create a temporary input file with the correct extension and write
        # the upload straight into the open handle
        with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as input_tmp_file:
            input_filename = input_tmp_file.name
            temp_paths.append(input_filename)
            shutil.copyfileobj(file.file, input_tmp_file)
        logging.debug(f"Uploaded file: {input_filename}")

        # Define the output file name, keeping the same base name but with .txt extension
        unique_id = uuid.uuid4().hex
        output_filename = os.path.join(tempfile.gettempdir(), f"{base_filename}_{unique_id}.txt")
        temp_paths.append(output_filename)

        if ext == '.pdf':
            # PDF to text conversion using PyMuPDF
            with fitz.open(input_filename) as doc:
                text = "".join(page.get_text() for page in doc)
            with open(output_filename, "w", encoding="utf-8") as f:
                f.write(text)
            logging.debug(f"PDF conversion successful with PyMuPDF: {output_filename}")
        else:
            # Other file formats to text conversion using Pandoc
            pypandoc.convert_file(input_filename, 'plain', outputfile=output_filename)
            logging.debug(f"Conversion successful: {output_filename}")

        # Check if the .txt file exists
        if not os.path.exists(output_filename):
            logging.error(f"The file {output_filename} was not generated.")
            raise HTTPException(status_code=500, detail="Error during conversion.")

        # Add temporary files to background task for deletion after sending the response
        background_tasks.add_task(delete_temp_files, temp_paths)
        # Return the converted file to the client, with the same base name and .txt extension
        return FileResponse(output_filename, filename=f"{base_filename}.txt")
    except HTTPException as http_exc:
        # No file will be streamed, so remove whatever was created right away
        delete_temp_files(temp_paths)
        logging.error(f"HTTP error during conversion: {str(http_exc.detail)}")
        return JSONResponse(status_code=http_exc.status_code, content={"message": http_exc.detail})
    except Exception as e:
        # Top-level boundary: clean up, log with traceback, report a 500
        delete_temp_files(temp_paths)
        logging.exception(f"Error during conversion: {str(e)}")
        return JSONResponse(status_code=500, content={"message": f"Internal error: {str(e)}"})
|