Spaces:

Bentham
/

convertToTXT

Sleeping

App Files Files Community

convertToTXT / main.py

Bentham

Update main.py

2bbdd94 verified 4 months ago

raw

history blame contribute delete

3.63 kB

	import os
	import uuid
	from fastapi import FastAPI, File, UploadFile, HTTPException, BackgroundTasks
	from fastapi.responses import FileResponse, JSONResponse
	import pypandoc
	import shutil
	import logging
	import tempfile
	import fitz  # PyMuPDF

	# Configure the root logger at DEBUG level so the logging.debug() calls
	# made by the handlers in this module are emitted.
	logging.basicConfig(level=logging.DEBUG)

	# FastAPI application instance; the /convert/ route is registered on it
	# through the @app.post decorator.
	app = FastAPI()

	def delete_temp_files(file_paths: list):
	    """Delete temporary files, typically after the response has been sent.

	    Removal is best effort: a path that no longer exists is skipped silently,
	    and any other OS-level failure (permissions, path is a directory, ...) is
	    logged and does not prevent the remaining paths from being processed.

	    Args:
	        file_paths: Paths of the files to remove.
	    """
	    for file_path in file_paths:
	        try:
	            # Attempt the removal directly instead of checking existence
	            # first; the file could vanish between the check and the call.
	            os.remove(file_path)
	        except FileNotFoundError:
	            # Already gone (or never created): nothing to clean up.
	            continue
	        except OSError as exc:
	            # Keep going so one stubborn path does not leak all the others.
	            logging.warning(f"Could not delete temporary file {file_path}: {exc}")
	            continue
	        logging.debug(f"Temporary file deleted: {file_path}")

	@app.post("/convert/")
	async def convert_file_to_txt(
	    file: UploadFile = File(...),
	    background_tasks: BackgroundTasks = BackgroundTasks()
	):
	    """Convert an uploaded document to plain text and return it as a download.

	    PDF uploads are read with PyMuPDF; every other allowed extension is
	    converted by pandoc (through pypandoc) using the ``plain`` output format.

	    Args:
	        file: The uploaded document. Its extension selects the converter and
	            its base name is reused for the downloaded ``.txt`` file.
	        background_tasks: Task queue used to delete the temporary files after
	            a successful response.

	    Returns:
	        A ``FileResponse`` with the converted text on success, otherwise a
	        ``JSONResponse`` of the form ``{"message": ...}`` with status 400 for
	        an unsupported extension or 500 when the conversion fails.
	    """
	    # Every temporary path created so far, so that the error branches can
	    # remove them too instead of leaving them behind in the temp directory.
	    temp_paths = []
	    try:
	        # The file name is client-controlled: keep only its last component so
	        # directory parts ("../x", "/etc/x", "C:\\dir\\x") never reach the
	        # download name. A missing name is treated as empty and rejected below.
	        original_filename = os.path.basename((file.filename or "").replace("\\", "/"))
	        base_filename, ext = os.path.splitext(original_filename)
	        ext = ext.lower()

	        # Allowed extensions for conversion
	        allowed_extensions = {
	            '.odt', '.pdf', '.docx', '.html', '.htm', '.md', '.txt', '.rtf', '.epub',
	            '.tex', '.xml', '.org', '.commonmark', '.cm', '.wiki', '.opml',
	        }

	        if ext not in allowed_extensions:
	            raise HTTPException(status_code=400, detail=f"Unsupported file extension: {ext}")

	        # Store the upload in a temporary file that keeps the extension
	        # (presumably pypandoc infers the input format from it -- confirm).
	        # Write through the handle we already hold rather than reopening the
	        # path while it is still open.
	        with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as input_tmp_file:
	            input_filename = input_tmp_file.name
	            temp_paths.append(input_filename)
	            shutil.copyfileobj(file.file, input_tmp_file)
	        logging.debug(f"Uploaded file: {input_filename}")

	        # The on-disk output name is built from a UUID only; the client's
	        # base name is used solely for the download name further down, so it
	        # can never steer where the server writes.
	        output_filename = os.path.join(tempfile.gettempdir(), f"{uuid.uuid4().hex}.txt")
	        temp_paths.append(output_filename)

	        if ext == '.pdf':
	            # PDF to text conversion using PyMuPDF
	            with fitz.open(input_filename) as doc:
	                text = "".join(page.get_text() for page in doc)
	            with open(output_filename, "w", encoding="utf-8") as f:
	                f.write(text)
	            logging.debug(f"PDF conversion successful with PyMuPDF: {output_filename}")
	        else:
	            # Other file formats to text conversion using Pandoc
	            pypandoc.convert_file(input_filename, 'plain', outputfile=output_filename)
	            logging.debug(f"Conversion successful: {output_filename}")

	        # Check that the converter actually produced the .txt file
	        if not os.path.exists(output_filename):
	            logging.error(f"The file {output_filename} was not generated.")
	            raise HTTPException(status_code=500, detail="Error during conversion.")

	        # Delete the temporary files only after the response has been sent,
	        # because FileResponse still needs the output file.
	        background_tasks.add_task(delete_temp_files, temp_paths)

	        # Return the converted file to the client, with the same base name and .txt extension
	        return FileResponse(output_filename, filename=f"{base_filename}.txt")

	    except HTTPException as http_exc:
	        # No file is served on this path, so clean up immediately.
	        delete_temp_files(temp_paths)
	        logging.error(f"HTTP error during conversion: {str(http_exc.detail)}")
	        return JSONResponse(status_code=http_exc.status_code, content={"message": http_exc.detail})
	    except Exception as e:
	        delete_temp_files(temp_paths)
	        # logging.exception keeps the traceback alongside the message.
	        logging.exception(f"Error during conversion: {str(e)}")
	        return JSONResponse(status_code=500, content={"message": f"Internal error: {str(e)}"})