Spaces:
Sleeping
Sleeping
test API asynchrones
Browse files
main.py
CHANGED
|
@@ -6,12 +6,12 @@ import base64
|
|
| 6 |
import re
|
| 7 |
import threading
|
| 8 |
import time
|
| 9 |
-
from typing import List, Dict
|
| 10 |
import logging
|
| 11 |
import tempfile
|
| 12 |
import shutil
|
| 13 |
import json
|
| 14 |
-
import
|
| 15 |
|
| 16 |
from openai import OpenAI
|
| 17 |
|
|
@@ -21,23 +21,22 @@ import pypandoc
|
|
| 21 |
import fitz # PyMuPDF
|
| 22 |
from bs4 import BeautifulSoup, Comment
|
| 23 |
|
| 24 |
-
#
|
| 25 |
logging.basicConfig(level=logging.DEBUG)
|
| 26 |
|
| 27 |
-
#
|
| 28 |
app = FastAPI()
|
| 29 |
|
| 30 |
client = OpenAI()
|
| 31 |
|
| 32 |
-
|
| 33 |
-
# Dossier de base pour les jobs
|
| 34 |
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
|
| 35 |
JOBS_DIR = os.path.join(tempfile.gettempdir(), 'jobs')
|
| 36 |
|
| 37 |
if not os.path.exists(JOBS_DIR):
|
| 38 |
os.makedirs(JOBS_DIR)
|
| 39 |
|
| 40 |
-
# Map
|
| 41 |
FORMAT_MAP = {
|
| 42 |
'.odt': 'odt',
|
| 43 |
'.pdf': 'pdf',
|
|
@@ -57,8 +56,8 @@ FORMAT_MAP = {
|
|
| 57 |
}
|
| 58 |
|
| 59 |
def get_pandoc_format(extension: str) -> str:
|
| 60 |
-
"""
|
| 61 |
-
return FORMAT_MAP.get(extension, 'auto') # 'auto'
|
| 62 |
|
| 63 |
def update_job_status(job_id: str, status: str, message: str = '', result_file: str = None):
|
| 64 |
job_dir = os.path.join(JOBS_DIR, job_id)
|
|
@@ -85,41 +84,41 @@ def get_job_status(job_id: str):
|
|
| 85 |
def process_file(job_id: str, input_file_path: str, ext: str, original_filename: str):
|
| 86 |
job_dir = os.path.join(JOBS_DIR, job_id)
|
| 87 |
try:
|
| 88 |
-
#
|
| 89 |
update_job_status(job_id, 'processing', 'Le fichier est en cours de traitement')
|
| 90 |
|
| 91 |
-
#
|
| 92 |
image_counter = [1]
|
| 93 |
images_data = {}
|
| 94 |
|
| 95 |
-
#
|
| 96 |
base_filename = os.path.splitext(original_filename)[0]
|
| 97 |
output_filename = os.path.join(job_dir, f"{base_filename}.html")
|
| 98 |
|
| 99 |
-
#
|
| 100 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
|
| 102 |
-
#
|
| 103 |
-
if not final_html:
|
| 104 |
-
update_job_status(job_id, 'error', 'Erreur lors de la conversion.')
|
| 105 |
-
return
|
| 106 |
-
|
| 107 |
-
# Écrire le HTML final dans le fichier de sortie
|
| 108 |
with open(output_filename, 'w', encoding='utf-8') as f:
|
| 109 |
f.write(final_html)
|
| 110 |
|
| 111 |
-
#
|
| 112 |
update_job_status(job_id, 'completed', 'Traitement terminé', result_file=f"{base_filename}.html")
|
| 113 |
|
| 114 |
-
#
|
| 115 |
-
delete_files_after_delay([input_file_path], delay=300) # 300
|
| 116 |
|
| 117 |
except Exception as e:
|
| 118 |
logging.error(f"Erreur lors du traitement du job {job_id}: {str(e)}")
|
| 119 |
update_job_status(job_id, 'error', f"Erreur: {str(e)}")
|
| 120 |
|
| 121 |
def delete_files_after_delay(file_paths: List[str], delay: int = 1200):
|
| 122 |
-
"""
|
| 123 |
def delayed_delete():
|
| 124 |
time.sleep(delay)
|
| 125 |
for file_path in file_paths:
|
|
@@ -130,21 +129,21 @@ def delete_files_after_delay(file_paths: List[str], delay: int = 1200):
|
|
| 130 |
except Exception as e:
|
| 131 |
logging.error(f"Erreur lors de la suppression du fichier {file_path} : {str(e)}")
|
| 132 |
|
| 133 |
-
#
|
| 134 |
thread = threading.Thread(target=delayed_delete)
|
| 135 |
thread.start()
|
| 136 |
|
| 137 |
-
def convert_to_accessible_html(input_filename, ext, base_filename, image_counter, images_data):
|
| 138 |
try:
|
| 139 |
-
# Conversion PDF
|
| 140 |
if ext == '.pdf':
|
| 141 |
# Initialize BeautifulSoup with basic HTML structure
|
| 142 |
soup = BeautifulSoup("<html><head></head><body></body></html>", 'html.parser')
|
| 143 |
body = soup.body
|
| 144 |
-
page_number = 1 #
|
| 145 |
with fitz.open(input_filename) as doc:
|
| 146 |
for page in doc:
|
| 147 |
-
#
|
| 148 |
page_comment = f"<!--PAGE_{page_number}-->"
|
| 149 |
body.append(BeautifulSoup(page_comment, 'html.parser'))
|
| 150 |
|
|
@@ -156,14 +155,23 @@ def convert_to_accessible_html(input_filename, ext, base_filename, image_counter
|
|
| 156 |
html_content = str(soup)
|
| 157 |
logging.debug(f"Voici le contenu du PDF brut avec commentaires de page : {html_content}")
|
| 158 |
|
| 159 |
-
#
|
| 160 |
-
cleaned_html = clean_html_content(html_content, image_counter, images_data)
|
| 161 |
-
#
|
| 162 |
-
|
| 163 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 164 |
final_html = reinsert_images(rewritten_html, images_data)
|
| 165 |
|
| 166 |
-
#
|
| 167 |
final_soup = BeautifulSoup(final_html, 'html.parser')
|
| 168 |
scripts_to_remove = final_soup.find_all('script', src=True)
|
| 169 |
for script in scripts_to_remove:
|
|
@@ -173,27 +181,27 @@ def convert_to_accessible_html(input_filename, ext, base_filename, image_counter
|
|
| 173 |
script.decompose()
|
| 174 |
final_html = str(final_soup)
|
| 175 |
|
| 176 |
-
#
|
| 177 |
final_html = re.sub(r'^\s*```(?:html)?\s*$', '', final_html, flags=re.MULTILINE)
|
| 178 |
|
| 179 |
logging.debug(f"Conversion PDF en HTML accessible réussie.")
|
| 180 |
return final_html
|
| 181 |
|
| 182 |
-
# Conversion
|
| 183 |
else:
|
| 184 |
input_format = get_pandoc_format(ext)
|
| 185 |
|
| 186 |
try:
|
| 187 |
-
#
|
| 188 |
output = pypandoc.convert_file(
|
| 189 |
input_filename,
|
| 190 |
'html',
|
| 191 |
-
format=input_format, #
|
| 192 |
outputfile=None,
|
| 193 |
extra_args=[
|
| 194 |
-
'--self-contained', #
|
| 195 |
-
'--strip-comments', #
|
| 196 |
-
'--quiet' #
|
| 197 |
]
|
| 198 |
)
|
| 199 |
html_content = output
|
|
@@ -202,7 +210,7 @@ def convert_to_accessible_html(input_filename, ext, base_filename, image_counter
|
|
| 202 |
logging.error(f"Pandoc a rencontré une erreur : {str(e)}")
|
| 203 |
logging.info("Tentative de conversion sans l'option --self-contained.")
|
| 204 |
|
| 205 |
-
#
|
| 206 |
output = pypandoc.convert_file(
|
| 207 |
input_filename,
|
| 208 |
'html',
|
|
@@ -216,16 +224,23 @@ def convert_to_accessible_html(input_filename, ext, base_filename, image_counter
|
|
| 216 |
html_content = output
|
| 217 |
logging.debug(f"Conversion en HTML réussie avec Pandoc sans --self-contained.")
|
| 218 |
|
| 219 |
-
#
|
| 220 |
-
cleaned_html = clean_html_content(html_content, image_counter, images_data)
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 226 |
final_html = reinsert_images(rewritten_html, images_data)
|
| 227 |
|
| 228 |
-
#
|
| 229 |
final_soup = BeautifulSoup(final_html, 'html.parser')
|
| 230 |
scripts_to_remove = final_soup.find_all('script', src=True)
|
| 231 |
for script in scripts_to_remove:
|
|
@@ -235,7 +250,7 @@ def convert_to_accessible_html(input_filename, ext, base_filename, image_counter
|
|
| 235 |
script.decompose()
|
| 236 |
final_html = str(final_soup)
|
| 237 |
|
| 238 |
-
#
|
| 239 |
final_html = re.sub(r'^\s*```(?:html)?\s*$', '', final_html, flags=re.MULTILINE)
|
| 240 |
|
| 241 |
logging.debug(f"Conversion en HTML accessible réussie avec Pandoc.")
|
|
@@ -246,10 +261,10 @@ def convert_to_accessible_html(input_filename, ext, base_filename, image_counter
|
|
| 246 |
return None
|
| 247 |
|
| 248 |
def encode_image_from_data_uri(data_uri: str) -> str:
|
| 249 |
-
"""
|
| 250 |
try:
|
| 251 |
header, encoded = data_uri.split(',', 1)
|
| 252 |
-
#
|
| 253 |
encoded = ''.join(encoded.split())
|
| 254 |
return encoded
|
| 255 |
except Exception as e:
|
|
@@ -257,27 +272,27 @@ def encode_image_from_data_uri(data_uri: str) -> str:
|
|
| 257 |
return ""
|
| 258 |
|
| 259 |
def markdown_to_html(markdown_text: str) -> str:
|
| 260 |
-
"""
|
| 261 |
-
#
|
| 262 |
html = markdown_text
|
| 263 |
-
html = re.sub(r'\*\*(.*?)\*\*', r'<strong>\1</strong>', html) #
|
| 264 |
-
html = re.sub(r'\*(.*?)\*', r'<i>\1</i>', html) #
|
| 265 |
-
html = re.sub(r'__(.*?)__', r'<strong>\1</strong>', html) #
|
| 266 |
-
html = re.sub(r'_(.*?)_', r'<i>\1</i>', html) #
|
| 267 |
return html
|
| 268 |
|
| 269 |
-
def get_image_description(base64_image: str) -> str:
|
| 270 |
-
"""
|
| 271 |
try:
|
| 272 |
-
response = client.chat.completions.
|
| 273 |
-
model="gpt-4o-mini",
|
| 274 |
messages=[
|
| 275 |
{
|
| 276 |
"role": "user",
|
| 277 |
"content": [
|
| 278 |
{
|
| 279 |
"type": "text",
|
| 280 |
-
"text": "Décris ce que l'on peut voir sur cette image, pour qu'un lecteur malvoyant puisse comprendre ce qu'elle représente.",
|
| 281 |
},
|
| 282 |
{
|
| 283 |
"type": "image_url",
|
|
@@ -289,64 +304,18 @@ def get_image_description(base64_image: str) -> str:
|
|
| 289 |
}
|
| 290 |
],
|
| 291 |
)
|
| 292 |
-
#
|
| 293 |
description = response.choices[0].message.content.strip()
|
| 294 |
return description
|
| 295 |
except Exception as e:
|
| 296 |
logging.error(f"Erreur lors de l'appel à l'API OpenAI : {str(e)}")
|
| 297 |
return "Description indisponible."
|
| 298 |
|
| 299 |
-
def rewrite_html_accessible(html_content: str) -> str:
|
| 300 |
-
"""
|
| 301 |
prompt = (
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
"Ton intervention doit se faire exclusivement sur la **forme** du document : le contenu doit être **intégralement préservé dans le même ordre**, jusqu'à la fin. "
|
| 305 |
-
"L'ensemble du CSS correspondant aux classes devra être explicité dans une balise <style> dans le <head>. Toutes les instructions de style doivent être incluses dans le head, on ne doit avoir aucun <style src=X> qui irait chercher une source extérieure.\n"
|
| 306 |
-
"IMPORTANT : Tu dois **respecter scrupuleusement l'ordre indiqué par les commentaires HTML de la forme <!--PAGE_X-->,** s'ils existent. On doit avoir <!--PAGE_1--> [...] <!--PAGE_2--> [...] <!--PAGE_3--> [...], et ainsi de suite, dans l'ordre exact et sans en oublier un seul. C'est très important ! Ces marqueurs te permettent de t'assurer que la page est bien retranscrite dans le bon ordre. Ne déplace, ne supprime, et ne modifie pas ces commentaires.\n"
|
| 307 |
-
"Attention, ce document est peut-être issu d'un PDF ou d'un DOCX. Il faut donc être attentif :\n"
|
| 308 |
-
"- Aux balises <p> qui suivent immédiatement les marqueurs <!--PAGE_X--> : il peut s'agir de headers. Pour le savoir, il faut les comparer entre eux pour savoir s'ils sont à peu près similaires.\n"
|
| 309 |
-
"- Aux balises <p> qui précèdent immédiatement les marqueurs <!--PAGE_X--> : il peut s'agir de footers. De même, il faut les comparer entre eux pour savoir s'ils sont à peu près similaires.\n"
|
| 310 |
-
"Dans tous les cas, il faut supprimer tous les headers et les footers identifiés (c'est-à-dire supprimer la totalité du contenu de la balise <p> concernée). Attention, ces suppressions ne doivent pas affecter les autres éléments : le code html que tu produis doit être aussi propre que possible, comme si on avait un document de traitement de texte.\n"
|
| 311 |
-
"S'il y a des retours à la ligne injustifiés, il faut rétablir l'intégrité des phrases, et constituer de véritables paragraphes complets. L'ensemble du code doit être inclus entre des balises <html></html>\n"
|
| 312 |
-
"Tu donneras la totalité du HTML réécrit, et rien d'autre, ni avant ni après. "
|
| 313 |
-
"Ne résume jamais les informations, ne réorganise pas le contenu et ne supprime aucune section.\n\n"
|
| 314 |
-
"IMPORTANT : Ne jamais inclure de balises <script> dans le HTML réécrit.\n\n" # Nouvelle Ligne Ajoutée
|
| 315 |
-
"Voici tout d'abord les règles à suivre pour avoir un document accessible :\n\n"
|
| 316 |
-
"1. Adopter des pratiques de rédaction sobres\n"
|
| 317 |
-
"• Éviter les polices avec empattement. Utiliser de préférence : Arial, Calibri, Verdana, etc.\n"
|
| 318 |
-
"• Limiter l’utilisation de texte en italique.\n"
|
| 319 |
-
"• Éviter les soulignements.\n"
|
| 320 |
-
"• Aligner le texte à gauche, en drapeau, plutôt que de le justifier.\n"
|
| 321 |
-
"• Respecter les règles typographiques usuelles.\n\n"
|
| 322 |
-
"2. Rédiger des contenus clairs et sans ambiguïté\n"
|
| 323 |
-
"• Conserver les accents sur les lettres majuscules (début de phrase, noms propres, etc.).\n"
|
| 324 |
-
"• Éviter d’écrire des mots entièrement en majuscules.\n"
|
| 325 |
-
"• Expliciter les abréviations à la première occurrence ou associer un glossaire au document.\n"
|
| 326 |
-
"• Spécifier les changements de langue et, si besoin, la langue par défaut du document.\n\n"
|
| 327 |
-
"3. Structurer le contenu du document à l’aide de niveaux de titres et de paragraphes\n"
|
| 328 |
-
"• Définir la hiérarchie du contenu (titre principal, titre secondaire, sous-titre, etc.).\n"
|
| 329 |
-
"• Associer aux différents niveaux de titres des styles prédéfinis (titre 1, titre 2, titre 3, etc.).\n"
|
| 330 |
-
"• Modifier les styles prédéfinis en fonction des besoins et les appliquer à l’ensemble du document.\n"
|
| 331 |
-
"• Utiliser les paramètres d’espacement pour aérer le texte, sans effectuer de multiples retours à la ligne.\n\n"
|
| 332 |
-
"4. Veiller au bon usage des couleurs et des contrastes\n"
|
| 333 |
-
"• Assurer un contraste optimal entre le texte et la couleur de fond.\n"
|
| 334 |
-
"• Expliciter l’information véhiculée par les couleurs.\n\n"
|
| 335 |
-
"5. Insérer des tableaux accessibles\n"
|
| 336 |
-
"• S’assurer que l’usage d’un tableau est pertinent.\n"
|
| 337 |
-
"• Spécifier les lignes et les colonnes d’en-tête.\n"
|
| 338 |
-
"• Empêcher les tableaux et les cellules d’être à cheval sur deux pages.\n"
|
| 339 |
-
"• Insérer un tiret dans les cellules ne contenant pas d’information.\n"
|
| 340 |
-
"• Associer une légende aux tableaux.\n\n"
|
| 341 |
-
"S'il y a des tableaux dans le document, les bordures doivent être noires et apparaître clairement. Une ligne sur deux doit être légèrement grisée, pour qu'on puisse facilement s'y repérer."
|
| 342 |
-
"Les titres doivent être indentés avec une margin-left, qui augmente avec le niveau de titre (le plus haut niveau de titre ayant une margin-left de 0). "
|
| 343 |
-
"Chaque niveau de titre doit avoir sa propre couleur (en s'assurant qu'elle soit suffisamment sombre, pour contraster avec le fond blanc. Par exemple Navy, DarkGreen, DarkRed...). "
|
| 344 |
-
"ATTENTION : pour les titres, utilise absolument les balises h1, h2, h3, h4, h5 et h6. Le titre de niveau supérieur doit avoir la balise h1.\n\n"
|
| 345 |
-
"Le cas échéant, il faut faire attention à bien respecter la logique des titres : a priori, les titres qui sont préfixés par une écriture romaine (I, II, III), "
|
| 346 |
-
"par un nombre (1, 2, 3) ou par une lettre (a, b, c, ou bien A, B, C) doivent être de même niveau. Idem pour les titres rédigés en majuscules. Quand une expression très courte qui ne ressemble pas syntaxiquement à une phrase "
|
| 347 |
-
"est présentée sur une seule ligne, il y a des chances qu'il s'agisse d'un titre : dans ce cas (et si c'est pertinent) traite-la comme telle.\n"
|
| 348 |
-
"On évite les balises <ul> et <li>\n"
|
| 349 |
-
"Encore une fois, fais bien attention à reproduire fidèlement l'ordre des marqueurs <!--PAGE_X-->, dans l'ordre croissant des X : c'est ta tâche principale.\n"
|
| 350 |
"Voici maintenant le fichier HTML d'origine :\n"
|
| 351 |
+ html_content
|
| 352 |
)
|
|
@@ -355,15 +324,15 @@ def rewrite_html_accessible(html_content: str) -> str:
|
|
| 355 |
logging.debug("Contenu avant l'appel à l'API OpenAI :")
|
| 356 |
logging.debug(html_content)
|
| 357 |
|
| 358 |
-
response = client.chat.completions.
|
| 359 |
-
model="gpt-4o-mini", #
|
| 360 |
messages=[
|
| 361 |
{"role": "user", "content": prompt}
|
| 362 |
],
|
| 363 |
)
|
| 364 |
-
#
|
| 365 |
rewritten_html = response.choices[0].message.content.strip()
|
| 366 |
-
#
|
| 367 |
rewritten_html = rewritten_html.replace("<!--", "<!--").replace("-->", "-->")
|
| 368 |
|
| 369 |
logging.debug("Contenu après l'appel à l'API OpenAI :")
|
|
@@ -372,59 +341,63 @@ def rewrite_html_accessible(html_content: str) -> str:
|
|
| 372 |
return rewritten_html
|
| 373 |
except Exception as e:
|
| 374 |
logging.error(f"Erreur lors de la réécriture du HTML : {str(e)}")
|
| 375 |
-
return html_content #
|
| 376 |
-
|
| 377 |
|
| 378 |
-
def clean_html_content(html_content: str, image_counter: List[int], images_data: Dict[str, Dict[str, str]]) -> str:
|
| 379 |
-
"""
|
| 380 |
-
# Implémentation de la fonction comme dans votre code initial
|
| 381 |
soup = BeautifulSoup(html_content, 'html.parser')
|
| 382 |
|
| 383 |
-
#
|
| 384 |
for tag in soup.find_all():
|
| 385 |
if 'style' in tag.attrs:
|
| 386 |
del tag['style']
|
| 387 |
|
| 388 |
-
#
|
| 389 |
for element in soup.find_all(['header', 'footer']):
|
| 390 |
element.decompose()
|
| 391 |
-
#
|
| 392 |
for div in soup.find_all('div'):
|
| 393 |
if div.get_text(strip=True).isdigit():
|
| 394 |
div.decompose()
|
| 395 |
|
| 396 |
-
#
|
| 397 |
total_images = len([img for img in soup.find_all('img') if img.get('src', '').startswith('data:image/')])
|
| 398 |
|
|
|
|
|
|
|
|
|
|
| 399 |
if total_images > 20:
|
| 400 |
logging.warning(f"Nombre d'images ({total_images}) dépasse 20. Les images seront ignorées.")
|
| 401 |
-
#
|
| 402 |
for img in soup.find_all('img'):
|
| 403 |
img.decompose()
|
| 404 |
else:
|
| 405 |
-
#
|
| 406 |
for img in soup.find_all('img'):
|
| 407 |
src = img.get('src', '')
|
| 408 |
X = image_counter[0]
|
| 409 |
if src.startswith('data:image/'):
|
| 410 |
base64_image = encode_image_from_data_uri(src)
|
| 411 |
if base64_image:
|
| 412 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 413 |
else:
|
| 414 |
-
|
| 415 |
-
# Stocker les données de l'image et la description pour réinsertion ultérieure
|
| 416 |
-
images_data[f"IMG_{X}"] = {
|
| 417 |
-
'base64_image': base64_image,
|
| 418 |
-
'description': Y
|
| 419 |
-
}
|
| 420 |
-
# Remplacer l'image par un commentaire HTML
|
| 421 |
-
placeholder = f"<!--IMG_{X}-->"
|
| 422 |
-
img.replace_with(placeholder)
|
| 423 |
-
image_counter[0] += 1
|
| 424 |
else:
|
| 425 |
-
img.decompose() #
|
| 426 |
|
| 427 |
-
#
|
| 428 |
scripts_to_remove = soup.find_all('script', src=True)
|
| 429 |
for script in scripts_to_remove:
|
| 430 |
src = script['src']
|
|
@@ -432,249 +405,40 @@ def clean_html_content(html_content: str, image_counter: List[int], images_data:
|
|
| 432 |
logging.debug(f"Suppression de la balise <script> : {script}")
|
| 433 |
script.decompose()
|
| 434 |
|
| 435 |
-
#
|
| 436 |
for tag in soup.find_all(['p', 'span']):
|
| 437 |
if not tag.get_text(strip=True):
|
| 438 |
tag.decompose()
|
| 439 |
|
| 440 |
-
#
|
| 441 |
style_tag = soup.new_tag('style')
|
| 442 |
style_tag.string = """
|
| 443 |
-
|
| 444 |
-
/*
|
| 445 |
-
:root {
|
| 446 |
-
--font-size-min: 1rem;
|
| 447 |
-
--font-size-base: 1rem; /* 16px par défaut */
|
| 448 |
-
--font-size-large: 2.5rem; /* Ajustable selon les besoins */
|
| 449 |
-
--line-height: 1.5;
|
| 450 |
-
--font-family: Arial, Calibri, Verdana, sans-serif;
|
| 451 |
-
--text-color: #1a1a1a;
|
| 452 |
-
--background-color: #fdfdfd;
|
| 453 |
-
--link-color: #1a1a1a;
|
| 454 |
-
--heading-color-primary: Navy;
|
| 455 |
-
--heading-color-secondary: DarkGreen;
|
| 456 |
-
--heading-color-tertiary: DarkRed;
|
| 457 |
-
--heading-color-quaternary: DarkSlateGray;
|
| 458 |
-
--heading-color-cinq: DarkSlateBlue;
|
| 459 |
-
--heading-color-six: DarkViolet;
|
| 460 |
-
}
|
| 461 |
-
/* Styles de base */
|
| 462 |
-
html {
|
| 463 |
-
font-family: var(--font-family);
|
| 464 |
-
font-size: var(--font-size-base);
|
| 465 |
-
line-height: var(--line-height);
|
| 466 |
-
color: var(--text-color);
|
| 467 |
-
background-color: var(--background-color);
|
| 468 |
-
/* Fluid Typography: Ajuste la taille de la police en fonction de la largeur de la fenêtre */
|
| 469 |
-
font-size: clamp(var(--font-size-min), 2vw, 1.5rem);
|
| 470 |
-
}
|
| 471 |
-
body {
|
| 472 |
-
margin: 20px auto;
|
| 473 |
-
max-width: 36em; /* 36em correspond à environ 576px */
|
| 474 |
-
padding: 2rem;
|
| 475 |
-
hyphens: auto;
|
| 476 |
-
overflow-wrap: break-word;
|
| 477 |
-
text-rendering: optimizeLegibility;
|
| 478 |
-
font-kerning: normal;
|
| 479 |
-
text-align: left;
|
| 480 |
-
}
|
| 481 |
-
/* Titres réactifs */
|
| 482 |
-
h1 {
|
| 483 |
-
margin-left: 0;
|
| 484 |
-
color: var(--heading-color-primary);
|
| 485 |
-
font-size: clamp(1.5rem, 5vw, 2rem); /* Entre 24px et 48px */
|
| 486 |
-
}
|
| 487 |
-
h2 {
|
| 488 |
-
margin-left: 1rem;
|
| 489 |
-
color: var(--heading-color-secondary);
|
| 490 |
-
font-size: clamp(1.25rem, 4vw, 1.75rem); /* Entre 20px et 40px */
|
| 491 |
-
}
|
| 492 |
-
h3 {
|
| 493 |
-
margin-left: 2rem;
|
| 494 |
-
color: var(--heading-color-tertiary);
|
| 495 |
-
font-size: clamp(1.125rem, 4vw, 1.5rem); /* Entre 18px et 36px */
|
| 496 |
-
}
|
| 497 |
-
h4 {
|
| 498 |
-
margin-left: 3rem;
|
| 499 |
-
color: var(--heading-color-quaternary);
|
| 500 |
-
font-size: clamp(1rem, 4vw, 1.5rem); /* Entre 16px et 32px */
|
| 501 |
-
}
|
| 502 |
-
h5 {
|
| 503 |
-
margin-left: 4rem;
|
| 504 |
-
color: var(--heading-color-cinq);
|
| 505 |
-
font-size: clamp(1rem, 4vw, 1.5rem); /* Entre 16px et 32px */
|
| 506 |
-
}
|
| 507 |
-
h6 {
|
| 508 |
-
margin-left: 5rem;
|
| 509 |
-
color: var(--heading-color-six);
|
| 510 |
-
font-size: clamp(1rem, 4vw, 1.5rem); /* Entre 16px et 32px */
|
| 511 |
-
}
|
| 512 |
-
/* Médias responsive */
|
| 513 |
-
@media (max-width: 600px) {
|
| 514 |
-
html {
|
| 515 |
-
font-size: clamp(var(--font-size-min), 4vw, 1.5rem); /* Ajuste légèrement pour petits écrans */
|
| 516 |
-
}
|
| 517 |
-
body {
|
| 518 |
-
padding: 1rem;
|
| 519 |
-
}
|
| 520 |
-
h1 {
|
| 521 |
-
font-size: clamp(1.5rem, 6vw, 2.5rem);
|
| 522 |
-
}
|
| 523 |
-
h2 {
|
| 524 |
-
font-size: clamp(1.25rem, 5vw, 2rem);
|
| 525 |
-
}
|
| 526 |
-
h3 {
|
| 527 |
-
font-size: clamp(1.125rem, 4.5vw, 1.75rem);
|
| 528 |
-
}
|
| 529 |
-
h4, h5, h6 {
|
| 530 |
-
font-size: clamp(1rem, 4vw, 1.5rem);
|
| 531 |
-
}
|
| 532 |
-
}
|
| 533 |
-
/* Impression */
|
| 534 |
-
@media print {
|
| 535 |
-
body {
|
| 536 |
-
background-color: transparent;
|
| 537 |
-
color: black;
|
| 538 |
-
font-size: 12pt;
|
| 539 |
-
}
|
| 540 |
-
p, h2, h3 {
|
| 541 |
-
orphans: 3;
|
| 542 |
-
widows: 3;
|
| 543 |
-
}
|
| 544 |
-
h2, h3, h4 {
|
| 545 |
-
page-break-after: avoid;
|
| 546 |
-
}
|
| 547 |
-
}
|
| 548 |
-
/* Paragraphes et liens */
|
| 549 |
-
p {
|
| 550 |
-
margin: 1em 0;
|
| 551 |
-
font-size: 1rem; /* 16px */
|
| 552 |
-
}
|
| 553 |
-
a {
|
| 554 |
-
color: var(--link-color);
|
| 555 |
-
text-decoration: none;
|
| 556 |
-
}
|
| 557 |
-
a:visited {
|
| 558 |
-
color: var(--link-color);
|
| 559 |
-
}
|
| 560 |
-
a:hover, a:focus {
|
| 561 |
-
text-decoration: underline; /* Améliore la visibilité au survol */
|
| 562 |
-
}
|
| 563 |
-
/* Images réactives */
|
| 564 |
-
img {
|
| 565 |
-
max-width: 100%;
|
| 566 |
-
height: auto;
|
| 567 |
-
}
|
| 568 |
-
/* Tables réactives */
|
| 569 |
-
table {
|
| 570 |
-
margin: 1em 0;
|
| 571 |
-
border-collapse: collapse;
|
| 572 |
-
width: 100%;
|
| 573 |
-
overflow-x: auto;
|
| 574 |
-
display: block;
|
| 575 |
-
font-variant-numeric: lining-nums tabular-nums;
|
| 576 |
-
}
|
| 577 |
-
table caption {
|
| 578 |
-
margin-bottom: 0.75em;
|
| 579 |
-
}
|
| 580 |
-
th, td {
|
| 581 |
-
border: 1px solid #000;
|
| 582 |
-
padding: 0.5em;
|
| 583 |
-
text-align: left;
|
| 584 |
-
}
|
| 585 |
-
tbody tr:nth-child(odd) {
|
| 586 |
-
background-color: #f2f2f2;
|
| 587 |
-
}
|
| 588 |
-
tbody tr:nth-child(even) {
|
| 589 |
-
background-color: #ffffff;
|
| 590 |
-
}
|
| 591 |
-
/* Citations */
|
| 592 |
-
blockquote {
|
| 593 |
-
margin: 1em 0 1em 1.7em;
|
| 594 |
-
padding-left: 1em;
|
| 595 |
-
border-left: 2px solid #e6e6e6;
|
| 596 |
-
color: #606060;
|
| 597 |
-
}
|
| 598 |
-
/* Code */
|
| 599 |
-
code {
|
| 600 |
-
font-family: Menlo, Monaco, 'Lucida Console', Consolas, monospace;
|
| 601 |
-
font-size: 0.85rem;
|
| 602 |
-
margin: 0;
|
| 603 |
-
white-space: pre-wrap;
|
| 604 |
-
}
|
| 605 |
-
pre {
|
| 606 |
-
margin: 1em 0;
|
| 607 |
-
overflow: auto;
|
| 608 |
-
}
|
| 609 |
-
pre code {
|
| 610 |
-
padding: 0;
|
| 611 |
-
overflow: visible;
|
| 612 |
-
overflow-wrap: normal;
|
| 613 |
-
}
|
| 614 |
-
.sourceCode {
|
| 615 |
-
background-color: transparent;
|
| 616 |
-
overflow: visible;
|
| 617 |
-
}
|
| 618 |
-
/* Lignes horizontales */
|
| 619 |
-
hr {
|
| 620 |
-
background-color: #1a1a1a;
|
| 621 |
-
border: none;
|
| 622 |
-
height: 1px;
|
| 623 |
-
margin: 1em 0;
|
| 624 |
-
}
|
| 625 |
-
/* Styles supplémentaires */
|
| 626 |
-
span.smallcaps {
|
| 627 |
-
font-variant: small-caps;
|
| 628 |
-
}
|
| 629 |
-
span.underline {
|
| 630 |
-
text-decoration: underline;
|
| 631 |
-
}
|
| 632 |
-
div.column {
|
| 633 |
-
display: inline-block;
|
| 634 |
-
vertical-align: top;
|
| 635 |
-
width: 50%;
|
| 636 |
-
}
|
| 637 |
-
.description {
|
| 638 |
-
background-color: #f0f3ff;
|
| 639 |
-
padding: 1em;
|
| 640 |
-
border: 1px solid black;
|
| 641 |
-
}
|
| 642 |
-
div.hanging-indent {
|
| 643 |
-
margin-left: 1.5em;
|
| 644 |
-
text-indent: -1.5em;
|
| 645 |
-
}
|
| 646 |
-
ul.task-list {
|
| 647 |
-
list-style: none;
|
| 648 |
-
}
|
| 649 |
-
.display.math {
|
| 650 |
-
display: block;
|
| 651 |
-
text-align: center;
|
| 652 |
-
margin: 0.5rem auto;
|
| 653 |
-
}
|
| 654 |
"""
|
| 655 |
if soup.head:
|
| 656 |
soup.head.append(style_tag)
|
| 657 |
else:
|
| 658 |
-
#
|
| 659 |
head_tag = soup.new_tag('head')
|
| 660 |
head_tag.append(style_tag)
|
| 661 |
soup.insert(0, head_tag)
|
| 662 |
|
| 663 |
-
return str(soup)
|
| 664 |
|
| 665 |
def reinsert_images(html_content: str, images_data: Dict[str, Dict[str, str]]) -> str:
|
| 666 |
-
"""
|
| 667 |
-
#
|
| 668 |
soup = BeautifulSoup(html_content, 'html.parser')
|
| 669 |
|
| 670 |
-
#
|
| 671 |
for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
|
| 672 |
match = re.match(r'IMG_(\d+)', comment)
|
| 673 |
if match:
|
| 674 |
image_number = match.group(1)
|
| 675 |
image_key = f"IMG_{image_number}"
|
| 676 |
if image_key in images_data:
|
| 677 |
-
#
|
| 678 |
img_tag = soup.new_tag('img')
|
| 679 |
img_tag['src'] = f"data:image/jpeg;base64,{images_data[image_key]['base64_image']}"
|
| 680 |
img_tag['alt'] = images_data[image_key]['description']
|
|
@@ -682,30 +446,30 @@ def reinsert_images(html_content: str, images_data: Dict[str, Dict[str, str]]) -
|
|
| 682 |
new_content = soup.new_tag('div')
|
| 683 |
new_content.append(img_tag)
|
| 684 |
|
| 685 |
-
#
|
| 686 |
p_tag = soup.new_tag('p', attrs={'class': 'description'})
|
| 687 |
|
| 688 |
-
#
|
| 689 |
strong_tag = soup.new_tag('strong')
|
| 690 |
strong_tag.string = f"Image {image_number}"
|
| 691 |
p_tag.append(strong_tag)
|
| 692 |
|
| 693 |
-
#
|
| 694 |
p_tag.append(" : ")
|
| 695 |
|
| 696 |
-
#
|
| 697 |
y_markdown = images_data[image_key]['description']
|
| 698 |
|
| 699 |
-
#
|
| 700 |
y_html = markdown_to_html(y_markdown)
|
| 701 |
|
| 702 |
-
#
|
| 703 |
y_soup = BeautifulSoup(y_html, 'html.parser')
|
| 704 |
p_tag.append(y_soup)
|
| 705 |
|
| 706 |
new_content.append(p_tag)
|
| 707 |
|
| 708 |
-
#
|
| 709 |
comment.replace_with(new_content)
|
| 710 |
else:
|
| 711 |
logging.error(f"Données pour {image_key} non trouvées.")
|
|
@@ -718,12 +482,12 @@ async def convert_file_to_html(
|
|
| 718 |
background_tasks: BackgroundTasks = BackgroundTasks()
|
| 719 |
):
|
| 720 |
try:
|
| 721 |
-
#
|
| 722 |
job_id = str(uuid.uuid4())
|
| 723 |
job_dir = os.path.join(JOBS_DIR, job_id)
|
| 724 |
os.makedirs(job_dir)
|
| 725 |
|
| 726 |
-
#
|
| 727 |
ext = os.path.splitext(file.filename)[1].lower()
|
| 728 |
if ext not in FORMAT_MAP:
|
| 729 |
raise HTTPException(status_code=400, detail=f"Extension de fichier non supportée : {ext}")
|
|
@@ -732,7 +496,7 @@ async def convert_file_to_html(
|
|
| 732 |
with open(input_file_path, "wb") as f:
|
| 733 |
shutil.copyfileobj(file.file, f)
|
| 734 |
|
| 735 |
-
#
|
| 736 |
status = {
|
| 737 |
'status': 'pending',
|
| 738 |
'message': 'Traitement démarré',
|
|
@@ -742,10 +506,10 @@ async def convert_file_to_html(
|
|
| 742 |
with open(status_file, 'w') as f:
|
| 743 |
json.dump(status, f)
|
| 744 |
|
| 745 |
-
#
|
| 746 |
background_tasks.add_task(process_file, job_id, input_file_path, ext, file.filename)
|
| 747 |
|
| 748 |
-
#
|
| 749 |
return JSONResponse(content={'job_id': job_id})
|
| 750 |
|
| 751 |
except Exception as e:
|
|
@@ -785,7 +549,7 @@ def delete_temp_files(file_paths: list):
|
|
| 785 |
|
| 786 |
@app.post("/convert_to_txt/")
|
| 787 |
async def convert_file_to_txt(
|
| 788 |
-
file: UploadFile = File(...),
|
| 789 |
background_tasks: BackgroundTasks = BackgroundTasks()
|
| 790 |
):
|
| 791 |
try:
|
|
@@ -799,7 +563,7 @@ async def convert_file_to_txt(
|
|
| 799 |
'.odt', '.pdf', '.docx', '.html', '.htm', '.md', '.txt', '.rtf', '.epub',
|
| 800 |
'.tex', '.xml', '.org', '.commonmark', '.cm', '.wiki', '.opml'
|
| 801 |
]
|
| 802 |
-
|
| 803 |
if ext not in allowed_extensions:
|
| 804 |
raise HTTPException(status_code=400, detail=f"Unsupported file extension: {ext}")
|
| 805 |
|
|
@@ -823,7 +587,7 @@ async def convert_file_to_txt(
|
|
| 823 |
with open(output_filename, "w", encoding="utf-8") as f:
|
| 824 |
f.write(text)
|
| 825 |
logging.debug(f"PDF conversion successful with PyMuPDF: {output_filename}")
|
| 826 |
-
|
| 827 |
# Other file formats to text conversion using Pandoc
|
| 828 |
else:
|
| 829 |
output = pypandoc.convert_file(input_filename, 'plain', outputfile=output_filename)
|
|
@@ -845,4 +609,4 @@ async def convert_file_to_txt(
|
|
| 845 |
return JSONResponse(status_code=http_exc.status_code, content={"message": http_exc.detail})
|
| 846 |
except Exception as e:
|
| 847 |
logging.error(f"Error during conversion: {str(e)}")
|
| 848 |
-
return JSONResponse(status_code=500, content={"message": f"Internal error: {str(e)}"})
|
|
|
|
| 6 |
import re
|
| 7 |
import threading
|
| 8 |
import time
|
| 9 |
+
from typing import List, Dict, Tuple
|
| 10 |
import logging
|
| 11 |
import tempfile
|
| 12 |
import shutil
|
| 13 |
import json
|
| 14 |
+
import asyncio
|
| 15 |
|
| 16 |
from openai import OpenAI
|
| 17 |
|
|
|
|
| 21 |
import fitz # PyMuPDF
|
| 22 |
from bs4 import BeautifulSoup, Comment
|
| 23 |
|
| 24 |
+
# Initialize the logger
|
| 25 |
logging.basicConfig(level=logging.DEBUG)
|
| 26 |
|
| 27 |
+
# Initialize the FastAPI application
|
| 28 |
app = FastAPI()
|
| 29 |
|
| 30 |
client = OpenAI()
|
| 31 |
|
| 32 |
+
# Base directory for jobs
|
|
|
|
| 33 |
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
|
| 34 |
JOBS_DIR = os.path.join(tempfile.gettempdir(), 'jobs')
|
| 35 |
|
| 36 |
if not os.path.exists(JOBS_DIR):
|
| 37 |
os.makedirs(JOBS_DIR)
|
| 38 |
|
| 39 |
+
# Map of extensions to Pandoc formats
|
| 40 |
FORMAT_MAP = {
|
| 41 |
'.odt': 'odt',
|
| 42 |
'.pdf': 'pdf',
|
|
|
|
| 56 |
}
|
| 57 |
|
| 58 |
def get_pandoc_format(extension: str) -> str:
|
| 59 |
+
"""Get the Pandoc format based on the file extension."""
|
| 60 |
+
return FORMAT_MAP.get(extension, 'auto') # 'auto' lets Pandoc determine the format
|
| 61 |
|
| 62 |
def update_job_status(job_id: str, status: str, message: str = '', result_file: str = None):
|
| 63 |
job_dir = os.path.join(JOBS_DIR, job_id)
|
|
|
|
| 84 |
def process_file(job_id: str, input_file_path: str, ext: str, original_filename: str):
|
| 85 |
job_dir = os.path.join(JOBS_DIR, job_id)
|
| 86 |
try:
|
| 87 |
+
# Update status to 'processing'
|
| 88 |
update_job_status(job_id, 'processing', 'Le fichier est en cours de traitement')
|
| 89 |
|
| 90 |
+
# Initialize the image counter and image data dictionary
|
| 91 |
image_counter = [1]
|
| 92 |
images_data = {}
|
| 93 |
|
| 94 |
+
# Base filename
|
| 95 |
base_filename = os.path.splitext(original_filename)[0]
|
| 96 |
output_filename = os.path.join(job_dir, f"{base_filename}.html")
|
| 97 |
|
| 98 |
+
# Run the asynchronous function
|
| 99 |
+
loop = asyncio.new_event_loop()
|
| 100 |
+
asyncio.set_event_loop(loop)
|
| 101 |
+
try:
|
| 102 |
+
final_html = loop.run_until_complete(convert_to_accessible_html(input_file_path, ext, base_filename, image_counter, images_data))
|
| 103 |
+
finally:
|
| 104 |
+
loop.close()
|
| 105 |
|
| 106 |
+
# Write the final HTML to the output file
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
with open(output_filename, 'w', encoding='utf-8') as f:
|
| 108 |
f.write(final_html)
|
| 109 |
|
| 110 |
+
# When processing is complete
|
| 111 |
update_job_status(job_id, 'completed', 'Traitement terminé', result_file=f"{base_filename}.html")
|
| 112 |
|
| 113 |
+
# Delete temporary files after a delay
|
| 114 |
+
delete_files_after_delay([input_file_path], delay=300) # 300 seconds = 5 minutes
|
| 115 |
|
| 116 |
except Exception as e:
|
| 117 |
logging.error(f"Erreur lors du traitement du job {job_id}: {str(e)}")
|
| 118 |
update_job_status(job_id, 'error', f"Erreur: {str(e)}")
|
| 119 |
|
| 120 |
def delete_files_after_delay(file_paths: List[str], delay: int = 1200):
|
| 121 |
+
"""Function to delete temporary files after a delay in seconds."""
|
| 122 |
def delayed_delete():
|
| 123 |
time.sleep(delay)
|
| 124 |
for file_path in file_paths:
|
|
|
|
| 129 |
except Exception as e:
|
| 130 |
logging.error(f"Erreur lors de la suppression du fichier {file_path} : {str(e)}")
|
| 131 |
|
| 132 |
+
# Create and start a separate thread
|
| 133 |
thread = threading.Thread(target=delayed_delete)
|
| 134 |
thread.start()
|
| 135 |
|
| 136 |
+
async def convert_to_accessible_html(input_filename, ext, base_filename, image_counter, images_data):
|
| 137 |
try:
|
| 138 |
+
# Conversion from PDF to HTML with PyMuPDF
|
| 139 |
if ext == '.pdf':
|
| 140 |
# Initialize BeautifulSoup with basic HTML structure
|
| 141 |
soup = BeautifulSoup("<html><head></head><body></body></html>", 'html.parser')
|
| 142 |
body = soup.body
|
| 143 |
+
page_number = 1 # Initialize the page counter
|
| 144 |
with fitz.open(input_filename) as doc:
|
| 145 |
for page in doc:
|
| 146 |
+
# Add page comment before the page's HTML content
|
| 147 |
page_comment = f"<!--PAGE_{page_number}-->"
|
| 148 |
body.append(BeautifulSoup(page_comment, 'html.parser'))
|
| 149 |
|
|
|
|
| 155 |
html_content = str(soup)
|
| 156 |
logging.debug(f"Voici le contenu du PDF brut avec commentaires de page : {html_content}")
|
| 157 |
|
| 158 |
+
# Clean the HTML content
|
| 159 |
+
cleaned_html, image_tasks = await clean_html_content(html_content, image_counter, images_data)
|
| 160 |
+
# Rewrite the HTML to make it more accessible
|
| 161 |
+
html_rewrite_task = asyncio.create_task(rewrite_html_accessible(cleaned_html))
|
| 162 |
+
# Wait for all tasks to complete
|
| 163 |
+
await asyncio.gather(*image_tasks, html_rewrite_task)
|
| 164 |
+
# Retrieve the image descriptions
|
| 165 |
+
for image_key in images_data:
|
| 166 |
+
task = images_data[image_key]['description_task']
|
| 167 |
+
description = task.result()
|
| 168 |
+
images_data[image_key]['description'] = description
|
| 169 |
+
# Get the rewritten HTML
|
| 170 |
+
rewritten_html = html_rewrite_task.result()
|
| 171 |
+
# Reinsert images and their descriptions into the rewritten HTML
|
| 172 |
final_html = reinsert_images(rewritten_html, images_data)
|
| 173 |
|
| 174 |
+
# Final removal of specific <script> tags
|
| 175 |
final_soup = BeautifulSoup(final_html, 'html.parser')
|
| 176 |
scripts_to_remove = final_soup.find_all('script', src=True)
|
| 177 |
for script in scripts_to_remove:
|
|
|
|
| 181 |
script.decompose()
|
| 182 |
final_html = str(final_soup)
|
| 183 |
|
| 184 |
+
# Remove lines containing only "```html" or "```"
|
| 185 |
final_html = re.sub(r'^\s*```(?:html)?\s*$', '', final_html, flags=re.MULTILINE)
|
| 186 |
|
| 187 |
logging.debug(f"Conversion PDF en HTML accessible réussie.")
|
| 188 |
return final_html
|
| 189 |
|
| 190 |
+
# Conversion of other formats to HTML with Pandoc
|
| 191 |
else:
|
| 192 |
input_format = get_pandoc_format(ext)
|
| 193 |
|
| 194 |
try:
|
| 195 |
+
# Convert the file with Pandoc using the --self-contained option
|
| 196 |
output = pypandoc.convert_file(
|
| 197 |
input_filename,
|
| 198 |
'html',
|
| 199 |
+
format=input_format, # Specify the format here
|
| 200 |
outputfile=None,
|
| 201 |
extra_args=[
|
| 202 |
+
'--self-contained', # Embed external resources like images
|
| 203 |
+
'--strip-comments', # Remove comments
|
| 204 |
+
'--quiet' # Suppress verbose outputs
|
| 205 |
]
|
| 206 |
)
|
| 207 |
html_content = output
|
|
|
|
| 210 |
logging.error(f"Pandoc a rencontré une erreur : {str(e)}")
|
| 211 |
logging.info("Tentative de conversion sans l'option --self-contained.")
|
| 212 |
|
| 213 |
+
# Retry without the --self-contained option
|
| 214 |
output = pypandoc.convert_file(
|
| 215 |
input_filename,
|
| 216 |
'html',
|
|
|
|
| 224 |
html_content = output
|
| 225 |
logging.debug(f"Conversion en HTML réussie avec Pandoc sans --self-contained.")
|
| 226 |
|
| 227 |
+
# Clean the HTML content
|
| 228 |
+
cleaned_html, image_tasks = await clean_html_content(html_content, image_counter, images_data)
|
| 229 |
+
# Rewrite the HTML to make it more accessible
|
| 230 |
+
html_rewrite_task = asyncio.create_task(rewrite_html_accessible(cleaned_html))
|
| 231 |
+
# Wait for all tasks to complete
|
| 232 |
+
await asyncio.gather(*image_tasks, html_rewrite_task)
|
| 233 |
+
# Retrieve the image descriptions
|
| 234 |
+
for image_key in images_data:
|
| 235 |
+
task = images_data[image_key]['description_task']
|
| 236 |
+
description = task.result()
|
| 237 |
+
images_data[image_key]['description'] = description
|
| 238 |
+
# Get the rewritten HTML
|
| 239 |
+
rewritten_html = html_rewrite_task.result()
|
| 240 |
+
# Reinsert images and their descriptions into the rewritten HTML
|
| 241 |
final_html = reinsert_images(rewritten_html, images_data)
|
| 242 |
|
| 243 |
+
# Final removal of specific <script> tags
|
| 244 |
final_soup = BeautifulSoup(final_html, 'html.parser')
|
| 245 |
scripts_to_remove = final_soup.find_all('script', src=True)
|
| 246 |
for script in scripts_to_remove:
|
|
|
|
| 250 |
script.decompose()
|
| 251 |
final_html = str(final_soup)
|
| 252 |
|
| 253 |
+
# Remove lines containing only "```html" or "```"
|
| 254 |
final_html = re.sub(r'^\s*```(?:html)?\s*$', '', final_html, flags=re.MULTILINE)
|
| 255 |
|
| 256 |
logging.debug(f"Conversion en HTML accessible réussie avec Pandoc.")
|
|
|
|
| 261 |
return None
|
| 262 |
|
| 263 |
def encode_image_from_data_uri(data_uri: str) -> str:
|
| 264 |
+
"""Function to encode an image from a Data URI."""
|
| 265 |
try:
|
| 266 |
header, encoded = data_uri.split(',', 1)
|
| 267 |
+
# Remove newlines and spaces
|
| 268 |
encoded = ''.join(encoded.split())
|
| 269 |
return encoded
|
| 270 |
except Exception as e:
|
|
|
|
| 272 |
return ""
|
| 273 |
|
| 274 |
def markdown_to_html(markdown_text: str) -> str:
|
| 275 |
+
"""Convert Markdown syntax to HTML."""
|
| 276 |
+
# Replace Markdown tags with HTML tags
|
| 277 |
html = markdown_text
|
| 278 |
+
html = re.sub(r'\*\*(.*?)\*\*', r'<strong>\1</strong>', html) # Bold
|
| 279 |
+
html = re.sub(r'\*(.*?)\*', r'<i>\1</i>', html) # Italic
|
| 280 |
+
html = re.sub(r'__(.*?)__', r'<strong>\1</strong>', html) # Alternative bold
|
| 281 |
+
html = re.sub(r'_(.*?)_', r'<i>\1</i>', html) # Alternative italic
|
| 282 |
return html
|
| 283 |
|
| 284 |
+
async def get_image_description(base64_image: str) -> str:
|
| 285 |
+
"""Function to get the description of an image via the OpenAI API."""
|
| 286 |
try:
|
| 287 |
+
response = await client.chat.completions.acreate(
|
| 288 |
+
model="gpt-4o-mini",
|
| 289 |
messages=[
|
| 290 |
{
|
| 291 |
"role": "user",
|
| 292 |
"content": [
|
| 293 |
{
|
| 294 |
"type": "text",
|
| 295 |
+
"text": "Décris ce que l'on peut voir sur cette image, pour qu'un lecteur malvoyant puisse comprendre ce qu'elle représente.",
|
| 296 |
},
|
| 297 |
{
|
| 298 |
"type": "image_url",
|
|
|
|
| 304 |
}
|
| 305 |
],
|
| 306 |
)
|
| 307 |
+
# Access the content of the response
|
| 308 |
description = response.choices[0].message.content.strip()
|
| 309 |
return description
|
| 310 |
except Exception as e:
|
| 311 |
logging.error(f"Erreur lors de l'appel à l'API OpenAI : {str(e)}")
|
| 312 |
return "Description indisponible."
|
| 313 |
|
| 314 |
+
async def rewrite_html_accessible(html_content: str) -> str:
|
| 315 |
+
"""Function to rewrite the HTML in a more accessible way via the OpenAI API."""
|
| 316 |
prompt = (
|
| 317 |
+
# (The prompt content remains the same)
|
| 318 |
+
# ...
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 319 |
"Voici maintenant le fichier HTML d'origine :\n"
|
| 320 |
+ html_content
|
| 321 |
)
|
|
|
|
| 324 |
logging.debug("Contenu avant l'appel à l'API OpenAI :")
|
| 325 |
logging.debug(html_content)
|
| 326 |
|
| 327 |
+
response = await client.chat.completions.acreate(
|
| 328 |
+
model="gpt-4o-mini", # Replace with the appropriate model
|
| 329 |
messages=[
|
| 330 |
{"role": "user", "content": prompt}
|
| 331 |
],
|
| 332 |
)
|
| 333 |
+
# Extract the textual content of the response
|
| 334 |
rewritten_html = response.choices[0].message.content.strip()
|
| 335 |
+
# Replace escaped HTML comments if necessary
|
| 336 |
rewritten_html = rewritten_html.replace("<!--", "<!--").replace("-->", "-->")
|
| 337 |
|
| 338 |
logging.debug("Contenu après l'appel à l'API OpenAI :")
|
|
|
|
| 341 |
return rewritten_html
|
| 342 |
except Exception as e:
|
| 343 |
logging.error(f"Erreur lors de la réécriture du HTML : {str(e)}")
|
| 344 |
+
return html_content # Return the non-rewritten HTML in case of error
|
|
|
|
| 345 |
|
| 346 |
+
async def clean_html_content(html_content: str, image_counter: List[int], images_data: Dict[str, Dict[str, str]]) -> Tuple[str, List[asyncio.Task]]:
|
| 347 |
+
"""Function to clean the HTML content according to requirements"""
|
|
|
|
| 348 |
soup = BeautifulSoup(html_content, 'html.parser')
|
| 349 |
|
| 350 |
+
# Remove inline CSS
|
| 351 |
for tag in soup.find_all():
|
| 352 |
if 'style' in tag.attrs:
|
| 353 |
del tag['style']
|
| 354 |
|
| 355 |
+
# Remove headers, footers, and page numbers
|
| 356 |
for element in soup.find_all(['header', 'footer']):
|
| 357 |
element.decompose()
|
| 358 |
+
# Remove elements likely to be page numbers
|
| 359 |
for div in soup.find_all('div'):
|
| 360 |
if div.get_text(strip=True).isdigit():
|
| 361 |
div.decompose()
|
| 362 |
|
| 363 |
+
# Count the total number of images with data URI
|
| 364 |
total_images = len([img for img in soup.find_all('img') if img.get('src', '').startswith('data:image/')])
|
| 365 |
|
| 366 |
+
# Initialize tasks list
|
| 367 |
+
tasks = []
|
| 368 |
+
|
| 369 |
if total_images > 20:
|
| 370 |
logging.warning(f"Nombre d'images ({total_images}) dépasse 20. Les images seront ignorées.")
|
| 371 |
+
# Remove all images without processing them
|
| 372 |
for img in soup.find_all('img'):
|
| 373 |
img.decompose()
|
| 374 |
else:
|
| 375 |
+
# Process images
|
| 376 |
for img in soup.find_all('img'):
|
| 377 |
src = img.get('src', '')
|
| 378 |
X = image_counter[0]
|
| 379 |
if src.startswith('data:image/'):
|
| 380 |
base64_image = encode_image_from_data_uri(src)
|
| 381 |
if base64_image:
|
| 382 |
+
# Create a task for get_image_description()
|
| 383 |
+
task = asyncio.create_task(get_image_description(base64_image))
|
| 384 |
+
# Store the task in images_data
|
| 385 |
+
images_data[f"IMG_{X}"] = {
|
| 386 |
+
'base64_image': base64_image,
|
| 387 |
+
'description_task': task
|
| 388 |
+
}
|
| 389 |
+
# Replace the image with a placeholder
|
| 390 |
+
placeholder = f"<!--IMG_{X}-->"
|
| 391 |
+
img.replace_with(placeholder)
|
| 392 |
+
image_counter[0] += 1
|
| 393 |
+
# Add the task to the list
|
| 394 |
+
tasks.append(task)
|
| 395 |
else:
|
| 396 |
+
img.decompose() # Remove images without data URI
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 397 |
else:
|
| 398 |
+
img.decompose() # Remove images without data URI
|
| 399 |
|
| 400 |
+
# Remove specific <script> tags
|
| 401 |
scripts_to_remove = soup.find_all('script', src=True)
|
| 402 |
for script in scripts_to_remove:
|
| 403 |
src = script['src']
|
|
|
|
| 405 |
logging.debug(f"Suppression de la balise <script> : {script}")
|
| 406 |
script.decompose()
|
| 407 |
|
| 408 |
+
# Remove empty paragraphs or spans
|
| 409 |
for tag in soup.find_all(['p', 'span']):
|
| 410 |
if not tag.get_text(strip=True):
|
| 411 |
tag.decompose()
|
| 412 |
|
| 413 |
+
# Insert styles into a <style> tag in the head
|
| 414 |
style_tag = soup.new_tag('style')
|
| 415 |
style_tag.string = """
|
| 416 |
+
/* CSS styles */
|
| 417 |
+
/* (Styles remain the same) */
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 418 |
"""
|
| 419 |
if soup.head:
|
| 420 |
soup.head.append(style_tag)
|
| 421 |
else:
|
| 422 |
+
# If <head> doesn't exist, create one
|
| 423 |
head_tag = soup.new_tag('head')
|
| 424 |
head_tag.append(style_tag)
|
| 425 |
soup.insert(0, head_tag)
|
| 426 |
|
| 427 |
+
return str(soup), tasks
|
| 428 |
|
| 429 |
def reinsert_images(html_content: str, images_data: Dict[str, Dict[str, str]]) -> str:
|
| 430 |
+
"""Function to re-integrate images and their descriptions into the final HTML code."""
|
| 431 |
+
# Use BeautifulSoup to parse the HTML
|
| 432 |
soup = BeautifulSoup(html_content, 'html.parser')
|
| 433 |
|
| 434 |
+
# Find HTML comments like <!--IMG_X-->
|
| 435 |
for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
|
| 436 |
match = re.match(r'IMG_(\d+)', comment)
|
| 437 |
if match:
|
| 438 |
image_number = match.group(1)
|
| 439 |
image_key = f"IMG_{image_number}"
|
| 440 |
if image_key in images_data:
|
| 441 |
+
# Create an <img> tag with base64 data
|
| 442 |
img_tag = soup.new_tag('img')
|
| 443 |
img_tag['src'] = f"data:image/jpeg;base64,{images_data[image_key]['base64_image']}"
|
| 444 |
img_tag['alt'] = images_data[image_key]['description']
|
|
|
|
| 446 |
new_content = soup.new_tag('div')
|
| 447 |
new_content.append(img_tag)
|
| 448 |
|
| 449 |
+
# Create a <p> tag with the class 'description'
|
| 450 |
p_tag = soup.new_tag('p', attrs={'class': 'description'})
|
| 451 |
|
| 452 |
+
# Create a <strong>Image X</strong> tag
|
| 453 |
strong_tag = soup.new_tag('strong')
|
| 454 |
strong_tag.string = f"Image {image_number}"
|
| 455 |
p_tag.append(strong_tag)
|
| 456 |
|
| 457 |
+
# Add " : " after <strong>
|
| 458 |
p_tag.append(" : ")
|
| 459 |
|
| 460 |
+
# Get the description in Markdown
|
| 461 |
y_markdown = images_data[image_key]['description']
|
| 462 |
|
| 463 |
+
# Convert Markdown to HTML
|
| 464 |
y_html = markdown_to_html(y_markdown)
|
| 465 |
|
| 466 |
+
# Parse the generated HTML and add it to the <p> tag
|
| 467 |
y_soup = BeautifulSoup(y_html, 'html.parser')
|
| 468 |
p_tag.append(y_soup)
|
| 469 |
|
| 470 |
new_content.append(p_tag)
|
| 471 |
|
| 472 |
+
# Replace the comment with the new content
|
| 473 |
comment.replace_with(new_content)
|
| 474 |
else:
|
| 475 |
logging.error(f"Données pour {image_key} non trouvées.")
|
|
|
|
| 482 |
background_tasks: BackgroundTasks = BackgroundTasks()
|
| 483 |
):
|
| 484 |
try:
|
| 485 |
+
# Generate a job ID
|
| 486 |
job_id = str(uuid.uuid4())
|
| 487 |
job_dir = os.path.join(JOBS_DIR, job_id)
|
| 488 |
os.makedirs(job_dir)
|
| 489 |
|
| 490 |
+
# Save the input file
|
| 491 |
ext = os.path.splitext(file.filename)[1].lower()
|
| 492 |
if ext not in FORMAT_MAP:
|
| 493 |
raise HTTPException(status_code=400, detail=f"Extension de fichier non supportée : {ext}")
|
|
|
|
| 496 |
with open(input_file_path, "wb") as f:
|
| 497 |
shutil.copyfileobj(file.file, f)
|
| 498 |
|
| 499 |
+
# Initialize the status
|
| 500 |
status = {
|
| 501 |
'status': 'pending',
|
| 502 |
'message': 'Traitement démarré',
|
|
|
|
| 506 |
with open(status_file, 'w') as f:
|
| 507 |
json.dump(status, f)
|
| 508 |
|
| 509 |
+
# Start the background task
|
| 510 |
background_tasks.add_task(process_file, job_id, input_file_path, ext, file.filename)
|
| 511 |
|
| 512 |
+
# Return the job ID
|
| 513 |
return JSONResponse(content={'job_id': job_id})
|
| 514 |
|
| 515 |
except Exception as e:
|
|
|
|
| 549 |
|
| 550 |
@app.post("/convert_to_txt/")
|
| 551 |
async def convert_file_to_txt(
|
| 552 |
+
file: UploadFile = File(...),
|
| 553 |
background_tasks: BackgroundTasks = BackgroundTasks()
|
| 554 |
):
|
| 555 |
try:
|
|
|
|
| 563 |
'.odt', '.pdf', '.docx', '.html', '.htm', '.md', '.txt', '.rtf', '.epub',
|
| 564 |
'.tex', '.xml', '.org', '.commonmark', '.cm', '.wiki', '.opml'
|
| 565 |
]
|
| 566 |
+
|
| 567 |
if ext not in allowed_extensions:
|
| 568 |
raise HTTPException(status_code=400, detail=f"Unsupported file extension: {ext}")
|
| 569 |
|
|
|
|
| 587 |
with open(output_filename, "w", encoding="utf-8") as f:
|
| 588 |
f.write(text)
|
| 589 |
logging.debug(f"PDF conversion successful with PyMuPDF: {output_filename}")
|
| 590 |
+
|
| 591 |
# Other file formats to text conversion using Pandoc
|
| 592 |
else:
|
| 593 |
output = pypandoc.convert_file(input_filename, 'plain', outputfile=output_filename)
|
|
|
|
| 609 |
return JSONResponse(status_code=http_exc.status_code, content={"message": http_exc.detail})
|
| 610 |
except Exception as e:
|
| 611 |
logging.error(f"Error during conversion: {str(e)}")
|
| 612 |
+
return JSONResponse(status_code=500, content={"message": f"Internal error: {str(e)}"})
|