yasmine110 committed on
Commit
0586248
·
verified ·
1 Parent(s): 4ad7a66

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +293 -151
app.py CHANGED
@@ -1,155 +1,297 @@
1
- from flask import Flask, render_template, request, redirect, url_for, send_file
2
- from werkzeug.utils import secure_filename
3
- import os
 
 
 
 
 
 
 
 
4
  from transformers import pipeline
5
- import PyPDF2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  from docx import Document
7
- import pandas as pd
8
- import pptx
9
- from io import StringIO, BytesIO
10
- import tempfile
11
-
12
- app = Flask(__name__)
13
-
14
- # Configuration
15
- UPLOAD_FOLDER = 'uploads'
16
- ALLOWED_EXTENSIONS = {'pdf', 'docx', 'pptx', 'xlsx'}
17
- app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
18
-
19
- # Charger le modèle de traduction (à adapter selon le modèle choisi)
20
- translator = pipeline("translation", model="Helsinki-NLP/opus-mt-fr-en") # Exemple français-anglais
21
-
22
- def allowed_file(filename):
23
- return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS
24
-
25
- def extract_text_from_pdf(file_path):
26
- text = ""
27
- with open(file_path, 'rb') as file:
28
- reader = PyPDF2.PdfReader(file)
29
- for page in reader.pages:
30
- text += page.extract_text() + "\n"
31
- return text
32
-
33
- def extract_text_from_docx(file_path):
34
- doc = Document(file_path)
35
- return "\n".join([para.text for para in doc.paragraphs])
36
-
37
- def extract_text_from_pptx(file_path):
38
- prs = pptx.Presentation(file_path)
39
- text = []
40
- for slide in prs.slides:
41
- for shape in slide.shapes:
42
- if hasattr(shape, "text"):
43
- text.append(shape.text)
44
- return "\n".join(text)
45
-
46
- def extract_text_from_excel(file_path):
47
- df = pd.read_excel(file_path)
48
- return df.to_string()
49
-
50
- def translate_document(file_path, target_language='en'):
51
- # Détection du type de fichier
52
- ext = file_path.rsplit('.', 1)[1].lower()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
 
54
- # Extraction du texte
55
- if ext == 'pdf':
56
- text = extract_text_from_pdf(file_path)
57
- elif ext == 'docx':
58
- text = extract_text_from_docx(file_path)
59
- elif ext == 'pptx':
60
- text = extract_text_from_pptx(file_path)
61
- elif ext in {'xlsx', 'xls'}:
62
- text = extract_text_from_excel(file_path)
63
  else:
64
- return None
65
-
66
- # Traduction du texte
67
- translated_text = translator(text, tgt_lang=target_language)[0]['translation_text']
68
-
69
- # Création d'un nouveau document avec le texte traduit
70
- output = BytesIO()
71
- if ext == 'pdf':
72
- # Pour PDF, on crée un nouveau fichier texte (simplifié)
73
- with tempfile.NamedTemporaryFile(suffix='.txt', delete=False) as tmp:
74
- tmp.write(translated_text.encode('utf-8'))
75
- return tmp.name
76
- elif ext == 'docx':
77
- doc = Document()
78
- doc.add_paragraph(translated_text)
79
- doc.save(output)
80
- elif ext == 'pptx':
81
- prs = pptx.Presentation()
82
- slide = prs.slides.add_slide(prs.slide_layouts[1])
83
- slide.shapes[0].text = translated_text
84
- prs.save(output)
85
- elif ext in {'xlsx', 'xls'}:
86
- df = pd.read_csv(StringIO(translated_text))
87
- df.to_excel(output, index=False)
88
-
89
- output.seek(0)
90
- return output
91
-
92
- @app.route('/', methods=['GET', 'POST'])
93
- def upload_file():
94
- if request.method == 'POST':
95
- # Vérification du fichier
96
- if 'file' not in request.files:
97
- return redirect(request.url)
98
- file = request.files['file']
99
- if file.filename == '':
100
- return redirect(request.url)
101
-
102
- if file and allowed_file(file.filename):
103
- filename = secure_filename(file.filename)
104
- filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
105
- file.save(filepath)
106
-
107
- # Récupération de la langue cible
108
- target_lang = request.form.get('target_lang', 'en')
109
-
110
- # Traduction du document
111
- translated_doc = translate_document(filepath, target_lang)
112
-
113
- if isinstance(translated_doc, str):
114
- # Fichier texte (PDF transformé en TXT)
115
- return send_file(
116
- translated_doc,
117
- as_attachment=True,
118
- download_name=filename.rsplit('.', 1)[0] + '_translated.txt',
119
- mimetype='text/plain'
120
- )
121
- else:
122
- # Autres formats
123
- return send_file(
124
- translated_doc,
125
- as_attachment=True,
126
- download_name=filename.rsplit('.', 1)[0] + '_translated.' + filename.rsplit('.', 1)[1],
127
- mimetype=file.mimetype
128
- )
129
-
130
- return '''
131
- <!doctype html>
132
- <html>
133
- <head>
134
- <title>Document Translation Service</title>
135
- </head>
136
- <body>
137
- <h1>Upload a document for translation</h1>
138
- <form method=post enctype=multipart/form-data>
139
- <input type=file name=file>
140
- <select name=target_lang>
141
- <option value="en">English</option>
142
- <option value="fr">French</option>
143
- <option value="es">Spanish</option>
144
- <option value="de">German</option>
145
- <option value="it">Italian</option>
146
- </select>
147
- <input type=submit value=Translate>
148
- </form>
149
- </body>
150
- </html>
151
- '''
152
-
153
- if __name__ == '__main__':
154
- os.makedirs(UPLOAD_FOLDER, exist_ok=True)
155
- app.run(debug=True)
 
1
+ Yasmineben
2
+ yasmineben0327
3
+ En ligne
4
+
5
+ Yasmineben — 21/04/2025 21:37
6
+ CCCCCCCC
7
+ Rayhane — Hier à 21:13
8
+ HIII
9
+ from fastapi import FastAPI, File, UploadFile, HTTPException, Form
10
+ from fastapi.responses import HTMLResponse
11
+ from fastapi.staticfiles import StaticFiles
12
  from transformers import pipeline
13
+ import textwrap
14
+ import fitz # PyMuPDF for PDF handling
15
+ Afficher plus
16
+ message.txt
17
+ 6 Ko
18
+ Code python
19
+ Rayhane — Hier à 21:22
20
+ <!DOCTYPE html>
21
+ <html lang="fr">
22
+ <head>
23
+ <meta charset="UTF-8">
24
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
25
+ <title>AI Document Translator Pro</title>
26
+ <link href="https://cdn.jsdelivr.net/npm/[email protected]/dist/css/bootstrap.min.css" rel="stylesheet">
27
+ <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css">
28
+ <script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
29
+ <script src="https://cdn.jsdelivr.net/npm/sweetalert2@11"></script>
30
+ <style>
31
+ :root {
32
+ --primary-color: #4361ee;
33
+ --secondary-color: #3f37c9;
34
+ --accent-color: #4cc9f0;
35
+ --dark-color: #1a1a2e;
36
+ --light-color: #f8f9fa;
37
+ }
38
+
39
+ body {
40
+ background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
41
+ color: var(--dark-color);
42
+ min-height: 100vh;
43
+ font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
44
+ }
45
+
46
+ .main-container {
47
+ max-width: 800px;
48
+ background: white;
49
+ padding: 2rem;
50
+ border-radius: 15px;
51
+ box-shadow: 0 10px 30px rgba(0, 0, 0, 0.1);
52
+ margin: 2rem auto;
53
+ }
54
+
55
+ .header {
56
+ text-align: center;
57
+ margin-bottom: 2rem;
58
+ position: relative;
59
+ }
60
+
61
+ .logo {
62
+ font-size: 2.5rem;
63
+ color: var(--primary-color);
64
+ margin-bottom: 0.5rem;
65
+ }
66
+
67
+ .title {
68
+ font-weight: 700;
69
+ color: var(--dark-color);
70
+ margin-bottom: 0.5rem;
71
+ }
72
+
73
+ .subtitle {
74
+ color: #6c757d;
75
+ font-size: 1rem;
76
+ }
77
+
78
+ .upload-section {
79
+ background: rgba(67, 97, 238, 0.05);
80
+ border: 2px dashed rgba(67, 97, 238, 0.3);
81
+ border-radius: 10px;
82
+ padding: 2rem;
83
+ margin-bottom: 2rem;
84
+ transition: all 0.3s ease;
85
+ }
86
+
87
+ .upload-section:hover {
88
+ border-color: var(--primary-color);
89
+ background: rgba(67, 97, 238, 0.1);
90
+ }
91
+
92
+ .file-info {
93
+ background: var(--light-color);
94
+ padding: 0.5rem;
95
+ border-radius: 5px;
96
+ font-size: 0.9rem;
97
+ margin-top: 1rem;
98
+ }
99
+
100
+ .form-control, .form-select {
101
+ border-radius: 8px;
102
+ padding: 0.75rem 1rem;
103
+ border: 1px solid #ced4da;
104
+ transition: all 0.3s ease;
105
+ }
106
+
107
+ .form-control:focus, .form-select:focus {
108
+ border-color: var(--primary-color);
109
+ box-shadow: 0 0 0 0.25rem rgba(67, 97, 238, 0.25);
110
+ }
111
+
112
+ .btn-primary {
113
+ background-color: var(--primary-color);
114
+ border-color: var(--primary-color);
115
+ border-radius: 8px;
116
+ padding: 0.75rem 1.5rem;
117
+ font-weight: 600;
118
+ transition: all 0.3s ease;
119
+ }
120
+ ... (252lignes restantes)
121
+ Réduire
122
+ message.txt
123
+ 14 Ko
124
+ hadi html
125
+ fastapi
126
+ uvicorn
127
+ transformers
128
+ torch
129
+ PyMuPDF
130
+ python-docx
131
+ openpyxl
132
+ python-pptx
133
+ textwrap3
134
+ python-multipart
135
+ sentencepiece
136
+ hadi requirements.txt
137
+ 
138
+ Rayhane
139
+ rayhane0778_65551
140
+ from fastapi import FastAPI, File, UploadFile, HTTPException, Form
141
+ from fastapi.responses import HTMLResponse
142
+ from fastapi.staticfiles import StaticFiles
143
+ from transformers import pipeline
144
+ import textwrap
145
+ import fitz # PyMuPDF for PDF handling
146
  from docx import Document
147
+ import openpyxl # For Excel
148
+ from pptx import Presentation
149
+ from fastapi.middleware.cors import CORSMiddleware
150
+ from functools import lru_cache
151
+ import os
152
+ from io import BytesIO
153
+
154
# Initialize FastAPI app
app = FastAPI()

# Enable CORS to allow frontend communication.
# Credentials are disabled on purpose: combining wildcard origins with
# allow_credentials=True lets any website make credentialed requests, and the
# CORS specification does not allow "*" together with credentials. Nothing in
# this module reads cookies or authorization headers.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Directory for static files
STATIC_DIR = "static"

# Ensure the directory exists before it is mounted below
# (exist_ok avoids the check-then-create race of os.path.exists + makedirs).
os.makedirs(STATIC_DIR, exist_ok=True)

# Serve static files under /static
app.mount("/static", StaticFiles(directory=STATIC_DIR, html=True), name="static")
175
+
176
@app.get("/", response_class=HTMLResponse)
async def read_root():
    """Return the contents of ``index.html`` from the static directory.

    Raises:
        HTTPException: 404 when ``index.html`` does not exist in STATIC_DIR.
    """
    landing_page = os.path.join(STATIC_DIR, "index.html")
    try:
        with open(landing_page, "r", encoding="utf-8") as handle:
            markup = handle.read()
    except FileNotFoundError:
        raise HTTPException(status_code=404, detail="index.html not found in static folder.")
    return HTMLResponse(content=markup)
184
+
185
# Supported languages: display name -> language code used in model keys
LANGUAGE_CODES = {
    "Anglais": "en",
    "Francais": "fr",
    "Arabe": "ar",
    "Espagnol": "es",
}

# Available translation models, keyed by "<source code>-<target code>"
AVAILABLE_MODELS = {
    "fr-en": "Helsinki-NLP/opus-mt-fr-en",
    "en-fr": "Helsinki-NLP/opus-mt-en-fr",
    "ar-en": "Helsinki-NLP/opus-mt-ar-en",
    "en-ar": "Helsinki-NLP/opus-mt-en-ar",
    "es-en": "Helsinki-NLP/opus-mt-es-en",
    "en-es": "Helsinki-NLP/opus-mt-en-es",
}


# Cache models for better performance
@lru_cache(maxsize=10)
def load_translator(src_code: str, tgt_code: str):
    """Return the translator(s) needed to translate src_code -> tgt_code.

    Args:
        src_code: Source language code (e.g. "fr").
        tgt_code: Target language code (e.g. "en").

    Returns:
        A single translation pipeline when a direct model is listed in
        AVAILABLE_MODELS, otherwise a ``(to_english, from_english)`` tuple of
        pipelines that pivots through English.

    Raises:
        ValueError: If neither a direct model nor both legs of an English
            pivot are listed in AVAILABLE_MODELS.
    """
    direct_model = AVAILABLE_MODELS.get(f"{src_code}-{tgt_code}")
    if direct_model is not None:
        return pipeline("translation", model=direct_model)

    # Pivot through English only when BOTH legs exist; otherwise the pipeline
    # would be built with model=None instead of failing clearly.
    if src_code != "en" and tgt_code != "en":
        has_to_english = f"{src_code}-en" in AVAILABLE_MODELS
        has_from_english = f"en-{tgt_code}" in AVAILABLE_MODELS
        if has_to_english and has_from_english:
            # Go through the cached loader so each leg shares the pipeline
            # already loaded for the direct pair instead of loading it twice.
            return (
                load_translator(src_code, "en"),
                load_translator("en", tgt_code),
            )

    raise ValueError(f"No model available for {src_code} -> {tgt_code}")
219
+
220
# Function to split text into chunks
def chunk_text(text, max_length=400):
    """Wrap *text* into a list of lines, each at most *max_length* characters wide."""
    wrapper = textwrap.TextWrapper(width=max_length)
    return wrapper.wrap(text)
223
+
224
# Function to extract text from files
def extract_text(file: UploadFile):
    """Extract the plain text of an uploaded .txt, .pdf, .docx, .xlsx or .pptx file.

    The format is chosen from the filename extension (case-insensitive).

    Args:
        file: The uploaded file; its whole content is read into memory.

    Returns:
        The extracted text as a single string.

    Raises:
        HTTPException: 400 if the extension is not supported, 500 if reading
            or parsing the file fails.
    """
    # Lower-case once so ".PDF" is accepted; tolerate a missing filename.
    filename = (file.filename or "").lower()

    try:
        file_bytes = file.file.read()
        file_stream = BytesIO(file_bytes)

        if filename.endswith(".txt"):
            return file_bytes.decode("utf-8")

        if filename.endswith(".pdf"):
            # Context manager closes the PDF document once the text is read.
            with fitz.open(stream=file_bytes, filetype="pdf") as doc:
                return "\n".join(page.get_text() for page in doc)

        if filename.endswith(".docx"):
            doc = Document(file_stream)
            return "\n".join(para.text for para in doc.paragraphs)

        if filename.endswith(".xlsx"):
            wb = openpyxl.load_workbook(file_stream)
            rows = []
            for sheet in wb.sheetnames:
                for row in wb[sheet].iter_rows():
                    # Only None means "empty cell"; `value or ""` would also
                    # erase legitimate 0 / False values.
                    rows.append(
                        "\t".join(
                            "" if cell.value is None else str(cell.value)
                            for cell in row
                        )
                    )
            # One line per row, each terminated by a newline.
            return "".join(f"{line}\n" for line in rows)

        if filename.endswith(".pptx"):
            prs = Presentation(file_stream)
            return "".join(
                f"{shape.text}\n"
                for slide in prs.slides
                for shape in slide.shapes
                if hasattr(shape, "text")
            )

        raise HTTPException(status_code=400, detail="Unsupported file format.")

    except HTTPException:
        # Keep the deliberate 400 above from being rewritten as a 500 below.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error extracting text: {str(e)}") from e
264
+
265
def _translate_chunks(translator, text):
    """Translate *text* chunk by chunk and join the results with newlines."""
    return "\n".join(
        translator(chunk)[0]["translation_text"] for chunk in chunk_text(text)
    )


# POST route for file upload
@app.post("/upload/")
async def upload_file(
    file: UploadFile = File(...),
    src_lang: str = Form(...),
    tgt_lang: str = Form(...)
):
    """Translate the text of an uploaded document.

    Args:
        file: Uploaded document to translate.
        src_lang: Source language name; must be a key of LANGUAGE_CODES.
        tgt_lang: Target language name; must be a key of LANGUAGE_CODES.

    Returns:
        A dict of the form ``{"translated_text": <str>}``.

    Raises:
        HTTPException: 400 if no text could be extracted, a language name is
            unknown, or no model exists for the language pair; 500 if loading
            the model or translating fails.
    """
    # NOTE(review): extraction and translation are blocking calls inside an
    # async route, so they hold the event loop while they run.
    text = extract_text(file)

    if not text.strip():
        raise HTTPException(status_code=400, detail="No text extracted from the file.")

    src_code = LANGUAGE_CODES.get(src_lang)
    tgt_code = LANGUAGE_CODES.get(tgt_lang)

    if not src_code or not tgt_code:
        raise HTTPException(status_code=400, detail=f"Unsupported language: {src_lang} -> {tgt_lang}")

    try:
        translator = load_translator(src_code, tgt_code)
    except ValueError as e:
        # An unsupported language pair is a client error, not a server fault.
        raise HTTPException(status_code=400, detail=str(e)) from e
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Internal error: {str(e)}") from e

    try:
        if isinstance(translator, tuple):
            # Two-step translation: source -> English -> target.
            to_english, from_english = translator
            intermediate_text = _translate_chunks(to_english, text)
            translated_text = _translate_chunks(from_english, intermediate_text)
        else:
            translated_text = _translate_chunks(translator, text)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Internal error: {str(e)}") from e

    return {"translated_text": translated_text}
297
+