kuroiikimono committed on
Commit
d3b3956
·
verified ·
1 Parent(s): 1710e14

Upload main (1).py

Browse files
Files changed (1) hide show
  1. main (1).py +515 -0
main (1).py ADDED
@@ -0,0 +1,515 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import zipfile, shutil, time
3
+ import os
4
+ import hashlib
5
+ from streamlit_pdf_viewer import pdf_viewer
6
+ from streamlit import runtime
7
+ from streamlit.runtime.scriptrunner import get_script_run_ctx
8
+ from streamlit_js_eval import streamlit_js_eval
9
+ import secrets
10
+
11
+ import threading
12
+ from streamlit.runtime.scriptrunner import add_script_run_ctx
13
+ import streamlit.components.v1 as components
14
+ from streamlit.runtime import get_instance
15
+
16
+ from pypdf import PdfReader
17
+ import glob
18
+ import logging
19
+
20
+
21
def get_remote_ip() -> str | None:
    """Return the remote IP address of the client of the current session.

    Returns:
        The ``remote_ip`` attribute of the client's request, or ``None``
        when there is no script-run context, when the runtime has no client
        registered for the session, or when any step of the lookup raises.
    """
    try:
        ctx = get_script_run_ctx()
        if ctx is None:
            return None

        session_info = runtime.get_instance().get_client(ctx.session_id)
        if session_info is None:
            return None

        # Kept inside the ``try`` so that a client object without the
        # expected ``request.remote_ip`` attribute also yields ``None``
        # instead of crashing the page that merely displays the address.
        return session_info.request.remote_ip
    except Exception:
        # Deliberately best-effort: the address is informational only.
        return None
36
+
37
+
38
+ # colab side make dir
39
def my_makedirs(path):
    """Create directory *path* (and missing parents) if it does not exist.

    Args:
        path: Directory path to create.

    Raises:
        FileExistsError: If *path* exists but is not a directory.
    """
    # ``exist_ok=True`` removes the check-then-create race of a separate
    # ``os.path.isdir`` test when two script runs create the same folder.
    os.makedirs(path, exist_ok=True)
42
+
43
def heart_beat():
    """Track whether the calling session is alive and clean up when it ends.

    While the runtime reports the session as active, a ``threading.Timer``
    calls this function again after 2 seconds. Once the session is no longer
    active, the session's ``removefolder/<uniq>`` directory is removed and no
    further timer is started.

    NOTE(review): every direct call starts its own timer chain; if the script
    calls this on each rerun, chains accumulate for the lifetime of the
    session -- confirm whether that is intended.
    """
    # Context is required to get the session_id of the calling thread
    # (the script thread, or a timer thread the context was attached to).
    ctx = get_script_run_ctx()
    if ctx is None:
        # Without a context there is no session to watch.
        logging.warning("heart_beat called without a script run context.")
        return

    # This is the main runtime, which contains all the sessions. Named so
    # that it does not shadow the module-level ``runtime`` import.
    streamlit_runtime = get_instance()

    if streamlit_runtime.is_active_session(session_id=ctx.session_id):
        logging.info(f"{ctx.session_id} is alive.")
        thread = threading.Timer(interval=2, function=heart_beat)
        # Insert the context into the timer thread; needed for session
        # specific attributes like st.session_state.
        add_script_run_ctx(thread)
        thread.start()
        return

    # Session is gone: remove its working folder, if one was ever created.
    # ``uniq`` can be missing (it is deleted by the "upload another file"
    # handler), so it is looked up defensively.
    uniq = st.session_state.get("uniq")
    logging.info(f"cleaning up session folder for {uniq}")
    if uniq is not None and os.path.isdir(f"removefolder/{uniq}"):
        shutil.rmtree(f"removefolder/{uniq}")
    logging.info(f"{ctx.session_id} is gone.")
70
+
71
# JavaScript that POSTs to '/close_session' when the page is about to unload.
# NOTE(review): no handler for '/close_session' is defined in this file, and
# requests started during 'beforeunload' may be cancelled by the browser --
# confirm that this hook actually reaches the server.
EXIT_JS = """
<script>
window.addEventListener('beforeunload', function (event) {
    fetch('/close_session', {method: 'POST'}).then(response => {
        return response.text();
    }).then(data => {
        console.log(data);
    });
});
</script>
"""

# Embed the JavaScript in the Streamlit app. The component has no visible
# content, so it is given zero height instead of the default-sized iframe
# that would leave an empty box at the top of the page.
components.html(EXIT_JS, height=0)
86
+
87
# Separator inserted after every "." so that a page can be split into
# sentences.
# NOTE(review): the glyphs in this literal (and in a few UI strings inside
# main) look mis-encoded; they are kept exactly as found -- confirm the
# intended characters before changing them.
_SENTENCE_MARK = 'π“‚€'


def _iter_sentence_batches(page_text, batch_size=15):
    """Yield ``(last_index, text)`` for batches of sentences of one page.

    The page is split after every ".". Each piece has its whitespace runs
    collapsed to a single space; empty pieces are skipped. A batch is emitted
    once it holds *batch_size* pieces, and whatever is still pending is
    emitted at the end of the page. ``text`` is the stripped pieces joined
    without a separator; ``last_index`` is the index of the last piece
    examined when the batch was emitted.
    """
    import re  # the file has no module-level ``import re``

    pieces = page_text.replace(".", "." + _SENTENCE_MARK).split(_SENTENCE_MARK)
    last = len(pieces) - 1
    pending = []
    for n, piece in enumerate(pieces):
        squeezed = re.sub(r"\s+", " ", piece)
        if squeezed:
            pending.append(squeezed)
        # The end-of-page flush is checked even when the final piece is
        # empty (page text ending in "."); otherwise the trailing sentences
        # of such a page would never be emitted.
        if len(pending) == batch_size or (n == last and pending):
            text = "".join(s.strip() for s in pending if s != " ")
            pending = []
            if text:  # nothing to translate in a whitespace-only batch
                yield n, text


def _translate_with_retry(translate, text, attempts=5, delay=3):
    """Call ``translate(text=text)``, retrying on any error.

    Returns the translation, or *text* itself when the translator returns
    ``None`` or every attempt fails, so that no passage is silently dropped.
    """
    for attempt in range(1, attempts + 1):
        try:
            translated = translate(text=text)
        except Exception:
            logging.exception("translation attempt %d/%d failed", attempt,
                              attempts)
            if attempt < attempts:
                time.sleep(delay)
        else:
            return text if translated is None else translated
    return text


def _append_utf8(path, text):
    """Append *text* to *path* as UTF-8, creating the file if needed."""
    with open(path, "a", encoding="utf-8") as handle:
        handle.write(text)


def main():
    """Render the PDF translator page for the current Streamlit session.

    Flow, as far as it can be read from this function:

    1. Ensure a per-session working folder ``removefolder/<uniq>`` exists and
       initialise the ``count``, ``temp``, ``lang`` and ``result`` state keys.
    2. Show the uploader. Until a PDF is uploaded the function loops forever
       on a blinking hint (the loop only ends when the script run is
       stopped from outside).
    3. After an upload, extract the text of every page once and cache it in
       ``st.session_state.book``.
    4. When a target language is selected that differs from
       ``st.session_state.lang``, translate the text batch by batch with
       ``GoogleTranslator``, write the text files, zip them, and show a
       download button followed by the combined result.

    NOTE(review): the nesting of this function was reconstructed from a
    source without indentation. In particular, confirm the placement of the
    "Upload another PDF file" button (here: only after an upload) and of the
    download/results section (here: only in the run that translated).
    """
    import html
    import re

    if 'uniq' not in st.session_state:
        st.session_state.uniq = secrets.token_urlsafe()

    temp_dir = st.session_state.uniq
    session_dir = f"removefolder/{temp_dir}"
    my_makedirs(session_dir)

    # Per-session defaults. ``temp`` mirrors ``count`` from the previous run.
    for key, default in (("count", 0), ("temp", 0), ("lang", ""),
                         ("result", "")):
        if key not in st.session_state:
            st.session_state[key] = default

    flag = True  # stays True until a PDF has been uploaded

    obj_0 = st.empty()
    obj_1 = st.empty()

    obj_0.header("`PDF file uploader`")
    st.markdown(f"The remote ip is `{get_remote_ip()}`")

    uploaded_file = obj_1.file_uploader("UPLOAD your .pdf file", type="pdf")

    if uploaded_file is not None:
        flag = False
        st.success("PDF file translator")

        raw_filename = uploaded_file.name
        # The file name is user-controlled and rendered as raw HTML, so it
        # is escaped before being interpolated into the markup.
        intext_0 = ('<span style="color:LavenderBlush;background:Orchid">'
                    f'{html.escape(raw_filename)}</span>')
        st.write(intext_0, unsafe_allow_html=True)

        # Files on disk are named after the hash, not the raw file name.
        uploadedfilename = hashlib.sha1(raw_filename.encode()).hexdigest()
        if "uploadedfilename" not in st.session_state:
            st.session_state.uploadedfilename = uploadedfilename

        if "book" not in st.session_state:
            upload_dir = f"{session_dir}/upload_folder_{st.session_state.count}"
            pdf_path = f"{upload_dir}/{uploadedfilename}.pdf"
            my_makedirs(upload_dir)
            with open(pdf_path, 'wb') as file:
                file.write(uploaded_file.getvalue())

            # read from PDF file
            doc = PdfReader(pdf_path)
            page_count = len(doc.pages)

            book = []  # PDF text data pool: (page index, page text)
            progressbar1 = st.empty()
            my_bar1 = progressbar1.progress(0)
            for index, page in enumerate(doc.pages):
                book.append((index, page.extract_text()))
                done = int(((index + 1) / page_count) * 100)
                my_bar1.progress(done,
                                 text=f"Reading Page Number : {index + 1}")
            st.session_state.book = book
            my_bar1.empty()

            # The extracted text is cached; the uploaded copy is not needed.
            shutil.rmtree(upload_dir)

        reload_bt = st.empty()
        if reload_bt.button("Upload another PDF file"):
            # Iterate over a snapshot: keys are deleted inside the loop.
            for key in list(st.session_state.keys()):
                if key not in ("count", "temp", "lang"):
                    del st.session_state[key]
            shutil.rmtree(session_dir)
            # page reload
            streamlit_js_eval(js_expressions="parent.window.location.reload()")

    st.markdown("----")

    plain_text1 = " 𓃠 select target language 𓃠 "
    var_text1 = f'##### <span style="color:green">{plain_text1}</span>'

    select = st.empty()
    select.write(var_text1, unsafe_allow_html=True)

    # select language
    st.markdown("""
    `ja`: **Japanese**,
    `en`: **English**,
    `fr`: **French**,
    `zh-TW`: **Chinese (traditional)**,
    `zh-CN`: **Chinese (simplified)**,
    `ru`: **Russian**,
    `ko`: **Korean**,
    `vi`: **Vietnamese**,
    `th`: **Thai**,
    `ca`: **Catalan**,
    `si`: **Sinhalese**
    """)

    # Display name -> translator language code (insertion order is the
    # order shown in the select box).
    lang_to_code = {
        "Japanese": "ja",
        "English": "en",
        "French": "fr",
        "Chinese traditional": "zh-TW",
        "Chinese simplified": "zh-CN",
        "Russian": "ru",
        "Korean": "ko",
        "Vietnamese": "vi",
        "Thai": "th",
        "Catalan": "ca",
        "Sinhalese": "si",
    }
    lang_code = ["select language", *lang_to_code]

    # The widget key carries the translation counter, so a new select box
    # (reset to index 0) is created after every finished translation.
    statename = f"select_lang{st.session_state.count}"
    sel = st.empty()
    language = sel.selectbox('translate to',
                             lang_code,
                             index=0,
                             key=statename)

    if "target_lang" not in st.session_state:
        st.session_state.target_lang = "UNSELECTED"

    def reset_selected_lang():
        st.session_state[statename] = "select language"

    st.button('Reset Language', on_click=reset_selected_lang)

    area = st.empty()
    if flag:
        # NOTE(review): the select box key always has the counter suffix,
        # so the bare "select_lang" key is never set by this function and
        # this first hint looks unreachable -- confirm before relying on it.
        if ("select_lang" in st.session_state
                and st.session_state.select_lang != "select language"):
            area2 = st.empty()
            plain_text2 = "☟Reset Language☟"
            empty_text = "☟ ☟"
            var_text2 = f'<span style="color:#FF69B4">{plain_text2}</span>'
            while True:
                area2.write(var_text2, unsafe_allow_html=True)
                time.sleep(0.9)
                area2.write(empty_text)
                time.sleep(0.5)

        # No PDF yet: blink the upload hint. This loop never exits on its
        # own, so nothing below runs until a file has been uploaded.
        while True:
            area.text("π“€€ upload PDF file π“€€")
            time.sleep(1)
            area.text("π“€₯ π“€₯")
            time.sleep(0.8)

    # ---- from here on a PDF has been uploaded ----
    if (statename in st.session_state
            and st.session_state[statename] != "select language"):
        plain_text2 = "Reset Language"
        var_text2 = f'<span style="color:gray">β–² `{plain_text2}`</span>'
        area.write(var_text2, unsafe_allow_html=True)

    obj_0.empty()
    obj_1.empty()  # uploader hide

    # pdf translator
    st.markdown("----")
    st.success("translator")

    # ``book`` can be missing if the session keys were just cleared above.
    book_data = st.session_state.get("book", [])
    page_count = len(book_data)

    st.text(f"PDF total pages : {page_count}")

    progressbar = st.empty()
    my_bar = progressbar.progress(0)

    from deep_translator import GoogleTranslator

    # Used in output file names: besides the original replacements, path
    # separators are neutralised so the name cannot leave the work folder.
    title_name = re.sub(r"[. %@\"'/\\]", "_", uploaded_file.name)

    # ``count`` changed since the previous run: forget the last language.
    if st.session_state.temp != int(st.session_state.count):
        st.session_state.lang = "init"
        st.session_state.temp = int(st.session_state.count)

    if language not in lang_to_code:
        language = None

    # Nothing to do without a (newly) selected target language.
    if language is None or st.session_state.lang == language:
        return

    st.session_state.count += 1
    st.session_state.result = ""
    st.session_state.lang = language
    count = st.session_state.count

    work_dir = f"{session_dir}/work_{count}"
    my_makedirs(work_dir)

    to = lang_to_code[language]
    st.info(f"translate to [ {language} ]")
    st.session_state.target_lang = to

    work_area2 = st.empty()

    reuse_path = f"{work_dir}/reuseMarkdown.txt"
    done_path = f"{work_dir}/{title_name}_done.txt"
    done_lang_path = f"{work_dir}/{title_name}_done_{language}.txt"
    orig_path = f"{work_dir}/{title_name}_orig.txt"

    # Make sure the combined result exists even when no text could be
    # extracted, so that moving and reading it below cannot fail.
    _append_utf8(reuse_path, "")

    translator = GoogleTranslator(source="auto", target=to)

    for index, page in enumerate(book_data):
        done = int(((index + 1) / page_count) * 100)
        my_bar.progress(done, text=f"Working Page Number : {index + 1}")

        for n, text_ in _iter_sentence_batches(page[1]):
            tag = f"{index + 1:05d}-{n}"
            all_text_orig = f":::info\n𓃰{tag};\n:::\n{text_}\n"

            # Only the network call is retried; the file appends happen
            # once, so a retry can never duplicate output.
            tsd = _translate_with_retry(translator.translate, text_)
            all_text_done = f":::info\n𓆏{tag};\n:::\n{tsd}\n"

            work_area2.write(f"{all_text_done}")

            _append_utf8(reuse_path,
                         all_text_orig + "\n\n" + all_text_done + "\n\n")
            _append_utf8(done_path, all_text_orig + all_text_done + "\n")
            _append_utf8(done_lang_path, all_text_done + "\n")
            _append_utf8(orig_path, all_text_orig + "\n")

    st.balloons()
    st.markdown("----")

    download_dir = f"{session_dir}/download_section"
    my_makedirs(download_dir)

    # The combined markdown is shown on the page, not shipped in the zip.
    result_path = f"{download_dir}/reuseMarkdown_{count}.txt"
    shutil.move(reuse_path, result_path)

    zip_stem = f"{st.session_state.uploadedfilename}_{count}"
    shutil.make_archive(f"{download_dir}/{zip_stem}",
                        format='zip',
                        root_dir=work_dir)
    shutil.rmtree(work_dir)

    st.success("Download translated text files")
    st.write(intext_0, unsafe_allow_html=True)
    plain_text3 = f"[ {language} ] : translated text files"
    var_text3 = f'##### <span style="color:#FF69B4">{plain_text3}</span>'

    translated = st.empty()
    translated.write(var_text3, unsafe_allow_html=True)

    zip_path = f"{download_dir}/{zip_stem}.zip"
    if os.path.isfile(zip_path):
        with open(zip_path, "rb") as fpath:
            st.download_button(label="DOWNLOAD .zip file",
                               data=fpath,
                               file_name=f"{zip_stem}.zip",
                               mime="application/zip")

    plain_text4 = "download zipfile"
    var_text4 = f'<span style="color:gray">β–² `{plain_text4}` 𓁉 </span>'
    st.write(var_text4, unsafe_allow_html=True)

    st.markdown("----")

    plain_text5 = " 𓀑 results 𓁙 "
    var_text5 = f'##### <span style="color:#20B2AA">{plain_text5}</span>'
    st.write(var_text5, unsafe_allow_html=True)

    with open(result_path, encoding="utf-8") as tempf:
        all_result = tempf.read()
    st.write(intext_0, unsafe_allow_html=True)
    # NOTE(review): this renders text extracted from the uploaded PDF (and
    # its translation) with HTML enabled -- confirm that is intended.
    st.write(all_result, unsafe_allow_html=True)
512
# Script entry point: start the session heartbeat first, then render the page.
if __name__ == "__main__":
    heart_beat()
    main()