|
import streamlit as st |
|
import zipfile, shutil, time |
|
import os |
|
import hashlib |
|
|
|
from streamlit import runtime |
|
from streamlit.runtime.scriptrunner import get_script_run_ctx |
|
from streamlit_js_eval import streamlit_js_eval |
|
import secrets |
|
|
|
import threading |
|
from streamlit.runtime.scriptrunner import add_script_run_ctx |
|
|
|
from streamlit.runtime import get_instance |
|
|
|
|
|
import fitz |
|
import glob |
|
import logging |
|
import io |
|
import gc |
|
import pprint |
|
|
|
# Fill colour looked up in PyMuPDF's named-colour table; main() uses it for the
# rectangles it draws over every extracted text block.
WHITE = fitz.pdfcolor["white"]

# Flags handed to page.get_text() in main(); TEXT_DEHYPHENATE is the only flag set.
textflags = fitz.TEXT_DEHYPHENATE
|
|
|
def get_size(path):
    """Return the size of the file at *path* as a human-readable string.

    Sizes below 1024 bytes are reported as ``"<n> bytes"``; larger sizes are
    scaled by powers of 1024 and rounded to two decimals with a ``KB``,
    ``MB``, ``GB`` or ``TB`` suffix.

    Args:
        path: Path of an existing file (anything ``os.path.getsize`` accepts).

    Returns:
        str: The formatted size, e.g. ``"1.5 KB"``.

    Raises:
        OSError: If the file does not exist or is inaccessible.
    """
    size = os.path.getsize(path)
    if size < 1024:
        return f"{size} bytes"

    for unit in ("KB", "MB", "GB"):
        size /= 1024
        if size < 1024:
            return f"{round(size, 2)} {unit}"

    # Anything of 1 TiB or more used to fall through and return None; report
    # it in TB instead so callers always get a string.
    return f"{round(size / 1024, 2)} TB"
|
|
|
def get_remote_ip() -> str | None:
    """Return the remote IP of the client behind the current Streamlit session.

    This is a best-effort lookup: ``None`` is returned when there is no
    script-run context, when the runtime has no client registered for the
    session, or when any step of the lookup raises.

    Returns:
        The ``remote_ip`` of the session's request object, or ``None`` when it
        cannot be determined.
    """
    try:
        ctx = get_script_run_ctx()
        if ctx is None:
            return None

        session_info = runtime.get_instance().get_client(ctx.session_id)
        if session_info is None:
            return None

        # Kept inside the try so a client object without a request/remote_ip
        # degrades to None like every other failure path.
        return session_info.request.remote_ip
    except Exception:
        return None
|
|
|
|
|
|
|
def my_makedirs(path):
    """Create directory *path* (and any missing parents) if it does not exist.

    Uses ``exist_ok=True`` rather than a separate ``isdir`` check so that two
    sessions creating the same directory at the same time cannot race.

    Args:
        path: Directory to create.

    Raises:
        FileExistsError: If *path* exists but is not a directory.
    """
    os.makedirs(path, exist_ok=True)
|
|
|
def heart_beat():
    """Start the per-session heartbeat that cleans up once the session is gone.

    The script calls this on every Streamlit rerun. Only the first call in a
    session starts the 30-second timer chain; later calls return immediately
    so that reruns do not pile up additional timer threads.
    """
    if get_script_run_ctx() is None:
        # No Streamlit session to track (e.g. not started via `streamlit run`).
        return

    if st.session_state.get("heart_beat_started"):
        return
    st.session_state["heart_beat_started"] = True
    _heart_beat_tick()


def _heart_beat_tick():
    """Check the session once; reschedule while alive, clean up when gone."""
    ctx = get_script_run_ctx()
    if ctx is None:
        return

    if get_instance().is_active_session(session_id=ctx.session_id):
        logging.info("%s is alive.", ctx.session_id)
        timer = threading.Timer(interval=30, function=_heart_beat_tick)
        # Attach the current script-run context so the timer thread can still
        # reach this session's state when it fires.
        add_script_run_ctx(timer)
        timer.start()
        return

    # Session has ended: drop the session's working directory, if any.
    if 'uniq' in st.session_state:
        session_dir = f"removefolder/{st.session_state.uniq}"
        if os.path.isdir(session_dir):
            shutil.rmtree(session_dir)
    logging.info("%s is gone.", ctx.session_id)
|
|
|
|
|
# Browser-side snippet: on `beforeunload` it POSTs to `/close_session` and
# logs the response text to the console.
# NOTE(review): the snippet is wrapped in <script> tags although it is passed
# below as `js_expressions`; confirm the component accepts HTML here.
# NOTE(review): no `/close_session` handler is visible in this file — verify
# that the endpoint exists.
EXIT_JS = """
<script>
window.addEventListener('beforeunload', function (event) {
fetch('/close_session', {method: 'POST'}).then(response => {
return response.text();
}).then(data => {
console.log(data);
});
});
</script>
"""

# Executed at module level, i.e. each time this script is run.
streamlit_js_eval(js_expressions = EXIT_JS)
|
|
|
def split_pdf_by_pages(doc, output_dir=None):
    """Split a PDF into one single-page PDF file per page.

    Each page is copied into a fresh document, its fonts are subsetted, and,
    when *output_dir* is given, the result is saved there as
    ``<page number, zero-padded to 8 digits>.pdf``.

    Args:
        doc (fitz.Document): The source PDF document.
        output_dir (str, optional): Directory that receives the single-page
            files. It is created if missing. When it is ``None`` (or empty)
            nothing is written and the returned mapping is empty.

    Returns:
        tuple[dict, int]: A mapping of zero-based page number to the saved
        file path, and the total page count of *doc*.

    Raises:
        Exception: Any error raised while processing a page is reported on
            stdout and re-raised.
    """
    total_pages = doc.page_count
    split_files = {}

    if output_dir:
        # Loop-invariant: create the target directory once, not per page.
        os.makedirs(output_dir, exist_ok=True)

    for page_num in range(total_pages):
        new_doc = None
        try:
            new_doc = fitz.open()
            new_doc.insert_pdf(doc, from_page=page_num, to_page=page_num)
            new_doc.subset_fonts()

            if output_dir:
                output_path = os.path.join(output_dir, f"{page_num:08}.pdf")
                new_doc.save(output_path)
                split_files[page_num] = output_path
        except Exception as e:
            print(f"Error processing page {page_num}: {e}")
            raise
        finally:
            # Always release the per-page document, including on errors and
            # when no output directory was requested.
            if new_doc is not None:
                new_doc.close()
            del new_doc
            gc.collect()

    return split_files, total_pages
|
|
|
def merge_pdfs_in_groups(split_files, group_size):
    """Merge PDFs into one document, going through groups of *group_size*.

    The inputs are first merged group by group into in-memory intermediate
    PDFs, which are then merged into the final document. Progress is shown in
    a Streamlit placeholder that is cleared once merging has finished.

    Args:
        split_files (list): PDF files to merge, in order (anything
            ``fitz.open`` accepts as its first argument).
        group_size (int): Number of input PDFs combined into each
            intermediate PDF.

    Returns:
        dict: ``{0: io.BytesIO}`` holding the final merged PDF, positioned at
        the start of the stream.

    Raises:
        Exception: Errors outside the per-file / per-intermediate steps are
            reported on stdout and re-raised. A file or intermediate PDF that
            fails to open or insert is reported and skipped.
    """
    waiting = st.empty()
    waiting.write("waiting...")
    intermediate_files = []
    save_options = dict(deflate=True, garbage=4, deflate_fonts=True, use_objstms=1)

    try:
        # Stage 1: merge the inputs group by group into in-memory PDFs.
        for group_idx in range(0, len(split_files), group_size):
            new_doc = fitz.open()
            try:
                for file in split_files[group_idx:group_idx + group_size]:
                    try:
                        # The context manager closes the source document even
                        # when insert_pdf raises.
                        with fitz.open(file) as pdf_file:
                            new_doc.insert_pdf(pdf_file)
                    except Exception as e:
                        print(f"Error processing file {file}: {e}")
                        continue
                    finally:
                        gc.collect()

                new_doc.subset_fonts()

                output_pdf = io.BytesIO()
                new_doc.save(output_pdf, **save_options)
                output_pdf.seek(0)

                intermediate_files.append(output_pdf)
                print(f"Merged group {group_idx // group_size + 1} processed.")
            finally:
                new_doc.close()

        # Stage 2: merge the intermediate PDFs into the final document.
        final_doc = fitz.open()
        try:
            total = len(intermediate_files)
            for i, intermediate_file in enumerate(intermediate_files):
                waiting.write(f"working {int((i + 1) / total * 100)}%")
                try:
                    with fitz.open(stream=intermediate_file.getvalue(), filetype="pdf") as intermediate_pdf:
                        final_doc.insert_pdf(intermediate_pdf)
                    print(f"Intermediate PDF {i + 1} merged into final document.")
                except Exception as e:
                    print(f"Error merging intermediate PDF {i + 1}: {e}")
                    continue
                finally:
                    # Free the intermediate buffer whether or not it merged.
                    intermediate_file.close()
                    gc.collect()

            final_doc.subset_fonts()

            final_output_pdf = io.BytesIO()
            final_doc.save(final_output_pdf, **save_options)
            final_output_pdf.seek(0)
        finally:
            final_doc.close()

        # `del waiting` only unbound the local name; clear the placeholder so
        # the progress text actually disappears from the page.
        waiting.empty()
        print("Final merged PDF created successfully.")
        return {0: final_output_pdf}
    except Exception as e:
        print(f"Error during merging process: {e}")
        raise
|
|
|
def main():
    """Render the translator page and run the upload -> translate -> download flow.

    Steps, all driven by Streamlit reruns and ``st.session_state``:

    1. Create a per-session working directory under ``removefolder/``.
    2. On upload, extract every text block of the PDF, paint a white rectangle
       over it (on an optional-content layer), and split the PDF into
       single-page files.
    3. Once a target language is chosen, translate each block with
       ``GoogleTranslator``, logging original/translated pairs to
       ``reuseMarkdown.txt``.
    4. Write the translated text into each block's rectangle on the
       single-page PDFs, merge them, and offer the result for download.

    While no file is uploaded the function blinks a prompt in an endless loop;
    that loop only ends when Streamlit reruns the script.
    """
    import html
    import re

    from deep_translator import GoogleTranslator

    if 'uniq' not in st.session_state:
        st.session_state.uniq = secrets.token_urlsafe()

    temp_dir = st.session_state.uniq
    session_root = f"removefolder/{temp_dir}"
    my_makedirs(session_root)

    # True while no PDF has been uploaded yet.
    flag = True
    if 'count' not in st.session_state:
        st.session_state.count = 0

    if 'temp' not in st.session_state:
        st.session_state.temp = 0

    if 'lang' not in st.session_state:
        st.session_state.lang = ""
    if 'result' not in st.session_state:
        st.session_state.result = ""
    if 'split_pdfs' not in st.session_state:
        st.session_state.split_pdfs = []

    apptitle = st.empty()
    langs = st.empty()
    description = st.empty()
    obj_0 = st.empty()
    obj_1 = st.empty()
    apptitle.header("PDF file Translator 𓁨 β version. PDF 👉 translated PDF", divider='gray')
    langs.write('This App can translate to <[`Japanese`](https://rentry.co/fp5nmkir) , `English`, `French`, `Chinese (traditional)`, `Chinese (simplified)`, `Russian`, `Korean`, `Vietnamese`, `Thai`, `Catalan`, `Sinhalese`, `Nepall`, `Esperanto`>')
    description.markdown("""
It's easy to use. Just upload:outbox_tray:, select the language, and download the resulting .pdf file.

After uploading a PDF file and selecting the translation language, you can wait a while.

1. upload PDF.
2. select language you want to translate to.
3. just wait.
4. completed then download PDF. END.

🏴☠️Note : <span style="color: red; ">The file size will be more than double the original PDF.</span> 🍄


When you press the download button, the PDF file will be downloaded, and if you select another translation language, the process will be repeated.:leftwards_arrow_with_hook:

The uploaded PDF file data will disappear <u>when you close the browser tab.</u> :thought_balloon: :eyes:

**Only PDF files can be uploaded.**

This translation app is useful for people who want to translate something or want to read something but cannot read it unless it is translated, and who want to quickly check the original text and the translation by comparing them in pairs. :yin_yang:

**Even if the PDF file has many pages, there is no limit to the number of pages or characters.**

<u>The untranslated data will be retained until the browser is closed, but once the app page is closed, the connection will be cut off and the data will be deleted.</u>:broken_heart:

#### FAQ :coffee:

:baby: **Q** : Does the translated text have information about paragraphs? :coffee:

:robot_face: **A** : YES.

The text extracted from the original text has the same position indexed with `:::info` as the translated text.

Regardless of the contents of the uploaded PDF or document, counting starts from zero and you can see which page or sentence you are in.
The original text has an `𓃰` : elephant mark after `:::info`, which are unicode characters representing Egyptian hieroglyphics.

𓃰page.1 block:00000-0;

Similarly, the translation of the original text is followed by a `𓆏` : frog mark.

𓆏page.1 block:00000-0;

~~:teapot: **Tips** : If you have a text editor with a pattern replacement function,~~
~~you can use the characters starting with~~
~~`:::info`~~
~~and ending with the line that contains only~~
~~`:::`~~
~~as a pattern to remove.~~

~~Try searching Google for keywords such as `grep and replace`.~~

:teapot: **Tips** : Other solution:
**Google translate** can translate PDF under 100 pages and under 10MB.So if you want to translate big one, you should divide pages.
Try [ THIS app ](https://huggingface.co/spaces/kuroiikimono/Splitpdf_streamlit2)


""", unsafe_allow_html=True)

    obj_0.header("`PDF file uploader` (1st step)")

    uploaded_file = obj_1.file_uploader("UPLOAD your .pdf file", type="pdf")

    if uploaded_file is not None:
        flag = False
        st.success("PDF file translator")

        raw_filename = uploaded_file.name

        # The file name is user-controlled and rendered with
        # unsafe_allow_html, so escape it before embedding it in markup.
        intext_0 = f'<span style="color:LavenderBlush;background:Orchid">{html.escape(raw_filename)}</span>'
        st.write(intext_0, unsafe_allow_html=True)
        # Store the upload under a hash of its name so the on-disk path never
        # contains user-supplied characters.
        uploadedfilename = hashlib.sha1(raw_filename.encode()).hexdigest()
        if "uploadedfilename" not in st.session_state:
            st.session_state.uploadedfilename = uploadedfilename

        if "book" not in st.session_state:
            upload_dir = f"{session_root}/upload_folder_{st.session_state.count}"
            upload_path = f"{upload_dir}/{uploadedfilename}.pdf"
            my_makedirs(upload_dir)

            with open(upload_path, 'wb') as file:
                file.write(uploaded_file.getvalue())
            if os.path.isfile(upload_path):
                st.write(f"OK : {get_size(upload_path)}")

            book = []
            progressbar1 = st.empty()
            my_bar1 = progressbar1.progress(0)
            xml_line_blocks = st.empty()

            doc = fitz.open(upload_path)
            try:
                page_count = len(doc)

                # Layer holding the white rectangles that cover original text.
                ocg_xref = doc.add_ocg("White", on=True)

                for index, page in enumerate(doc):
                    blocks = page.get_text("blocks", flags=textflags)

                    if blocks == []:
                        # Keep a placeholder so pages without text still
                        # appear in the output; (0,0,0,0) marks "nothing to
                        # draw".
                        book.append((index, "", (0,0,0,0)))
                        xml_line_blocks.write("")
                    else:
                        for block in blocks:
                            bbox = block[:4]
                            origin = block[4]
                            book.append((index, origin, bbox))
                            xml_line_blocks.write(origin)
                            page.draw_rect(bbox, color=None, fill=WHITE, fill_opacity=1, oc=ocg_xref)

                    done = int(((index + 1) / page_count) * 100)
                    my_bar1.progress(done, text=f"Reading Page Number : {index + 1}")
                xml_line_blocks.empty()

                split_files, _ = split_pdf_by_pages(
                    doc,
                    output_dir=f"{session_root}/split_pdfs_{st.session_state.count}",
                )
            finally:
                doc.close()

            st.session_state.split_pdfs = split_files
            st.session_state.book = book
            my_bar1.empty()
            # The raw upload is no longer needed once it has been split.
            if os.path.isfile(upload_path):
                shutil.rmtree(upload_dir)

        reload_bt = st.empty()
        if reload_bt.button("Upload another PDF file"):
            shutil.rmtree(session_root)
            streamlit_js_eval(js_expressions="parent.window.location.reload()")

    st.markdown("----")

    plain_text1 = " 𓃠 select target language 𓃠 "
    var_text1 = f'##### <span style="color:green">{plain_text1}</span>'

    select = st.empty()
    select.write(var_text1, unsafe_allow_html=True)

    # The legend previously showed `zb-TW`; the code actually used is `zh-TW`.
    st.markdown("""
`ja`: **Japanese**,
`en`: **English**,
`fr`: **French**,
`zh-TW`: **Chinese (traditional)**,
`zh-CN`: **Chinese (simplified)**,
`ru`: **Russian**,
`ko`: **Korean**,
`vi`: **Vietnamese**,
`th`: **Thai**,
`ca`: **Catalan**,
`si`: **Sinhalese**,
`ne`: **Nepall**,
`eo`: **Esperanto**
""")
    # Radio label -> translator language code.
    language_codes = {
        "Japanese": "ja",
        "English": "en",
        "French": "fr",
        "Chinese traditional": "zh-TW",
        "Chinese simplified": "zh-CN",
        "Russian": "ru",
        "Korean": "ko",
        "Vietnamese": "vi",
        "Thai": "th",
        "Catalan": "ca",
        "Sinhalese": "si",
        "Nepall": "ne",
        "Esperanto": "eo",
    }
    lang_code = ["select language", *language_codes]

    # The widget key embeds the run counter, so the radio falls back to
    # "select language" after each completed translation.
    statename = f"select_lang{st.session_state.count}"
    sel = st.empty()
    language = sel.radio(
        label='translate to',
        options=lang_code,
        index=0,
        key=statename,
        horizontal=True)

    if "target_lang" not in st.session_state:
        st.session_state.target_lang = "UNSELECTED"

    def reset_selected_lang():
        st.session_state[statename] = "select language"

    st.button('Reset Language', on_click=reset_selected_lang)

    area = st.empty()
    if flag:
        # NOTE(review): the radio's key is "select_lang<count>", so the plain
        # "select_lang" key tested here appears never to be set — confirm
        # whether this branch is still wanted.
        if "select_lang" in st.session_state:
            if st.session_state.select_lang != "select language":
                area2 = st.empty()
                plain_text2 = "☟Reset Language☟"
                empty_text = "☟ ☟"
                var_text2 = f'<span style="color:#FF69B4">{plain_text2}</span>'
                while flag:
                    area2.write(var_text2, unsafe_allow_html=True)
                    time.sleep(0.9)
                    area2.write(empty_text)
                    time.sleep(0.5)

        # Blink the upload prompt until a rerun interrupts the script.
        while flag:
            area.text("𓀤 upload PDF file 𓀤")
            time.sleep(1)
            area.text("𓀥 𓀥")
            time.sleep(0.8)
    else:
        if statename in st.session_state:
            if st.session_state[statename] != "select language":
                plain_text2 = "Reset Language"
                var_text2 = f'<span style="color:gray">▲ `{plain_text2}`</span>'
                area.write(var_text2, unsafe_allow_html=True)

    obj_0.empty()
    obj_1.empty()

    st.markdown("----")
    st.success("translator")

    if "book" in st.session_state:
        book_data = st.session_state.book
        block_count = len(book_data)
        page_counts = len({entry[0] for entry in book_data})
    else:
        # Previously this branch set `page_count` and left `book_data`
        # undefined, so the lines below would have raised NameError.
        book_data = []
        block_count = 0
        page_counts = 0

    st.text(f"PDF : {page_counts} pages total blocks : {block_count}")

    progressbar = st.empty()
    my_bar = progressbar.progress(0)

    # A changed run counter means the previous translation finished; forget
    # the previously selected language so it can be chosen again.
    if st.session_state.temp != int(st.session_state.count):
        st.session_state.lang = "init"
        st.session_state.temp = int(st.session_state.count)

    if language not in language_codes:
        language = None

    if st.session_state.lang != language and language is not None:
        st.session_state.count += 1
        st.session_state.result = ""
        st.session_state.lang = language

        description.empty()

        work_dir = f"{session_root}/work_{st.session_state.count}"
        markdown_log = f"{work_dir}/reuseMarkdown.txt"
        my_makedirs(work_dir)

        to = language_codes.get(language, "unknown")

        st.info(f"translate to [ {language} ]")

        note_text = (
            '<span style="color:DimGray;background:GhostWhite">'
            'If you close the browser tab, all the files you uploaded and the files you are working on in the translation process will be completely deleted. '
            'If you try playing the YouTube video below and the sound stops, you will know that the browser tab has been closed or the connection has been lost.'
            '</span>'
        )
        st.markdown(note_text, unsafe_allow_html=True)
        st.video("https://youtu.be/FItvc3QvQBw?si=KflrlSEu1mJybAum")

        st.session_state.target_lang = to

        plain_text5 = " 𓀡 results 𓁙 "
        var_text5 = f'##### <span style="color:#20B2AA">{plain_text5}</span>'
        st.write(var_text5, unsafe_allow_html=True)
        st.write(intext_0, unsafe_allow_html=True)

        work_area1 = st.empty()
        work_area2 = st.empty()

        # ---- Translate every text block --------------------------------
        block_list = []
        for index, block in enumerate(book_data):
            page_bbox = block[2]
            page_text = block[1]
            page_index = int(block[0])

            done = int(((index + 1) / block_count) * 100)
            my_bar.progress(done,
                            text=f"page:{page_index + 1} Working block Number : {index + 1}")

            # Split after every full stop; '𓂀' is only a temporary marker.
            text_list = page_text.replace('.', '.𓂀').split('𓂀')
            last_n = len(text_list) - 1

            limit = 0
            temp_list = []
            block_text = ""

            for n, line in enumerate(text_list):
                line2 = re.sub(r"\s+", " ", line)
                if line2:
                    temp_list.append(line2)

                # Send a chunk every 14 sentences and at the end of the
                # block. The end-of-block check also runs when the final
                # piece is empty, so trailing sentences are no longer lost.
                if not temp_list or (len(temp_list) < 14 and n != last_n):
                    continue

                # Throttle: pause for a second on every second request.
                limit += 1
                if limit == 2:
                    limit = 0
                    time.sleep(1)

                text_ = "".join(t.strip() for t in temp_list if t != " ")
                temp_list.clear()

                all_text_orig = f":::info\n𓃰page.{page_index} block:{index + 1:05d}-{n};\n:::\n{text_}\n"

                for _ in range(5):
                    # Only the network call is retried; keeping the writes
                    # outside the try prevents a later failure from
                    # appending the same translation more than once.
                    try:
                        tsd = GoogleTranslator(
                            source="auto",
                            target=to).translate(text=text_)
                    except Exception as e:
                        print(e)
                        time.sleep(4)
                        continue

                    if tsd is None:
                        # Fall back to the untranslated text.
                        tsd = text_
                    block_text += tsd + " "

                    all_text_done = f":::info\n𓆏page.{page_index} block:{index + 1:05d}-{n};\n:::\n{tsd}\n"

                    intext_1 = f'<span style="color:DimGray;background:GhostWhite">{all_text_orig}</span>'
                    work_area1.markdown(intext_1, unsafe_allow_html=True)
                    work_area2.write(f"{all_text_done}")

                    # The log contains non-ASCII markers, so pin the encoding
                    # instead of relying on the platform default.
                    with open(markdown_log, "a", encoding="utf-8") as tempf:
                        tempf.write(all_text_orig + "\n\n" +
                                    all_text_done + "\n\n")
                    break

            block_list.append([page_index, page_bbox, block_text])

        # ---- Write the translations into the single-page PDFs ----------
        output_dir = f"{session_root}/split_pdfs_translated_{language}_{st.session_state.count}"
        os.makedirs(output_dir, exist_ok=True)

        progressbar2 = st.empty()
        my_bar2 = progressbar2.progress(0)
        translated_pdfs = []
        for index, block in enumerate(block_list):
            page_bbox = block[1]
            page_text = block[2]
            page_index = int(block[0])

            done = int(((index + 1) / len(block_list)) * 100)
            my_bar2.progress(done,
                             text=f"creating PDF\npage:{page_index + 1} Working block Number : {index + 1}")

            source_pdf = st.session_state.split_pdfs[page_index]
            target_pdf = os.path.join(output_dir, f"translated_{page_index:08}.pdf")
            first_block_of_page = not os.path.isfile(target_pdf)
            try:
                # The first block of a page starts from the split page; later
                # blocks reopen the partly translated output.
                doc = fitz.open(source_pdf if first_block_of_page else target_pdf)
            except Exception as e:
                print(e)
                continue
            if first_block_of_page and target_pdf not in translated_pdfs:
                translated_pdfs.append(target_pdf)

            try:
                ocg_xref = doc.add_ocg(st.session_state.lang, on=True)

                page = doc[0]
                if page_bbox != (0,0,0,0):
                    page.insert_htmlbox(
                        page_bbox,
                        f"{page_text}",
                        oc=ocg_xref)
                try:
                    doc.subset_fonts()
                    if not os.path.isfile(target_pdf):
                        doc.save(target_pdf, deflate=True, garbage=4, deflate_fonts=True, use_objstms=1)
                    else:
                        doc.save(target_pdf, incremental=True, encryption=fitz.PDF_ENCRYPT_KEEP, deflate=True, deflate_fonts=True, use_objstms=1)
                except Exception as e:
                    print(e)
                    print(target_pdf)
            finally:
                doc.close()
                del doc
                gc.collect()

        merged_files = merge_pdfs_in_groups(translated_pdfs, 10)

        st.balloons()
        work_area2.write("completed.𓁙")
        st.markdown("----")

        # ---- Offer the merged PDF for download -------------------------
        st.success("Download translated PDF file")
        st.write(intext_0, unsafe_allow_html=True)
        download_dir = f"{session_root}/download_section"
        my_makedirs(download_dir)

        for group_num, pdf_data in merged_files.items():
            output_path = os.path.join(download_dir, f"translated_{group_num + 1}.pdf")
            if not isinstance(pdf_data, io.BytesIO):
                raise ValueError("pdf_data is not a valid BytesIO object")
            with open(output_path, "wb") as f:
                f.write(pdf_data.getvalue())
            del pdf_data
            gc.collect()

        st.subheader("Translated PDF is here")

        if os.path.isfile(output_path):
            if len(raw_filename) > 20:
                out_filename = raw_filename[:10] + "--"
            else:
                out_filename = raw_filename

            f_size = get_size(output_path)

            with open(output_path, "rb") as fpath:
                st.download_button(
                    label=f"DOWNLOAD translated .pdf file ... {f_size}",
                    data=fpath,
                    file_name=f"translated_{out_filename}-[{language}].pdf",
                    mime="application/pdf"
                )
            plain_text6 = "download pdf file"
            var_text6 = f'<span style="color:gray">▲ `{plain_text6}` 𓁉 </span>'
            st.write(var_text6, unsafe_allow_html=True)

        # Keep the original/translation log next to the download, then drop
        # the work directory. The log is absent when nothing was translated.
        if os.path.isfile(markdown_log):
            shutil.move(
                markdown_log,
                f"{download_dir}/reuseMarkdown_{st.session_state.count}.txt"
            )

        shutil.rmtree(work_dir)

        plain_text3 = f"[ {language} ] : translated text "
        var_text3 = f'##### <span style="color:#FF69B4">{plain_text3}</span>'

        translated = st.empty()
        translated.write(var_text3, unsafe_allow_html=True)

        st.markdown("----")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Script entry point: start the session heartbeat, then render the page.
if __name__ == "__main__":
    # Runs before main() because main() can block in its blinking-prompt loop.
    heart_beat()

    main()