Spaces:

kuroiikimono
/

PDF_toPDF_translator

Running

File size: 39,319 Bytes

import streamlit as st
import zipfile, shutil, time
import os
import hashlib
#from streamlit_pdf_viewer import pdf_viewer
from streamlit import runtime
from streamlit.runtime.scriptrunner import get_script_run_ctx
from streamlit_js_eval import streamlit_js_eval
import secrets

import threading
from streamlit.runtime.scriptrunner import add_script_run_ctx
#import streamlit.components.v1 as components
from streamlit.runtime import get_instance

#from pypdf import PdfReader
import fitz # pymupdf
import glob
import logging
import io
import gc
import pprint

# White fill colour, looked up by name in fitz.pdfcolor; main() passes it as
# the `fill` of the rectangles it draws over each extracted text block.
WHITE = fitz.pdfcolor["white"]
# This flag ensures that text will be dehyphenated after extraction.
# It is passed as `flags=` to page.get_text("blocks", ...) in main().
textflags = fitz.TEXT_DEHYPHENATE

def get_size(path):
    """Return the size of the file at ``path`` as a human-readable string.

    Sizes are scaled by powers of 1024 and labelled ``bytes``, ``KB``,
    ``MB``, ``GB`` or ``TB``; scaled values are rounded to two decimals.
    Anything of 1024**4 bytes or more is reported in ``TB``, so the function
    always returns a string (callers concatenate the result into messages).

    Args:
        path: Path of an existing file.

    Returns:
        str: For example ``"500 bytes"`` or ``"1.5 KB"``.

    Raises:
        OSError: If ``os.path.getsize`` cannot stat ``path``.
    """
    size = os.path.getsize(path)
    if size < 1024:
        return f"{size} bytes"

    # Dividing by 1024 (a power of two) is exact in floating point, so the
    # step-wise scaling gives the same values as dividing by 1024**k at once.
    scaled = size / 1024
    for unit in ("KB", "MB", "GB"):
        if scaled < 1024:
            return f"{round(scaled, 2)} {unit}"
        scaled /= 1024

    # Largest supported unit: never fall through and return None.
    return f"{round(scaled, 2)} TB"

def get_remote_ip() -> str:
    """Get remote ip."""

    try:
        ctx = get_script_run_ctx()
        if ctx is None:
            return None

        session_info = runtime.get_instance().get_client(ctx.session_id)
        if session_info is None:
            return None
    except Exception as e:
        return None

    return session_info.request.remote_ip


# colab side make dir
def my_makedirs(path):
    """Create directory ``path``, including missing parents, if needed.

    Calling it for a directory that already exists is a no-op. Using
    ``exist_ok=True`` instead of a separate ``isdir`` check avoids failing
    when the directory is created between the check and the creation.

    Args:
        path: Directory path to create.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
    """
    os.makedirs(path, exist_ok=True)

def heart_beat():
    """
    Heartbeat function to track whether the session is alive.

    Each call checks whether the calling thread's session is still active in
    the Streamlit runtime. While it is, a 30-second ``threading.Timer`` is
    started that calls this function again. Once it is not, the session's
    working folder ``removefolder/<uniq>`` is deleted if it exists, and no
    further timer is scheduled.

    Returns immediately when the calling thread has no script-run context.
    """
    # context is required to get session_id of the calling
    # thread (which would be the script thread)
    ctx = get_script_run_ctx()
    if ctx is None:
        # No session to watch; ``ctx.session_id`` below would raise
        # AttributeError on None.
        return

    # this is the main runtime, contains all the sessions
    # (named ``active_runtime`` so the module-level ``runtime`` import is
    # not shadowed inside this function)
    active_runtime = get_instance()

    if active_runtime.is_active_session(session_id=ctx.session_id):
        logging.info("%s is alive.", ctx.session_id)

        # Only build the timer when it is actually going to be started.
        thread = threading.Timer(interval=30, function=heart_beat)

        # insert context to the current thread, needed for
        # getting session specific attributes like st.session_state
        add_script_run_ctx(thread)
        thread.start()
        return

    # Session is no longer active: remove its per-session working folder.
    if 'uniq' in st.session_state:
        session_folder = f"removefolder/{st.session_state.uniq}"
        if os.path.isdir(session_folder):
            shutil.rmtree(session_folder)
            logging.info("%s is gone.", ctx.session_id)

# JavaScript to detect browser exit: on the window's `beforeunload` event it
# POSTs to '/close_session' and logs the response text to the console.
# NOTE(review): no handler for '/close_session' is visible in this file —
# confirm that the server actually exposes that endpoint.
EXIT_JS = """
<script>
    window.addEventListener('beforeunload', function (event) {
        fetch('/close_session', {method: 'POST'}).then(response => {
            return response.text();
        }).then(data => {
            console.log(data);
        });
    });
</script>
"""

# Embed the JavaScript in the Streamlit app.
# This is a module-level call, so it runs whenever the script itself runs,
# before main() is reached.
# NOTE(review): EXIT_JS is an HTML <script> element (it was written for the
# commented-out components.html call), while the keyword here is named
# `js_expressions` — confirm streamlit_js_eval evaluates this as intended.
#components.html(EXIT_JS)
streamlit_js_eval(js_expressions = EXIT_JS)

def split_pdf_by_pages(doc, output_dir=None):
    """
    Split a PDF into one-page PDF files saved under ``output_dir``.

    Each page of ``doc`` is copied into a fresh one-page document, its fonts
    are subset, and the result is saved as ``<output_dir>/<page_num:08>.pdf``
    where ``page_num`` is the zero-based page number.

    Args:
        doc (fitz.Document): Input PDF document.
        output_dir (str, optional): Directory that receives the one-page
            PDFs; it is created if missing. When it is None (or empty)
            nothing is stored: the in-memory variant is disabled, so each
            page is still built but then discarded, and the returned dict
            is empty.

    Returns:
        tuple: ``(split_files, total_pages)``. ``split_files`` is a dict
        mapping the zero-based page number to the saved file path, and
        ``total_pages`` is ``doc.page_count``.

    Raises:
        Exception: Any error raised while building or saving a page is
            printed and then re-raised.
    """
    total_pages = doc.page_count
    split_files = {}

    if output_dir:
        # Create the target directory once instead of once per page.
        os.makedirs(output_dir, exist_ok=True)

    for page_num in range(total_pages):
        try:
            # Create a new, empty PDF document for this single page.
            new_doc = fitz.open()
            try:
                new_doc.insert_pdf(doc, from_page=page_num, to_page=page_num)
                new_doc.subset_fonts()

                if output_dir:
                    # Save to disk and remember where the page went.
                    output_path = os.path.join(output_dir, f"{page_num:08}.pdf")
                    new_doc.save(output_path)
                    split_files[page_num] = output_path
            finally:
                # Always release the page document, including on errors and
                # when no output directory was given.
                new_doc.close()
                del new_doc
                gc.collect()

        except Exception as e:
            print(f"Error processing page {page_num}: {e}")
            raise

    return split_files, total_pages

def merge_pdfs_in_groups(split_files, group_size):
    """
    Merge the PDFs in ``split_files`` in batches of ``group_size``.

    The inputs are first merged ``group_size`` at a time into intermediate
    in-memory PDFs; the intermediates are then merged, in order, into one
    final PDF. Progress is written to a Streamlit placeholder, which is
    cleared once the merge has succeeded.

    Args:
        split_files (list): PDFs to merge, in order. Each entry is passed to
            ``fitz.open`` (the caller visible in this file passes file paths).
        group_size (int): Number of input PDFs combined into each
            intermediate PDF.

    Returns:
        dict: ``{0: buffer}`` where ``buffer`` is an ``io.BytesIO`` holding
        the final merged PDF, positioned at offset 0.

    Raises:
        Exception: Errors outside the per-file and per-intermediate handlers
            are printed and re-raised. An input or intermediate PDF that
            fails to open or insert is only reported with ``print`` and
            skipped, so its pages are missing from the result.
    """
    waiting = st.empty()
    waiting.write("waiting...")
    intermediate_files = []  # intermediate PDFs, held in memory
    try:
        # Step 1: merge split_files group_size at a time into intermediates.
        for group_idx in range(0, len(split_files), group_size):
            # New empty PDF document for this group.
            new_doc = fitz.open()
            try:
                for file in split_files[group_idx:group_idx + group_size]:
                    try:
                        pdf_file = fitz.open(file)
                        try:
                            new_doc.insert_pdf(pdf_file)
                        finally:
                            # Close the source even when insert_pdf fails.
                            pdf_file.close()
                            del pdf_file
                    except Exception as e:
                        print(f"Error processing file {file}: {e}")
                        continue
                    gc.collect()  # release memory between files

                # Font optimisation.
                new_doc.subset_fonts()

                # Save the group to an in-memory buffer.
                output_pdf = io.BytesIO()
                new_doc.save(
                    output_pdf,
                    deflate=True,
                    garbage=4,
                    deflate_fonts=True,
                    use_objstms=1
                )
                output_pdf.seek(0)

                intermediate_files.append(output_pdf)
                print(f"Merged group {group_idx // group_size + 1} processed.")
            finally:
                new_doc.close()

        # Step 2: merge all intermediates into the final PDF.
        final_doc = fitz.open()
        try:
            intermediate_total = len(intermediate_files)
            for i, intermediate_file in enumerate(intermediate_files):
                waiting.write(f"working {int((i + 1) / intermediate_total * 100)}%")
                # Drop the list's reference so the buffer can really be
                # freed once this iteration is done with it.
                intermediate_files[i] = None
                try:
                    intermediate_pdf = fitz.open(stream=intermediate_file.getvalue(), filetype="pdf")
                    try:
                        final_doc.insert_pdf(intermediate_pdf)
                    finally:
                        # Close the intermediate even when insert_pdf fails.
                        intermediate_pdf.close()
                    print(f"Intermediate PDF {i + 1} merged into final document.")
                except Exception as e:
                    print(f"Error merging intermediate PDF {i + 1}: {e}")
                finally:
                    intermediate_file.close()
                    del intermediate_file
                    gc.collect()  # release memory between intermediates

            final_doc.subset_fonts()

            # Save the final PDF to an in-memory buffer.
            final_output_pdf = io.BytesIO()
            final_doc.save(
                final_output_pdf,
                deflate=True,
                garbage=4,
                deflate_fonts=True,
                use_objstms=1
            )
            final_output_pdf.seek(0)
        finally:
            final_doc.close()

        # Clear the progress placeholder; `del waiting` only unbound the
        # local name and left the last progress text on the page.
        waiting.empty()
        print("Final merged PDF created successfully.")
        return {0: final_output_pdf}
    except Exception as e:
        print(f"Error during merging process: {e}")
        raise

def main():

    if 'uniq' not in st.session_state:
        st.session_state.uniq = secrets.token_urlsafe()

    temp_dir = st.session_state.uniq
    my_makedirs(f"removefolder/{temp_dir}")

    flag = True
    if 'count' not in st.session_state:
        st.session_state.count = 0
    #tempolary
    if 'temp' not in st.session_state:
        st.session_state.temp = 0

    if 'lang' not in st.session_state:
        st.session_state.lang = ""
    if 'result' not in st.session_state:
        st.session_state.result = ""
    if 'split_pdfs' not in st.session_state:
        st.session_state.split_pdfs = []

    apptitle = st.empty()
    langs = st.empty()
    description = st.empty()
    obj_0 = st.empty()
    obj_1 = st.empty()
    apptitle.header("PDF file Translator 𓁨 β version. PDF 👉 translated PDF", divider='gray')
    langs.write('This App can translate to <[`Japanese`](https://rentry.co/fp5nmkir) , `English`, `French`, `Chinese (traditional)`, `Chinese (simplified)`, `Russian`, `Korean`, `Vietnamese`, `Thai`, `Catalan`, `Sinhalese`, `Nepall`, `Esperanto`>')
    description.markdown("""
It's easy to use. Just upload:outbox_tray:, select the language, and download the resulting .pdf file.

After uploading a PDF file and selecting the translation language, you can wait a while.

1. upload PDF.
2. select language you want to translate to.
3. just wait.
4. completed then download PDF. END. 

🏴‍☠️Note : <span style="color: red; ">The file size will be more than double the original PDF.</span> 🍄


When you press the download button, the PDF file will be downloaded, and if you select another translation language, the process will be repeated.:leftwards_arrow_with_hook:

The uploaded PDF file data will disappear <u>when you close the browser tab.</u> :thought_balloon: :eyes:

**Only PDF files can be uploaded.**

This translation app is useful for people who want to translate something or want to read something but cannot read it unless it is translated, and who want to quickly check the original text and the translation by comparing them in pairs. :yin_yang:

**Even if the PDF file has many pages, there is no limit to the number of pages or characters.**

<u>The untranslated data will be retained until the browser is closed, but once the app page is closed, the connection will be cut off and the data will be deleted.</u>:broken_heart:

####  FAQ :coffee:

:baby: **Q** : Does the translated text have information about paragraphs? :coffee:

:robot_face: **A** : YES.

The text extracted from the original text has the same position indexed with `:::info` as the translated text.

Regardless of the contents of the uploaded PDF or document, counting starts from zero and you can see which page or sentence you are in.
The original text has an `𓃰` : elephant mark after `:::info`, which are unicode characters representing Egyptian hieroglyphics.

𓃰page.1 block:00000-0;

Similarly, the translation of the original text is followed by a `𓆏` : frog mark.

𓆏page.1 block:00000-0;

~~:teapot: **Tips** : If you have a text editor with a pattern replacement function,~~
~~you can use the characters starting with~~
~~`:::info`~~
~~and ending with the line that contains only~~
~~`:::`~~
~~as a pattern to remove.~~

~~Try searching Google for keywords such as `grep and replace`.~~

:teapot: **Tips** : Other solution:
**Google translate** can translate PDF under 100 pages and under 10MB.So if you want to translate big one, you should divide pages.
Try [ THIS app ](https://huggingface.co/spaces/kuroiikimono/Splitpdf_streamlit2)


""", unsafe_allow_html=True)

    obj_0.header("`PDF file uploader` (1st step)")
#    st.markdown(f"The remote ip is `{get_remote_ip()}`")

    uploaded_file = obj_1.file_uploader("UPLOAD your .pdf file", type="pdf")
    ####
    if uploaded_file is not None:
        flag = False
        st.success("PDF file translator")
        # hashed
        raw_filename = uploaded_file.name
        
        intext_0 = f'<span style="color:LavenderBlush;background:Orchid">{raw_filename}</span>'
        st.write(intext_0, unsafe_allow_html=True)
        hashed_filename = hashlib.sha1(raw_filename.encode())
        uploadedfilename = hashed_filename.hexdigest()
        if "uploadedfilename" not in st.session_state:
            st.session_state.uploadedfilename = uploadedfilename

        if "book" not in st.session_state:
            #            pdf_viewer(input=uploaded_file.getvalue(), width=700, height=500)

            my_makedirs(
                f"removefolder/{temp_dir}/upload_folder_{st.session_state.count}"
            )

            with open(
                    f'removefolder/{temp_dir}/upload_folder_{st.session_state.count}/{uploadedfilename}.pdf',
                    'wb') as file:
                file.write(uploaded_file.getvalue())
            if os.path.isfile(f'removefolder/{temp_dir}/upload_folder_{st.session_state.count}/{uploadedfilename}.pdf'):
                file_size = get_size(f'removefolder/{temp_dir}/upload_folder_{st.session_state.count}/{uploadedfilename}.pdf')
                st.write("OK : " + file_size)
            
#            pdf_viewer(input=f'{temp_dir}/upload_folder_{st.session_state.count}/{uploadedfilename}.pdf', width=700, height=500)

# read from PDF file
            PDF = glob.glob(
                f"removefolder/{temp_dir}/upload_folder_{st.session_state.count}/{uploadedfilename}.pdf"
            )

            #doc = PdfReader(PDF[0])
            doc = fitz.open(PDF[0])
            # meta = doc.metadata
            #page_count = len(doc.pages)
            page_count = len(doc)
            
            ocg_xref = doc.add_ocg("White", on=True)

            book = []  # PDF text data pool
            progressbar1 = st.empty()

            my_bar1 = progressbar1.progress(0)

            #from bs4 import BeautifulSoup
            xml_line_blocks = st.empty()
            for index, page in enumerate(doc):

                blocks = page.get_text("blocks", flags=textflags)
                #pprint.pprint(index)
                #pprint.pprint(blocks)
                if blocks == []:
                    #print("blocks == []")
                    book.append((index, "", (0,0,0,0)))
                    xml_line_blocks.write("")
                else:
                    # Every block of text is contained in a rectangle ("bbox")
                    for block in blocks:
                        bbox = block[:4]  # area containing the text
                        origin = block[4]  # the text of this block
                        book.append((index, origin, bbox))
                        xml_line_blocks.write(origin)
                        page.draw_rect(bbox, color=None, fill=WHITE, fill_opacity=1, oc=ocg_xref)

                ##page_text = page.extract_text()
                ##page_text = page.get_text(sort=True)
                #blocks = page.get_text("xml")
                #soup = BeautifulSoup(blocks,'lxml-xml')
                #page_text2 = ""
                #temp_y_posi = 0.0
                #for tag0 in soup.find_all("block"):
                #    for tag1 in tag0.find_all("line"):
                #        for tag2 in tag1.find_all("font"):
                #            for tag3 in tag2.find_all("char"):
                #                y_posi = tag3.get("y")
                #                if y_posi != temp_y_posi:
                #                    page_text2 += "\n"
                #                    temp_y_posi = y_posi
                #                page_text2 += tag3.get("c")
                #
                #    page_text2 += "\n\n"
                #    xml_line_blocks.write(page_text2)

                ##book.append((index, page_text))
                #book.append((index, page_text2))
                done = int(((index + 1) / page_count) * 100)
                my_bar1.progress(done, text=f"Reading Page Number : {index + 1}")
            xml_line_blocks.empty()

            #for index, page in enumerate(doc.pages):
            #for index, page in enumerate(doc):
            #    #page_text = page.extract_text()
            #    page_text = page.get_text(sort=True)
            #    reading_text.write(page_text)
            #    book.append((index, page_text))
            #    done = int(((index + 1) / page_count) * 100)
            #    my_bar1.progress(done,
            #                     text=f"Reading Page Number : {index + 1}")

            split_files, total_pages = split_pdf_by_pages(doc, output_dir=f"removefolder/{temp_dir}/split_pdfs_{st.session_state.count}")
            st.session_state.split_pdfs = split_files

            doc.close()
            st.session_state.book = book
            my_bar1.empty()
            if os.path.isfile(
                    f"removefolder/{temp_dir}/upload_folder_{st.session_state.count}/{uploadedfilename}.pdf"
            ):
                shutil.rmtree(
                    f"removefolder/{temp_dir}/upload_folder_{st.session_state.count}/"
                )

        ########
        reload_bt = st.empty()
        if reload_bt.button("Upload another PDF file"):
            #for key in st.session_state.keys():
            #    if key == "count" or key == "temp" or key == "lang":
            #        continue
            #    else:
            #        del st.session_state[key]
            shutil.rmtree(f"removefolder/{temp_dir}")
            # page reload
            streamlit_js_eval(js_expressions="parent.window.location.reload()")
        st.markdown("----")

        plain_text1 = " 𓃠 select target language 𓃠 "
        var_text1 = f'##### <span style="color:green">{plain_text1}</span>'

        select = st.empty()
        select.write(var_text1, unsafe_allow_html=True)

        # select language
        st.markdown("""
        `ja`: **Japanese**,
        `en`: **English**,
        `fr`: **French**,
        `zb-TW`: **Chinese (traditional)**,
        `zh-CN`: **Chinese (simplified)**,
        `ru`: **Russian**,
        `ko`: **Korean**,
        `vi`: **Vietnamese**,
        `th`: **Thai**,
        `ca`: **Catalan**,
        `si`: **Sinhalese**,
        `ne`: **Nepall**,
        `eo`: **Esperanto**
        """)
        lang_code = [
            "select language",
            "Japanese",
            "English",
            "French",
            "Chinese traditional",
            "Chinese simplified",
            "Russian",
            "Korean",
            "Vietnamese",
            "Thai",
            "Catalan",
            "Sinhalese",
            "Nepall",
            "Esperanto",
        ]
        sel = st.empty()
        language = sel.radio(
                    label='translate to',
                    options=lang_code,
                    index=0,
                    key = f"select_lang{st.session_state.count}",
                    horizontal=True)
        #language = sel.selectbox(
        #    'translate to',
        #    lang_code,
        #    index=0,
            #placeholder = "select language",
        #    key=f"select_lang{st.session_state.count}")

        statename = f"select_lang{st.session_state.count}"
        if "target_lang" not in st.session_state:
            st.session_state.target_lang = "UNSELECTED"

        def reset_selected_lang():
            st.session_state[statename] = "select language"

        st.button('Reset Language', on_click=reset_selected_lang)

    area = st.empty()
    if flag:
        if "select_lang" in st.session_state:
            if st.session_state.select_lang != "select language":
                area2 = st.empty()
                plain_text2 = "☟Reset Language☟"
                empty_text = "☟              ☟"
                var_text2 = f'<span style="color:#FF69B4">{plain_text2}</span>'
                while flag:
                    area2.write(var_text2, unsafe_allow_html=True)
                    time.sleep(0.9)
                    area2.write(empty_text)
                    time.sleep(0.5)

        while flag:
            area.text("𓀤 upload PDF file 𓀤")
            time.sleep(1)
            area.text("𓀥                 𓀥")
            time.sleep(0.8)
    else:
        if f"select_lang{st.session_state.count}" in st.session_state:
            statename = f"select_lang{st.session_state.count}"
            if st.session_state[statename] != "select language":
                plain_text2 = "Reset Language"
                var_text2 = f'<span style="color:gray">▲ `{plain_text2}`</span>'
                area.write(var_text2, unsafe_allow_html=True)

        obj_0.empty()
        obj_1.empty()  # uploader hide
        
        # pdf translator
        #------------------------------------------
        st.markdown("----")
        st.success("translator")

        if "book" in st.session_state:
            book_data = st.session_state.book
            block_count = len(book_data)
            page_index = [x[0] for x in book_data]
            page_counts = len(set(page_index))
        else:
            block_count = 0
            page_count = 0

        st.text(f"PDF : {page_counts} pages total blocks : {block_count}")

        progressbar = st.empty()
        my_bar = progressbar.progress(0)

        #3
        #        from google.colab import output
        import re
        #from googletrans import Translator
        from deep_translator import GoogleTranslator

        title_name = re.sub("\.| |%|@|\"|\'", "_", f"{uploaded_file.name}")

        if st.session_state.temp != int(st.session_state.count):
            st.session_state.lang = "init"
            st.session_state.temp = int(st.session_state.count)

        if language not in lang_code[1:]:
            language = None

        if st.session_state.lang != language and language is not None:
            st.session_state.count += 1
            st.session_state.result = ""
            st.session_state.lang = language

            description.empty()

            my_makedirs(
                f"removefolder/{temp_dir}/work_{st.session_state.count}")

            to = ""
            match language:
                case "Japanese":
                    to = "ja"
                case "English":
                    to = "en"
                case "French":
                    to = "fr"
                case "Chinese traditional":
                    to = "zh-TW"
                case "Chinese simplified":
                    to = "zh-CN"
                case "Russian":
                    to = "ru"
                case "Korean":
                    to = "ko"
                case "Vietnamese":
                    to = "vi"
                case "Thai":
                    to = "th"
                case "Catalan":
                    to = "ca"
                case "Sinhalese":
                    to = "si"
                case "Nepall":
                    to = "ne"
                case "Esperanto":
                    to = "eo"
                case _:
                    to = "unknown"

            st.info(f"translate to [ {language} ]")

            note_text = f'<span style="color:DimGray;background:GhostWhite">If you close the browser tab, all the files you uploaded and the files you are working on in the translation process will be completely deleted.\
            If you try playing the YouTube video below and the sound stops, you will know that the browser tab has been closed or the connection has been lost.</span>'
            st.markdown(note_text, unsafe_allow_html=True)
            st.video("https://youtu.be/FItvc3QvQBw?si=KflrlSEu1mJybAum")
            
            st.session_state.target_lang = to

            plain_text5 = " 𓀡 results 𓁙 "
            var_text5 = f'##### <span style="color:#20B2AA">{plain_text5}</span>'
            st.write(var_text5, unsafe_allow_html=True)
            st.write(intext_0, unsafe_allow_html=True)
            
            work_area1 = st.empty()
            work_area2 = st.empty()
            #--------------------------------------
            
            block_list = []
            for index, block in enumerate(book_data):
                page_bbox = block[2]
                page_text = block[1]
                page_index = int(block[0])
                #                print("\nPage Number:" + str(index))
                done = int(((index + 1) / block_count) * 100)
                my_bar.progress(done,
                                text=f"page:{page_index + 1} Working block Number : {index + 1}")
                #  print(len(page_text))
                #  text_list = [s for s in page_text.split('\n') if s]
                page_text = re.sub('\.', '.𓂀', page_text)
                text_list = [s for s in page_text.split('𓂀')]
                if len(text_list) < 1:
                    continue

                limit = 0
                temp_list = []
                line_number = []
                block_text = ""

                for n, line in enumerate(text_list):
                        
                    line2 = re.sub(r"\s+", " ", line)
                    if line2 == "":
                        continue
                    temp_list.append((n, line2))

                    if len(temp_list) == 14 or n == len(text_list) - 1:
                        limit += 1
                        if limit == 2:
                            limit = 0
                            time.sleep(1)
                    
                        text_ = ""
                        all_text_orig = ""
                        all_text_done = ""
                        for i, t in enumerate(temp_list):
                            if t[1] != " ":
                                line_number.append(t[0])
                                text_ += '𓂀' + t[1].strip()
                        temp_list.clear()

                        text_2 = text_
                        text_ = re.sub('𓂀', "", text_)
                        #while (re.search('𓂀', text_2)):
                        #    num = line_number.pop(0)
                        #    rep_words = f"𓃐NO:{num}| "
                        #    text_2 = text_2.replace('𓂀', rep_words, 1)
                        line_number.clear()

                        #                        print(re.sub("𓃐","\n", text_2))
                        #ts = Translator()
                        all_text_orig = f":::info\n𓃰page.{page_index} block:{index + 1:05d}" + f"-{n}" + f";\n:::\n{text_}\n"

                        for _ in range(0, 5):

                            try:
                                tsd = GoogleTranslator(
                                    source="auto",
                                    target=to).translate(text=text_)
                                if tsd == None:
                                    tsd = text_
                                block_text += tsd + " "
                                #tsd = ts.translate(text_, src="en", dest="ja")
                                #translated_text = ts.translate(line, src="en", dest="ja").text
                                all_text_done = f":::info\n𓆏page.{page_index} block:{index + 1:05d}" + f"-{n}" + f";\n:::\n{tsd}\n"
                                #all_text_done = f"**{index:05d}" + f"-{n}" + "; " +  tsd.text + "\n"

                                # all_text_orig += str(n) + "; " + tsd.pronunciation + "\n"
                                # print(index,n, line)
                                # print(index,n, tsd.text)

                                #                                print(all_text_orig)
                                #                                print(all_text_done + "\n")
                                if type(all_text_orig) is str and type(all_text_done) is str:
#                                    screen_1 = st.empty()
#                                    screen_2 = st.empty()
                                    #color:DimGray;background:GhostWhite
                                    intext_1 = f'<span style="color:DimGray;background:GhostWhite">{all_text_orig}</span>'
#                                    screen_1.markdown(intext_1, unsafe_allow_html=True)
                                    work_area1.markdown(intext_1, unsafe_allow_html=True)
                                    #color:LavenderBlush;background:LightGray
#                                    intext_2 = f'<span style="color:black;background:LavenderBlush">{all_text_done}</span>'
                                    work_area2.write(f"{all_text_done}")
                                    #screen_2.markdown(intext_2, unsafe_allow_html=True)
#                                    screen_2.markdown(f"{all_text_done}")
#                                    work_area2.markdown(intext_2, unsafe_allow_html=True)

                                    with open(
                                            f"removefolder/{temp_dir}/work_{st.session_state.count}/reuseMarkdown.txt",
                                            "a") as tempf:
                                        tempf.write(all_text_orig + "\n\n" +
                                                    all_text_done + "\n\n")

                                    # st.session_state.result += all_text_orig + "\n\n"
                                    # st.session_state.result += all_text_done + "\n\n"

                                # print(n, tsd.pronunciation)
#                                with open(
#                                        f"removefolder/{temp_dir}/work_{st.session_state.count}/{title_name}_done.txt",
#                                        "a") as f:
#                                    f.write(all_text_orig + all_text_done +
#                                            "\n")
#                                with open(
#                                        f"removefolder/{temp_dir}/work_{st.session_state.count}/{title_name}_done_{language}.txt",
#                                        "a") as f:
#                                    f.write(all_text_done + "\n")

                                break

                            except Exception as e:
                                print(e)
                                time.sleep(4)
                                continue

#                        with open(
#                                f"removefolder/{temp_dir}/work_{st.session_state.count}/{title_name}_orig.txt",
#                                "a") as f:
#                            f.write(all_text_orig + "\n")

                block_list.append([page_index, page_bbox, block_text])

            output_dir = f"removefolder/{temp_dir}/split_pdfs_translated_{language}_{st.session_state.count}"
            os.makedirs(output_dir, exist_ok=True)

            progressbar2 = st.empty()
            my_bar2 = progressbar2.progress(0)
            translated_pdfs = []
            for index, block in enumerate(block_list):
                page_bbox = block[1]
                page_text = block[2]
                page_index = int(block[0])

                done = int(((index + 1) / len(block_list)) * 100)
                my_bar2.progress(done,
                                text=f"creating PDF\npage:{page_index + 1} Working block Number : {index + 1}")
                #file_path = f"removefolder/{temp_dir}/split_pdfs_{st.session_state.count}"
                #output = os.path.join(file_path, f"{page_index}.pdf")
                input = st.session_state.split_pdfs[page_index]
                output = os.path.join(output_dir, f"translated_{page_index:08}.pdf")
                if not os.path.isfile(output):
                    try:
                        doc = fitz.open(input)
                    except:
                        continue
                    if not output in translated_pdfs:
                        translated_pdfs.append(output)
                else:
                    try:
                        doc = fitz.open(output)
                    except:
                        continue
                    
                ocg_xref = doc.add_ocg(st.session_state.lang, on=True)
                
                page = doc[0]
                if page_bbox != (0,0,0,0):
                    #page.draw_rect(page_bbox, color=None, fill=WHITE, fill_opacity=1, oc=ocg_xref)
                    page.insert_htmlbox(
                            page_bbox,
                            f"{page_text}",
                            #css=f"{template}",
                            oc=ocg_xref)
                try:
                    doc.subset_fonts()
                    if not os.path.isfile(output):
                        doc.save(output, deflate=True, garbage=4, deflate_fonts=True, use_objstms=1)
                    else:
                        #https://stackoverflow.com/questions/63176013/inserting-pdf-page-on-existing-pdf
                        doc.save(output, incremental=True,encryption=fitz.PDF_ENCRYPT_KEEP, deflate=True, deflate_fonts=True, use_objstms=1)
                except Exception as e:
                    print(e)
                    print(output)
                finally:
                    doc.close()
                    del doc
            gc.collect()

            total_pages = len(st.session_state.split_pdfs)
            merged_files = merge_pdfs_in_groups(translated_pdfs, 10)

            #merged_files = merge_pdfs_in_groups(
            #        #st.session_state.split_pdfs,
            #        translated_pdfs,
            #        total_pages
            #    )


            #st.subheader("Translated PDF")
            #for i, file_obj in merged_files.items():
            #    last = (i + 1) * total_pages
            #    if i + 1 == len(merged_files.items()):
            #        last = total_pages
            #    create_pdf_download_link(
            #        file_obj.getvalue(),
            #        f"{i}_{last}_pages.pdf"
            #        )


            st.balloons()
            work_area2.write("completed.𓁙")
            st.markdown("----")
            #   """https://discuss.huggingface.co/t/download-issues/41743"""

            st.success("Download translated PDF file")
            st.write(intext_0, unsafe_allow_html=True)
            my_makedirs(f"removefolder/{temp_dir}/download_section")

            for group_num, pdf_data in merged_files.items():
                output_path = os.path.join(f"removefolder/{temp_dir}/download_section", f"translated_{group_num + 1}.pdf")
                if isinstance(pdf_data, io.BytesIO):
                    with open(output_path, "wb") as f:
                        f.write(pdf_data.getvalue())
                    #print(f"Saved {output_path}")
                    del pdf_data
                    gc.collect()
                else:
                    raise ValueErorr("pdf_data is not a valid BytesIO object")

                st.subheader("Translated PDF is here")

                if os.path.isfile(output_path):
                    if len(raw_filename) > 20:
                        out_filename = raw_filename[:10] + "--"
                    else:
                        out_filename = raw_filename
                    
                    f_size = get_size(output_path)
                    
                    with open(output_path, "rb") as fpath:
                        bt = st.download_button(
                            label=f"DOWNLOAD translated .pdf file ... {f_size}",
                            data=fpath,
                            file_name=f"translated_{out_filename}-[{language}].pdf",
                            mime="application/pdf"
                            )
            plain_text6 = "download pdf file"
            var_text6 = f'<span style="color:gray">▲ `{plain_text6}` 𓁉 </span>'
            st.write(var_text6, unsafe_allow_html=True)


            shutil.move(
                f"removefolder/{temp_dir}/work_{st.session_state.count}/reuseMarkdown.txt",
                f"removefolder/{temp_dir}/download_section/reuseMarkdown_{st.session_state.count}.txt"
            )
#
#            shutil.make_archive(
#                f'removefolder/{temp_dir}/download_section/{st.session_state.uploadedfilename}_{st.session_state.count}',\
#                format='zip',\
#                root_dir=f'removefolder/{temp_dir}/work_{st.session_state.count}'\
#                )
            shutil.rmtree(
                f"removefolder/{temp_dir}/work_{st.session_state.count}")

            #--------------------------------------

#            st.success("Download translated text files")
#            st.write(intext_0, unsafe_allow_html=True)
#            # plain_text3 = f"[ {st.session_state.target_lang} ] : translated text files"
#            plain_text3 = f"[ {language} ] : translated text files"
            plain_text3 = f"[ {language} ] : translated text "
            var_text3 = f'##### <span style="color:#FF69B4">{plain_text3}</span>'

            translated = st.empty()
            translated.write(var_text3, unsafe_allow_html=True)
#
#            if os.path.isfile(
#                    f'removefolder/{temp_dir}/download_section/{st.session_state.uploadedfilename}_{st.session_state.count}.zip'
#            ):
#                with open(
#                        f"removefolder/{temp_dir}/download_section/{st.session_state.uploadedfilename}_{st.session_state.count}.zip",
#                        "rb") as fpath:
#                    btn = st.download_button(
#                        label=f"DOWNLOAD .zip file",
#                        data=fpath,
#                        file_name=
#                        f"{st.session_state.uploadedfilename}_{st.session_state.count}.zip",
#                        mime="application/zip")
#
#            plain_text4 = "download zip file"
#            var_text4 = f'<span style="color:gray">▲ `{plain_text4}` 𓁉 </span>'
#            st.write(var_text4, unsafe_allow_html=True)
#
            st.markdown("----")

#            plain_text5 = " 𓀡 results 𓁙 "
#            var_text5 = f'##### <span style="color:#20B2AA">{plain_text5}</span>'
#            st.write(var_text5, unsafe_allow_html=True)
#
#            tempf = open(
#                f"removefolder/{temp_dir}/download_section/reuseMarkdown_{st.session_state.count}.txt"
#            )
#            all_result = tempf.read()
#            tempf.close()
#            st.write(intext_0, unsafe_allow_html=True)
#            st.write(all_result, unsafe_allow_html=True)
            # st.write(st.session_state.result, unsafe_allow_html=True)

if __name__ == "__main__":
    # Script entry point: only runs when this file is executed directly,
    # not when it is imported as a module.
    # Call heart_beat() first; per its definition it builds a 30-second
    # threading.Timer that targets heart_beat itself (session liveness tracking).
    heart_beat()

    # Then run the application's main routine.
    main()