import os
import tempfile
from multiprocessing import Pool, cpu_count

import gradio as gr
import fitz  # PyMuPDF
from transformers import pipeline, MBart50TokenizerFast, MBartForConditionalGeneration
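
# Pipeline overview: text is extracted from the uploaded PDF page by page, split into
# character-limited chunks, each chunk is summarized with the Falconsai summarization
# model, and the summaries are translated into the selected language with mBART-50
# (English source, one-to-many). Chunks are processed in parallel with a process pool.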

# Load summarization pipeline
summarizer = pipeline("summarization", model="Falconsai/text_summarization")

# Load translation model and tokenizer
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-one-to-many-mmt")
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-one-to-many-mmt", src_lang="en_XX")

# Maximum chunk length in characters (a rough proxy for the models' input limits)
max_chunk_length = 1024

# Split text into sentence-based chunks of at most max_chunk_length characters
def chunk_text(text, max_chunk_length):
    chunks = []
    current_chunk = ""
    for sentence in text.split("."):
        sentence = sentence.strip()
        if not sentence:
            continue
        if len(current_chunk) + len(sentence) + 1 <= max_chunk_length:
            if current_chunk:
                current_chunk += " "
            current_chunk += sentence
        else:
            if current_chunk:
                chunks.append(current_chunk)
            # A single sentence longer than max_chunk_length becomes its own chunk;
            # the tokenizer's truncation handles it downstream.
            current_chunk = sentence
    if current_chunk:
        chunks.append(current_chunk)
    return chunks
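
# Example (illustrative): chunk_text("one two. three four. five six.", 10)
#   -> ["one two", "three four", "five six"]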

# Function to summarize and translate a chunk
def summarize_and_translate_chunk(chunk, lang):
    summary = summarizer(chunk, max_length=150, min_length=30, do_sample=False)
    summary_text = summary[0]['summary_text']

    # Translate summary
    translated_chunk = translate_summary(summary_text, lang)
    return translated_chunk

# Function to translate the summary
def translate_summary(summary, lang):
    # Chunk text if it exceeds maximum length
    if len(summary) > max_chunk_length:
        chunks = chunk_text(summary, max_chunk_length)
    else:
        chunks = [summary]

    # Translate each chunk
    translated_chunks = []
    for chunk in chunks:
        inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True)
        generated_tokens = model.generate(
            **inputs,
            forced_bos_token_id=tokenizer.lang_code_to_id[lang],
            max_length=1024,
            num_beams=4,
            early_stopping=True,
            length_penalty=2.0,
        )
        translated_chunks.append(tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0])

    return " ".join(translated_chunks)

# Read the PDF, then summarize and translate it chunk by chunk
def summarize_and_translate_pdf(pdf_content, lang):
    if pdf_content is None:
        return ["Please upload a PDF file."]

    # Save the uploaded PDF bytes to a temporary file so PyMuPDF can open it
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
        temp_file.write(pdf_content)

    try:
        doc = fitz.open(temp_file.name)
    except Exception:
        os.remove(temp_file.name)
        return ["Could not open the uploaded file. Please make sure it is a valid PDF."]

    # Collect fixed-size character chunks from every page
    chunks = []
    for i in range(len(doc)):
        page = doc.load_page(i)
        text = page.get_text()
        chunks.extend([text[j:j + max_chunk_length] for j in range(0, len(text), max_chunk_length)])

    doc.close()

    # Delete the temporary file
    os.remove(temp_file.name)

    # Summarize and translate the chunks in parallel
    with Pool(cpu_count()) as pool:
        translated_chunks = pool.starmap(summarize_and_translate_chunk, [(chunk, lang) for chunk in chunks])

    return translated_chunks

# Gradio interface callback: map the selected language name to its mBART-50 code
def summarize_and_translate_interface(pdf_content, lang_name):
    translated_chunks = summarize_and_translate_pdf(pdf_content, LANG_CODES[lang_name])
    return "\n".join(translated_chunks)

# Gradio UI
# Display names mapped to mBART-50 target language codes
# (as listed on the facebook/mbart-large-50-one-to-many-mmt model card)
LANG_CODES = {
    "Arabic": "ar_AR", "Czech": "cs_CZ", "German": "de_DE", "English": "en_XX",
    "Spanish": "es_XX", "Estonian": "et_EE", "Finnish": "fi_FI", "French": "fr_XX",
    "Gujarati": "gu_IN", "Hindi": "hi_IN", "Italian": "it_IT", "Japanese": "ja_XX",
    "Kazakh": "kk_KZ", "Korean": "ko_KR", "Lithuanian": "lt_LT", "Latvian": "lv_LV",
    "Burmese": "my_MM", "Nepali": "ne_NP", "Dutch": "nl_XX", "Romanian": "ro_RO",
    "Russian": "ru_RU", "Sinhala": "si_LK", "Turkish": "tr_TR", "Vietnamese": "vi_VN",
    "Chinese": "zh_CN", "Afrikaans": "af_ZA", "Azerbaijani": "az_AZ", "Bengali": "bn_IN",
    "Persian": "fa_IR", "Hebrew": "he_IL", "Croatian": "hr_HR", "Indonesian": "id_ID",
    "Georgian": "ka_GE", "Khmer": "km_KH", "Macedonian": "mk_MK", "Malayalam": "ml_IN",
    "Mongolian": "mn_MN", "Marathi": "mr_IN", "Polish": "pl_PL", "Pashto": "ps_AF",
    "Portuguese": "pt_XX", "Swedish": "sv_SE", "Swahili": "sw_KE", "Tamil": "ta_IN",
    "Telugu": "te_IN", "Thai": "th_TH", "Tagalog": "tl_XX", "Ukrainian": "uk_UA",
    "Urdu": "ur_PK", "Xhosa": "xh_ZA", "Galician": "gl_ES", "Slovene": "sl_SI",
}

input_pdf = gr.File(label="Upload a PDF file", type="binary")
language = gr.Dropdown(choices=list(LANG_CODES.keys()), label="Select language for translation")
output_text = gr.Textbox(label="Translated Summary")

if __name__ == "__main__":
    gr.Interface(fn=summarize_and_translate_interface, inputs=[input_pdf, language], outputs=output_text).launch()