|
import os
import tempfile
from multiprocessing import Pool, cpu_count

import fitz
import gradio as gr
from transformers import pipeline, MBart50TokenizerFast, MBartForConditionalGeneration
|
|
|
|
|
# Summarization pipeline applied to each raw chunk of PDF text.
summarizer = pipeline(
    "summarization",
    model="Falconsai/text_summarization",
)

# mBART-50 checkpoint used for translation; the tokenizer is configured with
# English ("en_XX") as the source language.
tokenizer = MBart50TokenizerFast.from_pretrained(
    "facebook/mbart-large-50-one-to-many-mmt",
    src_lang="en_XX",
)
model = MBartForConditionalGeneration.from_pretrained(
    "facebook/mbart-large-50-one-to-many-mmt"
)

# Maximum size, in characters, of a text chunk handed to the models.
max_chunk_length = 1024
|
|
|
|
|
def chunk_text(text, max_chunk_length):
    """Split *text* into chunks of at most *max_chunk_length* characters.

    The text is cut at periods and consecutive sentences are packed into the
    same chunk (joined by a single space) for as long as they fit. Each
    sentence keeps its terminating period. A single sentence that is longer
    than the limit is hard-split into slices of *max_chunk_length*
    characters so that no returned chunk ever exceeds the limit.

    Args:
        text: The string to split.
        max_chunk_length: Maximum number of characters per chunk; must be
            a positive integer.

    Returns:
        A list of non-empty strings, each at most *max_chunk_length*
        characters long. An empty or punctuation-only input yields ``[]``.

    Raises:
        ValueError: If *max_chunk_length* is smaller than 1.
    """
    if max_chunk_length < 1:
        raise ValueError("max_chunk_length must be a positive integer")

    pieces = text.split(".")
    last_index = len(pieces) - 1

    chunks = []
    current_chunk = ""
    for index, piece in enumerate(pieces):
        sentence = piece.strip()
        if not sentence:
            # Consecutive periods or trailing whitespace: nothing to keep.
            continue
        if index < last_index:
            # Every piece but the last was followed by a period in *text*;
            # put it back so chunks remain well-formed sentences.
            sentence += "."

        # A sentence that cannot fit in any chunk is sliced at the limit.
        while len(sentence) > max_chunk_length:
            if current_chunk:
                chunks.append(current_chunk)
                current_chunk = ""
            chunks.append(sentence[:max_chunk_length])
            sentence = sentence[max_chunk_length:]

        if not current_chunk:
            current_chunk = sentence
        elif len(current_chunk) + 1 + len(sentence) <= max_chunk_length:
            current_chunk += " " + sentence
        else:
            chunks.append(current_chunk)
            current_chunk = sentence

    if current_chunk:
        chunks.append(current_chunk)
    return chunks
|
|
|
|
|
def summarize_and_translate_chunk(chunk, lang):
    """Summarize one chunk of text and translate the summary.

    Args:
        chunk: Text to summarize.
        lang: Target language, forwarded unchanged to ``translate_summary``.

    Returns:
        The translated summary as a string, or ``""`` when *chunk* is empty
        or contains only whitespace.
    """
    # There is nothing to summarize in a blank chunk; skip the model call
    # instead of asking the summarizer to produce text from no input.
    if not chunk or not chunk.strip():
        return ""

    summary = summarizer(chunk, max_length=150, min_length=30, do_sample=False)
    summary_text = summary[0]["summary_text"]
    return translate_summary(summary_text, lang)
|
|
|
|
|
# Human-readable language names (as offered in the UI dropdown) mapped to
# the language codes used by the mBART-50 tokenizer.
_LANGUAGE_CODES = {
    "Arabic": "ar_AR",
    "Czech": "cs_CZ",
    "German": "de_DE",
    "English": "en_XX",
    "Spanish": "es_XX",
    "Estonian": "et_EE",
    "Finnish": "fi_FI",
    "French": "fr_XX",
    "Gujarati": "gu_IN",
    "Hindi": "hi_IN",
    "Italian": "it_IT",
    "Japanese": "ja_XX",
    "Kazakh": "kk_KZ",
    "Korean": "ko_KR",
    "Lithuanian": "lt_LT",
    "Latvian": "lv_LV",
    "Burmese": "my_MM",
    "Nepali": "ne_NP",
    "Dutch": "nl_XX",
    "Romanian": "ro_RO",
    "Russian": "ru_RU",
    "Sinhala": "si_LK",
    "Turkish": "tr_TR",
    "Vietnamese": "vi_VN",
    "Chinese": "zh_CN",
    "Afrikaans": "af_ZA",
    "Azerbaijani": "az_AZ",
    "Bengali": "bn_IN",
    "Persian": "fa_IR",
    "Hebrew": "he_IL",
    "Croatian": "hr_HR",
    "Indonesian": "id_ID",
    "Georgian": "ka_GE",
    "Khmer": "km_KH",
    "Macedonian": "mk_MK",
    "Malayalam": "ml_IN",
    "Mongolian": "mn_MN",
    "Marathi": "mr_IN",
    "Polish": "pl_PL",
    "Pashto": "ps_AF",
    "Portuguese": "pt_XX",
    "Swedish": "sv_SE",
    "Swahili": "sw_KE",
    "Tamil": "ta_IN",
    "Telugu": "te_IN",
    "Thai": "th_TH",
    "Tagalog": "tl_XX",
    "Ukrainian": "uk_UA",
    "Urdu": "ur_PK",
    "Xhosa": "xh_ZA",
    "Galician": "gl_ES",
    "Slovene": "sl_SI",
}


def translate_summary(summary, lang):
    """Translate *summary* into the language identified by *lang*.

    Summaries longer than ``max_chunk_length`` characters are split with
    ``chunk_text`` and translated piece by piece; the translated pieces are
    joined with single spaces.

    Args:
        summary: Text to translate.
        lang: Either a display name listed in ``_LANGUAGE_CODES``
            (e.g. ``"French"``) or a tokenizer language code
            (e.g. ``"fr_XX"``).

    Returns:
        The translated text, or ``""`` when *summary* is empty or blank.

    Raises:
        KeyError: If *lang* is neither a known display name nor a code
            present in ``tokenizer.lang_code_to_id``.
    """
    # The tokenizer is keyed by language code; accept display names as well
    # and pass anything else through unchanged so existing callers that
    # already supply a code keep working.
    lang_code = _LANGUAGE_CODES.get(lang, lang)

    if not summary or not summary.strip():
        return ""

    if len(summary) > max_chunk_length:
        chunks = chunk_text(summary, max_chunk_length)
    else:
        chunks = [summary]

    # Resolved once: the target language is the same for every chunk, and an
    # unknown code fails here before any generation work is done.
    forced_bos_token_id = tokenizer.lang_code_to_id[lang_code]

    translated_chunks = []
    for chunk in chunks:
        if not chunk.strip():
            # Never ask the model to translate an empty piece.
            continue
        inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True)
        generated_tokens = model.generate(
            **inputs,
            forced_bos_token_id=forced_bos_token_id,
            max_length=1024,
            num_beams=4,
            early_stopping=True,
            length_penalty=2.0,
        )
        translated_chunks.append(
            tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
        )

    return " ".join(translated_chunks)
|
|
|
|
|
def summarize_and_translate_pdf(pdf_content, lang):
    """Summarize and translate every page of a PDF.

    Each page's text is cut into slices of at most ``max_chunk_length``
    characters, and every non-blank slice is passed to
    ``summarize_and_translate_chunk``.

    Args:
        pdf_content: The PDF to process. Accepted forms are raw ``bytes`` /
            ``bytearray``, a filesystem path (``str`` or ``os.PathLike``),
            or an object exposing the file path as ``.name`` (such as an
            uploaded temporary file object).
        lang: Target language, forwarded to ``summarize_and_translate_chunk``.

    Returns:
        A list of translated summaries, one per text slice, in document
        order. If the file cannot be found, a one-element list containing
        an error message is returned so callers can treat both outcomes
        uniformly.
    """
    try:
        if isinstance(pdf_content, (bytes, bytearray)):
            # Open straight from memory: no temporary file to create or
            # to forget to delete afterwards.
            doc = fitz.open(stream=bytes(pdf_content), filetype="pdf")
        elif isinstance(pdf_content, (str, os.PathLike)):
            doc = fitz.open(os.fspath(pdf_content))
        else:
            # File-like upload object: its on-disk location is in ``.name``.
            doc = fitz.open(pdf_content.name)
    except FileNotFoundError:
        return ["File not found. Please make sure the file path is correct."]

    chunks = []
    try:
        for page in doc:
            text = page.get_text()
            for start in range(0, len(text), max_chunk_length):
                piece = text[start:start + max_chunk_length]
                # Blank slices carry nothing to summarize.
                if piece.strip():
                    chunks.append(piece)
    finally:
        # Release the document even if text extraction fails.
        doc.close()

    # Chunks are processed in-process and in order. A multiprocessing pool
    # would make every worker re-import (or fork) a module that loads large
    # models and launches the UI at import time.
    return [summarize_and_translate_chunk(chunk, lang) for chunk in chunks]
|
|
|
|
|
def summarize_and_translate_interface(pdf_content, lang):
    """Gradio callback: return the translated summary of a PDF as text.

    Args:
        pdf_content: The uploaded PDF as delivered by the file input, or
            ``None`` when nothing was uploaded.
        lang: Target language selected in the UI.

    Returns:
        The translated summaries joined by newlines, or a plain message
        when no file was supplied or the PDF step reported an error.
    """
    if pdf_content is None:
        return "Please upload a PDF file."

    result = summarize_and_translate_pdf(pdf_content, lang)

    # An error may come back as a bare message; joining a string would
    # interleave newlines between its characters, so return it as-is.
    if isinstance(result, str):
        return result
    return "\n".join(result)
|
|
|
|
|
# --- Gradio user interface -------------------------------------------------

# PDF upload widget. NOTE(review): with type="file" the callback presumably
# receives a file object rather than raw bytes -- confirm against the Gradio
# version in use.
input_pdf = gr.inputs.File(type="file", label="Upload a PDF file")

# Target-language picker. NOTE(review): the choices are display names; the
# selected value is passed as `lang` to the callback -- verify it is mapped to
# a tokenizer language code before it is used for translation.
language = gr.inputs.Dropdown(
    label="Select language for translation",
    choices=[
        "Arabic", "Czech", "German", "English", "Spanish", "Estonian",
        "Finnish", "French", "Gujarati", "Hindi", "Italian", "Japanese",
        "Kazakh", "Korean", "Lithuanian", "Latvian", "Burmese", "Nepali",
        "Dutch", "Romanian", "Russian", "Sinhala", "Turkish", "Vietnamese",
        "Chinese", "Afrikaans", "Azerbaijani", "Bengali", "Persian",
        "Hebrew", "Croatian", "Indonesian", "Georgian", "Khmer",
        "Macedonian", "Malayalam", "Mongolian", "Marathi", "Polish",
        "Pashto", "Portuguese", "Swedish", "Swahili", "Tamil", "Telugu",
        "Thai", "Tagalog", "Ukrainian", "Urdu", "Xhosa", "Galician",
        "Slovene",
    ],
)

# Text box that displays the translated summary returned by the callback.
output_text = gr.outputs.Textbox(label="Translated Summary")

# Build the interface and start serving; this runs whenever the module is
# executed or imported.
gr.Interface(
    fn=summarize_and_translate_interface,
    inputs=[input_pdf, language],
    outputs=output_text,
).launch()
|
|