# Extraction header (not part of the program): file size 4,614 bytes, 113 source lines.
# Revision ids shown in the listing gutter: 14e2683, bb6ed43, f7050ab.
import gradio as gr
import fitz
from transformers import pipeline, MBart50TokenizerFast, MBartForConditionalGeneration
from multiprocessing import Pool, cpu_count
import tempfile
# Summarization: a ready-made pipeline built on the Falconsai checkpoint.
summarizer = pipeline(task="summarization", model="Falconsai/text_summarization")

# Translation: the model and its tokenizer are loaded from the same mBART-50
# checkpoint; the tokenizer is configured with English ("en_XX") as the source.
_MBART_CHECKPOINT = "facebook/mbart-large-50-one-to-many-mmt"
model = MBartForConditionalGeneration.from_pretrained(_MBART_CHECKPOINT)
tokenizer = MBart50TokenizerFast.from_pretrained(_MBART_CHECKPOINT, src_lang="en_XX")

# Upper bound, in characters, for a piece of text handled in one go.
max_chunk_length = 1024
# Function to chunk text
def chunk_text(text, max_chunk_length):
    """Split *text* into chunks of at most *max_chunk_length* characters.

    The text is cut at full stops and whole sentences are packed greedily
    into each chunk, separated by a single space. Sentence terminators are
    kept, empty sentences are ignored, and no empty chunk is ever returned.
    A single sentence longer than the limit is hard-split into slices so the
    size bound always holds.

    Args:
        text: The text to split.
        max_chunk_length: Maximum number of characters per chunk (>= 1).

    Returns:
        A list of non-empty strings, each no longer than *max_chunk_length*.

    Raises:
        ValueError: If *max_chunk_length* is smaller than 1.
    """
    if max_chunk_length < 1:
        raise ValueError("max_chunk_length must be a positive integer")

    pieces = text.split(".")
    last_index = len(pieces) - 1
    chunks = []
    current_chunk = ""

    for index, piece in enumerate(pieces):
        sentence = piece.strip()
        if not sentence:
            # Produced by consecutive or trailing full stops; nothing to add.
            continue
        if index < last_index:
            # Every piece except the last one was followed by a "." in the
            # input; put it back so the chunk still reads as sentences.
            sentence += "."

        if len(sentence) > max_chunk_length:
            # The sentence cannot fit in any chunk: flush what we have and
            # cut the sentence into fixed-size slices.
            if current_chunk:
                chunks.append(current_chunk)
            slices = [
                sentence[start:start + max_chunk_length]
                for start in range(0, len(sentence), max_chunk_length)
            ]
            chunks.extend(slices[:-1])
            # Keep the tail open so following sentences can share its chunk.
            current_chunk = slices[-1]
            continue

        if not current_chunk:
            current_chunk = sentence
        elif len(current_chunk) + 1 + len(sentence) <= max_chunk_length:
            current_chunk = current_chunk + " " + sentence
        else:
            chunks.append(current_chunk)
            current_chunk = sentence

    if current_chunk:
        chunks.append(current_chunk)
    return chunks
# Function to summarize and translate a chunk
def summarize_and_translate_chunk(chunk, lang):
    """Summarize one piece of text and return the summary translated to *lang*.

    The summarizer is run deterministically (no sampling) with a summary
    length between 30 and 150; the text of its first result is then passed
    to ``translate_summary``.
    """
    summarizer_output = summarizer(
        chunk,
        max_length=150,
        min_length=30,
        do_sample=False,
    )
    first_result = summarizer_output[0]
    return translate_summary(first_result['summary_text'], lang)
# Function to translate the summary
# Display name (as offered in the UI dropdown) -> mBART-50 language code.
# The tokenizer's ``lang_code_to_id`` table is keyed by the codes, not by the
# display names, so names have to be resolved before the lookup.
_LANGUAGE_CODES = {
    "Arabic": "ar_AR", "Czech": "cs_CZ", "German": "de_DE", "English": "en_XX",
    "Spanish": "es_XX", "Estonian": "et_EE", "Finnish": "fi_FI", "French": "fr_XX",
    "Gujarati": "gu_IN", "Hindi": "hi_IN", "Italian": "it_IT", "Japanese": "ja_XX",
    "Kazakh": "kk_KZ", "Korean": "ko_KR", "Lithuanian": "lt_LT", "Latvian": "lv_LV",
    "Burmese": "my_MM", "Nepali": "ne_NP", "Dutch": "nl_XX", "Romanian": "ro_RO",
    "Russian": "ru_RU", "Sinhala": "si_LK", "Turkish": "tr_TR", "Vietnamese": "vi_VN",
    "Chinese": "zh_CN", "Afrikaans": "af_ZA", "Azerbaijani": "az_AJ", "Bengali": "bn_IN",
    "Persian": "fa_IR", "Hebrew": "he_IL", "Croatian": "hr_HR", "Indonesian": "id_ID",
    "Georgian": "ka_GE", "Khmer": "km_KH", "Macedonian": "mk_MK", "Malayalam": "ml_IN",
    "Mongolian": "mn_MN", "Marathi": "mr_IN", "Polish": "pl_PL", "Pashto": "ps_AF",
    "Portuguese": "pt_XX", "Swedish": "sv_SE", "Swahili": "sw_KE", "Tamil": "ta_IN",
    "Telugu": "te_IN", "Thai": "th_TH", "Tagalog": "tl_XX", "Ukrainian": "uk_UA",
    "Urdu": "ur_PK", "Xhosa": "xh_ZA", "Galician": "gl_ES", "Slovene": "sl_SI",
}


def translate_summary(summary, lang):
    """Translate an English summary into the requested language.

    Args:
        summary: The English text to translate.
        lang: Target language, either a display name from the UI dropdown
            (e.g. ``"French"``) or an mBART-50 language code
            (e.g. ``"fr_XX"``).

    Returns:
        The translated text. When the summary is longer than
        ``max_chunk_length`` characters it is translated piece by piece and
        the pieces are joined with single spaces.

    Raises:
        KeyError: If *lang* is neither a known display name nor a language
            code known to the tokenizer.
    """
    # Unknown names fall through unchanged so raw codes keep working.
    lang_code = _LANGUAGE_CODES.get(lang, lang)
    # Loop-invariant: resolve the target-language token once, up front, so an
    # unsupported language fails before any generation work is done.
    target_token_id = tokenizer.lang_code_to_id[lang_code]

    # Chunk text if it exceeds maximum length
    if len(summary) > max_chunk_length:
        # Drop empty pieces so the model is never asked to translate "".
        pieces = [piece for piece in chunk_text(summary, max_chunk_length) if piece]
    else:
        pieces = [summary]

    # Translate each chunk
    translated_pieces = []
    for piece in pieces:
        inputs = tokenizer(piece, return_tensors="pt", padding=True, truncation=True)
        generated_tokens = model.generate(
            **inputs,
            forced_bos_token_id=target_token_id,
            max_length=1024,
            num_beams=4,
            early_stopping=True,
            length_penalty=2.0,
        )
        decoded = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
        translated_pieces.append(decoded[0])
    return " ".join(translated_pieces)
# Function to read PDF and summarize and translate chunk by chunk
def _open_pdf(pdf_content):
    """Open an uploaded PDF as a PyMuPDF document without a temporary copy.

    Accepts raw bytes, a filesystem path, an object exposing the path as
    ``.name`` (such as an uploaded temp-file wrapper), or any other object
    with a ``read()`` method returning the PDF bytes.
    """
    if isinstance(pdf_content, (bytes, bytearray)):
        return fitz.open(stream=bytes(pdf_content), filetype="pdf")
    if isinstance(pdf_content, str):
        return fitz.open(pdf_content)
    path = getattr(pdf_content, "name", None)
    if isinstance(path, str):
        return fitz.open(path)
    return fitz.open(stream=pdf_content.read(), filetype="pdf")


def summarize_and_translate_pdf(pdf_content, lang):
    """Summarize and translate a PDF, one fixed-size text chunk at a time.

    The text of every page is cut into slices of at most
    ``max_chunk_length`` characters; each slice is summarized and translated
    by ``summarize_and_translate_chunk`` in a pool of worker processes.

    Args:
        pdf_content: The PDF as bytes, a path, or a file-like object.
        lang: Target language, forwarded unchanged to the chunk worker.

    Returns:
        A list with one translated summary per chunk, in document order
        (empty when the PDF contains no text). If the file cannot be found,
        an error message string is returned instead; this is kept for
        compatibility with existing callers.
    """
    try:
        doc = _open_pdf(pdf_content)
    except FileNotFoundError:
        return "File not found. Please make sure the file path is correct."

    chunks = []
    try:
        for page_number in range(len(doc)):
            text = doc.load_page(page_number).get_text()
            for start in range(0, len(text), max_chunk_length):
                piece = text[start:start + max_chunk_length]
                # Whitespace-only slices (blank pages) carry nothing to
                # summarize, so they are not sent to the models.
                if piece.strip():
                    chunks.append(piece)
    finally:
        # Release the document as soon as the text has been extracted.
        doc.close()

    if not chunks:
        # Nothing to process: avoid starting worker processes for no work.
        return []

    # Use multiprocessing to parallelize the process
    # NOTE(review): the workers rely on the module-level models being
    # available in each child process; confirm this holds for the start
    # method used on the deployment platform.
    with Pool(cpu_count()) as pool:
        return pool.starmap(
            summarize_and_translate_chunk,
            [(chunk, lang) for chunk in chunks],
        )
# Gradio Interface
def summarize_and_translate_interface(pdf_content, lang):
    """Gradio callback: return the translated summary of an uploaded PDF.

    Args:
        pdf_content: The uploaded PDF, or ``None`` when nothing was uploaded.
        lang: The target language selected in the UI.

    Returns:
        The translated chunk summaries joined with newlines, or a plain
        message when there is no upload or the PDF step reports an error.
    """
    if pdf_content is None:
        # Submitting the form without a file must not crash the callback.
        return "Please upload a PDF file."

    result = summarize_and_translate_pdf(pdf_content, lang)
    if isinstance(result, str):
        # The PDF step reports failures as a single message string; joining
        # it would put every character on its own line.
        return result
    return "\n".join(result)
# Gradio UI
# Input widgets: the PDF upload and the target-language selector.
input_pdf = gr.inputs.File(type="file", label="Upload a PDF file")
language = gr.inputs.Dropdown(
    label="Select language for translation",
    choices=[
        "Arabic", "Czech", "German", "English",
        "Spanish", "Estonian", "Finnish", "French",
        "Gujarati", "Hindi", "Italian", "Japanese",
        "Kazakh", "Korean", "Lithuanian", "Latvian",
        "Burmese", "Nepali", "Dutch", "Romanian",
        "Russian", "Sinhala", "Turkish", "Vietnamese",
        "Chinese", "Afrikaans", "Azerbaijani", "Bengali",
        "Persian", "Hebrew", "Croatian", "Indonesian",
        "Georgian", "Khmer", "Macedonian", "Malayalam",
        "Mongolian", "Marathi", "Polish", "Pashto",
        "Portuguese", "Swedish", "Swahili", "Tamil",
        "Telugu", "Thai", "Tagalog", "Ukrainian",
        "Urdu", "Xhosa", "Galician", "Slovene",
    ],
)

# Output widget: a single text box holding the translated summary.
output_text = gr.outputs.Textbox(label="Translated Summary")

# Wire the callback to the widgets and start serving immediately.
gr.Interface(
    fn=summarize_and_translate_interface,
    inputs=[input_pdf, language],
    outputs=output_text,
).launch()
# End of extracted listing.