|
import io
import tempfile
from typing import List

import gradio as gr
import torch
from docx import Document
from PyPDF2 import PdfReader
from transformers import MarianMTModel, MarianTokenizer
from transformers import AutoTokenizer, TFMarianMTModel
|
|
|
# Hugging Face Hub checkpoint used for translation.
# NOTE(review): the checkpoint id reads as English -> Hindi, while the UI text
# in this file describes Hindi -> English — confirm the intended direction.
model_name = "shirsh10mall/Helsinki-shirsh-finetuned-translation-english-to-hindi"

# Loaded once at import time; convert_pdf_to_docx reads these module-level
# objects on every call instead of reloading the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFMarianMTModel.from_pretrained(model_name)
|
|
|
def _translate_lines(lines, batch_size=8):
    """Translate each entry of *lines* with the module-level model, keeping order.

    Blank entries are passed through as empty strings so the paragraph
    layout of the source text is preserved in the output.

    Args:
        lines: Sequence of text lines to translate.
        batch_size: Number of non-blank lines sent to the model per call.

    Returns:
        A list of translated strings, the same length as *lines*.
    """
    translated = [""] * len(lines)
    pending = [(index, text) for index, text in enumerate(lines) if text.strip()]

    for start in range(0, len(pending), batch_size):
        chunk = pending[start:start + batch_size]
        # padding lets lines of different length share one batch; truncation
        # caps any single over-long line at the tokenizer's maximum length
        # instead of feeding the model a sequence it cannot handle.
        batch = tokenizer(
            [text for _, text in chunk],
            return_tensors="tf",
            padding=True,
            truncation=True,
        )
        generated = model.generate(**batch)
        # batch_decode returns one string per input sequence (a list).
        decoded = tokenizer.batch_decode(generated, skip_special_tokens=True)
        for (index, _), text in zip(chunk, decoded):
            translated[index] = text

    return translated


def convert_pdf_to_docx(file):
    """Extract the text of an uploaded PDF, translate it, and write a DOCX.

    Args:
        file: The value Gradio passes for the upload: either a filepath
            string or an object exposing the path as ``.name``.

    Returns:
        Filesystem path (``str``) of the generated ``.docx`` file, suitable
        as the value of a ``gr.File`` output component.

    Raises:
        gr.Error: If no file was uploaded, the PDF contains no extractable
            text, or any step of extraction/translation/writing fails.
    """
    if file is None:
        raise gr.Error("Please upload a PDF file.", duration=3)

    # Accept both a plain path string and a file-like wrapper with .name.
    pdf_path = getattr(file, "name", file)

    try:
        pdf_reader = PdfReader(pdf_path)

        page_texts = []
        for page_num, page in enumerate(pdf_reader.pages, start=1):
            page_text = page.extract_text()
            if page_text:
                page_texts.append(page_text)
            else:
                print(f"Warning: No text found on page {page_num}.")

        full_text = "\n".join(page_texts)
        if not full_text.strip():
            raise gr.Error("No text found in the PDF.", duration=3)

        # Translate line by line: the whole document as a single sequence
        # would exceed the model's input limit for anything but tiny PDFs.
        doc = Document()
        for line in _translate_lines(full_text.splitlines()):
            doc.add_paragraph(line)

        # gr.File outputs are served from a path on disk, so persist the
        # document to a temp file. The handle is closed before saving so the
        # path can be reopened on every platform; delete=False keeps the file
        # around for Gradio to serve.
        with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as tmp:
            output_path = tmp.name
        doc.save(output_path)

        return output_path
    except gr.Error:
        # Already a user-facing error; do not wrap it a second time.
        raise
    except Exception as e:
        raise gr.Error(f"An error occurred: {str(e)}", duration=5) from e
|
|
|
|
|
# Upload and download widgets for the single-function UI.
# NOTE(review): the labels and description below describe Hindi -> English,
# while the checkpoint id loaded in this file reads as English -> Hindi —
# confirm which direction is intended before changing either side.
pdf_input = gr.File(label="Upload Hindi PDF")
docx_output = gr.File(label="Download Translated English DOCX")

# Wire convert_pdf_to_docx to the widgets above.
iface = gr.Interface(
    fn=convert_pdf_to_docx,
    inputs=pdf_input,
    outputs=docx_output,
    title="Hindi PDF to English DOCX Translator",
    description="""
Upload a PDF file containing Hindi text. This application will extract the text, translate it to English,
and provide a downloadable DOCX file with the translated content.
""",
    # Sample input offered in the UI; the path is relative to the working directory.
    examples=[["example_hindi.pdf"]],
)

# Start the web server for the interface.
iface.launch()
|
|