# H2E / app.py
# bot-wot's picture
# Update app.py
# 39ceea4 verified
import gradio as gr
from PyPDF2 import PdfReader
from transformers import MarianMTModel, MarianTokenizer
from docx import Document
import io
import torch
from transformers import AutoTokenizer, TFMarianMTModel
from typing import List
# Translation checkpoint shared by the whole app; both objects below are created
# once, when this module is first executed, and reused for every request.
# NOTE(review): the checkpoint name says english-to-hindi, while the interface
# text in this file describes Hindi -> English; confirm the intended direction.
model_name = "shirsh10mall/Helsinki-shirsh-finetuned-translation-english-to-hindi"

# Tokenizer and TensorFlow Marian model are both resolved from the same identifier.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFMarianMTModel.from_pretrained(model_name)
def _translate_lines(lines, batch_size=8):
    """Translate every non-blank entry of *lines* with the module-level model.

    Blank entries are passed through untouched so the caller keeps the
    original line layout. Lines are sent to the model in small batches so a
    long document never becomes a single over-long input sequence.
    """
    translated = list(lines)
    pending = [(idx, text) for idx, text in enumerate(lines) if text.strip()]
    for start in range(0, len(pending), batch_size):
        chunk = pending[start:start + batch_size]
        batch = tokenizer(
            [text for _, text in chunk],
            return_tensors="tf",
            padding=True,      # lines in one batch have different lengths
            truncation=True,   # clip any single line that exceeds the model's input limit
        )
        generated = model.generate(**batch)
        # batch_decode returns a list with one string per input line.
        decoded = tokenizer.batch_decode(generated, skip_special_tokens=True)
        for (idx, _), text in zip(chunk, decoded):
            translated[idx] = text
    return translated


def convert_pdf_to_docx(file):
    """Extract the text of an uploaded PDF, translate it, and save it as DOCX.

    Args:
        file: The upload handed over by the ``gr.File`` input: either a path
            string or an object whose ``name`` attribute holds the path.

    Returns:
        Path of a temporary ``.docx`` file containing one paragraph per
        translated line, suitable for a ``gr.File`` output.

    Raises:
        gr.Error: If nothing was uploaded, if no text could be extracted from
            the PDF, or if any processing step fails.
    """
    import tempfile  # local import so this block does not rely on a file-level change

    if file is None:
        raise gr.Error("No file uploaded.", duration=3)

    try:
        # Accept both a plain path and a file-like wrapper exposing ``.name``.
        pdf_path = getattr(file, "name", file)
        pdf_reader = PdfReader(pdf_path)

        page_texts = []
        for page_num, page in enumerate(pdf_reader.pages, start=1):
            page_text = page.extract_text()
            if page_text:
                page_texts.append(page_text)
            else:
                print(f"Warning: No text found on page {page_num}.")

        full_text = "\n".join(page_texts)
        if not full_text.strip():
            # Errors must be raised (not returned) for Gradio to display them.
            raise gr.Error("No text found in the PDF.", duration=3)

        translated_lines = _translate_lines(full_text.split("\n"))

        # Create a Word document with the translated text
        doc = Document()
        for line in translated_lines:
            doc.add_paragraph(line)

        # The File output needs a path on disk, so persist the document to a
        # temporary file that outlives this call (delete=False).
        with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as tmp:
            doc.save(tmp)
            output_path = tmp.name
        return output_path
    except gr.Error:
        # Already a user-facing error; do not wrap it a second time.
        raise
    except Exception as e:
        raise gr.Error(f"An error occurred: {str(e)}", duration=5) from e
# Build the Gradio UI: a single file upload in, a single downloadable file out.
_pdf_input = gr.File(label="Upload Hindi PDF")
_docx_output = gr.File(label="Download Translated English DOCX")

# Text shown under the title on the page.
_app_description = """
Upload a PDF file containing Hindi text. This application will extract the text, translate it to English,
and provide a downloadable DOCX file with the translated content.
"""

# One example row, one value per input component.
_example_rows = [["example_hindi.pdf"]]

iface = gr.Interface(
    fn=convert_pdf_to_docx,
    inputs=_pdf_input,
    outputs=_docx_output,
    title="Hindi PDF to English DOCX Translator",
    description=_app_description,
    examples=_example_rows,
)

# Start serving the interface as soon as this module is executed.
iface.launch()