"""Gradio app that extracts text from an uploaded PDF, translates it, and returns a DOCX."""

import io
import tempfile
from typing import List

import gradio as gr
import torch
from docx import Document
from PyPDF2 import PdfReader
from transformers import AutoTokenizer, MarianMTModel, MarianTokenizer, TFMarianMTModel

# NOTE(review): the checkpoint name says English -> Hindi, but the UI text below
# promises Hindi -> English. One of the two is wrong; confirm the intended
# direction and swap either the checkpoint or the labels accordingly.
model_name = "shirsh10mall/Helsinki-shirsh-finetuned-translation-english-to-hindi"
model = TFMarianMTModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Upper bound on characters per translation chunk. Keeps each chunk comfortably
# inside the model's input window; anything still too long is truncated by the
# tokenizer rather than crashing generation.
_MAX_CHUNK_CHARS = 400
# Number of chunks sent through the model per generate() call.
_BATCH_SIZE = 8


def _chunk_lines(text: str, max_chars: int = _MAX_CHUNK_CHARS) -> List[str]:
    """Group consecutive non-empty lines of *text* into chunks of about *max_chars*.

    Blank lines are dropped. A single line longer than *max_chars* becomes its
    own (oversized) chunk.
    """
    chunks: List[str] = []
    current: List[str] = []
    current_len = 0
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        if current and current_len + len(line) + 1 > max_chars:
            chunks.append(" ".join(current))
            current, current_len = [], 0
        current.append(line)
        current_len += len(line) + 1
    if current:
        chunks.append(" ".join(current))
    return chunks


def _translate(chunks: List[str], batch_size: int = _BATCH_SIZE) -> List[str]:
    """Translate each chunk with the module-level model; returns one string per chunk."""
    translated: List[str] = []
    for start in range(0, len(chunks), batch_size):
        batch = tokenizer(
            chunks[start:start + batch_size],
            return_tensors="tf",
            padding=True,  # required to batch sequences of different lengths
            truncation=True,  # never exceed the model's maximum input length
        )
        generated = model.generate(**batch)
        # batch_decode returns a list of strings, one per input sequence.
        translated.extend(tokenizer.batch_decode(generated, skip_special_tokens=True))
    return translated


def convert_pdf_to_docx(file):
    """Translate the text of an uploaded PDF and return the path of a DOCX file.

    Args:
        file: The value delivered by the ``gr.File`` input: either an object
            exposing the path as ``.name`` or a plain path string.

    Returns:
        Filesystem path of a temporary ``.docx`` file containing one paragraph
        per translated chunk.

    Raises:
        gr.Error: If no file was uploaded, no text could be extracted, or any
            step of the conversion fails.
    """
    if file is None:
        raise gr.Error("Please upload a PDF file.", duration=3)

    # Accept both file-like objects with a ``.name`` and bare path strings.
    pdf_path = getattr(file, "name", file)

    try:
        # Read the PDF file
        pdf_reader = PdfReader(pdf_path)
        page_texts = []
        for page_num, page in enumerate(pdf_reader.pages, start=1):
            page_text = page.extract_text()
            if page_text:
                page_texts.append(page_text)
            else:
                print(f"Warning: No text found on page {page_num}.")

        full_text = "\n".join(page_texts)
        if not full_text.strip():
            raise gr.Error("No text found in the PDF.", duration=3)

        translated_chunks = _translate(_chunk_lines(full_text))

        # Create a Word document with the translated text
        doc = Document()
        for paragraph in translated_chunks:
            doc.add_paragraph(paragraph)

        # gr.File outputs expect a path on disk, so persist the document to a
        # temporary file that outlives this call instead of an in-memory buffer.
        with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as tmp:
            doc.save(tmp)
            return tmp.name
    except gr.Error:
        # Already a user-facing error; do not re-wrap it.
        raise
    except Exception as e:
        raise gr.Error(f"An error occurred: {str(e)}", duration=5) from e


# Define the Gradio interface
iface = gr.Interface(
    fn=convert_pdf_to_docx,
    inputs=gr.File(label="Upload Hindi PDF"),
    outputs=gr.File(label="Download Translated English DOCX"),
    title="Hindi PDF to English DOCX Translator",
    description="""
    Upload a PDF file containing Hindi text. This application will extract the text, translate it to English, and provide a downloadable DOCX file with the translated content.
    """,
    examples=[
        ["example_hindi.pdf"]
    ]
)

# Launch the interface
iface.launch()