import io
import tempfile
from typing import List

import gradio as gr
import torch
from docx import Document
from PyPDF2 import PdfReader
from transformers import AutoTokenizer, TFMarianMTModel
from transformers import MarianMTModel, MarianTokenizer

model_name = "shirsh10mall/Helsinki-shirsh-finetuned-translation-english-to-hindi"
model = TFMarianMTModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

def convert_pdf_to_docx(file):
    try:
        # Read the PDF file
        pdf_reader = PdfReader(file.name)
        full_text = ""
        for page_num, page in enumerate(pdf_reader.pages, start=1):
            page_text = page.extract_text()
            if page_text:                
                full_text += page_text + "\n"
            else:
                print(f"Warning: No text found on page {page_num}.")
        
        if not full_text.strip():
            return gr.Error("No text found in the PDF.", duration=3)

        batch = tokenizer([full_text], return_tensors="tf")
        gen = model.generate(**batch)
        translated_text = tokenizer.batch_decode(gen, skip_special_tokens=True)

        # Create a Word document with the translated text
        doc = Document()
        for line in translated_text.split('\n'):
            doc.add_paragraph(line)
        
        # Save the document to a bytes buffer
        doc_io = io.BytesIO()
        doc.save(doc_io)
        doc_io.seek(0)
        
        return doc_io
    except Exception as e:
        return gr.Error(f"An error occurred: {str(e)}", duration=5)

# Define the Gradio interface
iface = gr.Interface(
    fn=convert_pdf_to_docx,
    inputs=gr.File(label="Upload Hindi PDF"),
    outputs=gr.File(label="Download Translated English DOCX"),
    title="Hindi PDF to English DOCX Translator",
    description="""
    Upload a PDF file containing Hindi text. This application will extract the text, translate it to English, 
    and provide a downloadable DOCX file with the translated content.
    """,
    examples=[
        ["example_hindi.pdf"]
    ]
)

# Launch the interface
iface.launch()