abstract_to_audio / pdf_processor.py
Alberto Primerano
Final Version
0e6d852
raw
history blame
1.13 kB
import gradio as gr
from pdf_exctraction import read_pdf
from transformers import pipeline
# Extract the Abstract from the content of the document
def extract_abstract(pdf_path):
    """Return the text entry that follows the "Abstract" heading of a PDF.

    The document is read with ``read_pdf`` and its pages are scanned in
    iteration order. The first page on which the heading entry (the string
    ``"Abstract"`` followed by a newline) is directly followed by a non-empty
    entry supplies the result; scanning stops there.

    Args:
        pdf_path: Location of the PDF, passed through unchanged to
            ``read_pdf``.

    Returns:
        The entry stored immediately after the heading entry.

    Raises:
        gr.Error: If no page provides a non-empty entry after the heading.
    """
    # NOTE(review): assumes read_pdf returns a mapping whose values hold, at
    # index 0, the list of text entries of one page -- confirm against
    # pdf_exctraction.read_pdf.
    for page_content in read_pdf(pdf_path).values():
        text_entries = page_content[0]
        if "Abstract\n" not in text_entries:
            continue
        following = text_entries.index("Abstract\n") + 1
        # The heading can be the last entry of a page (nothing follows it) or
        # be followed by an empty entry; in both cases keep looking further.
        if following < len(text_entries) and text_entries[following] != "":
            # Return on the first hit so that a later "Abstract" heading
            # elsewhere in the document cannot overwrite the real abstract.
            return text_entries[following]
    raise gr.Error("The article does not contains an Abstract or it is not in the expected format")
# Summarize the abstract extracted from the PDF
def summarize_abstract(pdf_path):
    """Summarize the abstract of the PDF at *pdf_path*.

    The abstract is obtained from ``extract_abstract``, flattened to a single
    line of text, and passed to a ``facebook/bart-large-cnn`` summarization
    pipeline.

    Args:
        pdf_path: Location of the PDF, passed through unchanged to
            ``extract_abstract``.

    Returns:
        The ``summary_text`` of the first result produced by the pipeline.

    Raises:
        gr.Error: Propagated from ``extract_abstract`` when no abstract is
            found.
    """
    abstract = extract_abstract(pdf_path)
    # Join lines, then drop the "- " left behind by words hyphenated across a
    # line break ("exam-\nple" -> "exam- ple" -> "example").
    abstract = abstract.replace("\n", " ").replace("- ", "")
    # Collapse every run of whitespace (double spaces, tabs, non-breaking
    # spaces) into one plain space; the previous replace(" ", " ") did nothing.
    abstract = " ".join(abstract.split())
    summarizer = _get_summarizer()
    result = summarizer(abstract, max_length=50, min_length=10, length_penalty=2.0, num_beams=4, early_stopping=True)
    return result[0]['summary_text']


# Summarization pipeline shared by all calls; built lazily on first use.
_SUMMARIZER = None


def _get_summarizer():
    """Return the shared summarization pipeline, creating it on first use."""
    global _SUMMARIZER
    if _SUMMARIZER is None:
        # Building the pipeline loads the model, so do it once instead of on
        # every summarization request.
        _SUMMARIZER = pipeline("summarization", model="facebook/bart-large-cnn")
    return _SUMMARIZER