Spaces:

albeprime
/

abstract_to_audio

Sleeping

abstract_to_audio / pdf_processor.py

Alberto Primerano

Final Version

0e6d852 over 1 year ago

1.13 kB

	import gradio as gr
	from pdf_exctraction import read_pdf
	from transformers import pipeline

	# Extract the Abstract from the content of the document
	def extract_abstract(pdf_path):
	    """Return the text that follows the "Abstract" heading in a PDF.

	    The pages returned by ``read_pdf`` are scanned in iteration order and
	    the first non-empty entry found right after an ``"Abstract\\n"`` entry
	    is returned.

	    NOTE(review): assumes ``read_pdf`` returns a mapping whose values hold,
	    at index 0, a list of text blocks (strings) -- confirm against
	    ``pdf_exctraction.read_pdf``.

	    Args:
	        pdf_path: Path of the PDF, passed unchanged to ``read_pdf``.

	    Returns:
	        The entry that immediately follows the ``"Abstract\\n"`` entry.

	    Raises:
	        gr.Error: If no page has an ``"Abstract\\n"`` entry followed by
	            non-empty text.
	    """
	    heading = "Abstract\n"
	    for page_content in read_pdf(pdf_path).values():
	        blocks = page_content[0]
	        try:
	            # Single lookup instead of an ``in`` test followed by ``index``.
	            heading_pos = blocks.index(heading)
	        except ValueError:
	            continue  # No abstract heading on this page.
	        # The heading may be the last entry, in which case nothing follows.
	        if heading_pos < len(blocks) - 1:
	            candidate = blocks[heading_pos + 1]
	            if candidate != "":
	                # Stop at the first hit so that a later page containing the
	                # same heading cannot overwrite (or blank out) the result.
	                return candidate
	    raise gr.Error("The article does not contains an Abstract or it is not in the expected format")

	# Summarize the abstract extracted from the PDF
	def summarize_abstract(pdf_path):
	    """Summarize the abstract of a PDF with ``facebook/bart-large-cnn``.

	    The abstract is obtained from ``extract_abstract``, flattened to a single
	    line, and passed to a summarization pipeline that is created once and
	    reused by later calls.

	    Args:
	        pdf_path: Path of the PDF, passed unchanged to ``extract_abstract``.

	    Returns:
	        The ``summary_text`` field of the first pipeline result.

	    Raises:
	        Whatever ``extract_abstract`` raises when no abstract is found.
	    """
	    abstract = extract_abstract(pdf_path)
	    # Join the lines, then drop the "- " left behind by words that were
	    # hyphenated across a line break.
	    flattened = abstract.replace("\n", " ").replace("- ", "")
	    # Collapse whitespace runs left by the replacements above; the previous
	    # single-space-for-single-space replace never changed the text.
	    cleaned = " ".join(flattened.split())
	    result = _get_summarizer()(
	        cleaned,
	        max_length=50,
	        min_length=10,
	        length_penalty=2.0,
	        num_beams=4,
	        early_stopping=True,
	    )
	    return result[0]['summary_text']


	# Summarization pipeline shared by all calls; built on first use because
	# constructing it loads the model.
	_SUMMARIZER = None


	def _get_summarizer():
	    """Return the shared summarization pipeline, creating it on first use."""
	    global _SUMMARIZER
	    if _SUMMARIZER is None:
	        _SUMMARIZER = pipeline("summarization", model="facebook/bart-large-cnn")
	    return _SUMMARIZER