Spaces:

eHemink
/

assessment3_part2

Runtime error

App Files Files Community

assessment3_part2 / app.py

eHemink

Update app.py

f289eee almost 2 years ago

raw

history blame

3.1 kB

	# -*- coding: utf-8 -*-
	"""app.py.ipynb

	Automatically generated by Colaboratory.

	Original file is located at
	https://colab.research.google.com/drive/1zk7xuWSf7ii7zowOqNVLy0FwXYVHYE2V
	"""

	# https://huggingface.co/spaces/eHemink/assessment3_part2

	# Here are the imports
	import functools
	import os
	import re

	import gradio as gr
	import PyPDF2
	import scipy
	import transformers
	from bark import SAMPLE_RATE, generate_audio, preload_models
	from transformers import pipeline

	# Here is the code
	@functools.lru_cache(maxsize=1)
	def _get_summarizer():
	    """Build the summarization pipeline once and reuse it on later calls.

	    Constructing the pipeline loads the model weights, so doing it per
	    request (as the code previously did) repeats that work on every upload.
	    """
	    return pipeline(task="summarization", model="lidiya/bart-large-xsum-samsum")


	def _extract_abstract(pdf_path):
	    """Return the abstract text of the PDF at *pdf_path*, or '' if none is found.

	    Pages are scanned in order. On the first page containing the word
	    "Abstract" (case-insensitive), the text after that word up to the next
	    "Introduction" (case-sensitive) is returned; if "Introduction" does not
	    appear on that page, the rest of the page is returned. Surrounding
	    whitespace is stripped.
	    """
	    with open(pdf_path, 'rb') as file:
	        pdf_reader = PyPDF2.PdfReader(file)

	        for page in pdf_reader.pages:
	            # Treat a page with no extractable text as empty rather than
	            # letting re.search fail on a non-string value.
	            text = page.extract_text() or ''

	            abstract_match = re.search(r'\bAbstract\b', text, re.IGNORECASE)
	            if abstract_match is None:
	                continue

	            after_heading = text[abstract_match.end():]
	            next_section_match = re.search(r'\bIntroduction\b', after_heading)
	            if next_section_match:
	                after_heading = after_heading[:next_section_match.start()]
	            # Only the first page mentioning "Abstract" is considered.
	            return after_heading.strip()

	    return ''


	def abstract_to_audio(insert_pdf):
	    """Summarize a PDF's abstract and synthesize the summary as speech.

	    Args:
	        insert_pdf: The uploaded PDF. Either a filesystem path or an object
	            exposing the path as ``.name`` (NOTE(review): which of the two
	            Gradio passes for ``inputs='file'`` depends on the installed
	            Gradio version -- both are accepted here).

	    Returns:
	        A ``(SAMPLE_RATE, audio_array)`` tuple, where ``audio_array`` is the
	        result of Bark's ``generate_audio`` on the summary sentence.

	    Raises:
	        gr.Error: If no file was supplied or no abstract could be extracted.
	    """
	    if insert_pdf is None:
	        raise gr.Error("Please upload a PDF file.")

	    # Resolve the upload to a path that open() can take. Path-like values are
	    # used directly; anything else is assumed to carry its path in `.name`.
	    if isinstance(insert_pdf, (str, bytes, os.PathLike)):
	        pdf_path = insert_pdf
	    else:
	        pdf_path = insert_pdf.name

	    abstract = _extract_abstract(pdf_path)
	    if not abstract:
	        # Summarizing an empty string is meaningless; fail with a clear message.
	        raise gr.Error("No abstract could be found in the uploaded PDF.")

	    # truncation=True keeps over-long abstracts within the model's input limit.
	    summarized = _get_summarizer()(abstract, truncation=True)
	    tss_prompt = summarized[0]['summary_text']
	    print(tss_prompt)

	    # Generate audio that speaks the summary using Bark.
	    # Download and load all models before generating.
	    preload_models()
	    audio_array = generate_audio(tss_prompt)
	    return (SAMPLE_RATE, audio_array)






	# Build the Gradio UI: one file input, one audio output. The example PDFs
	# are resolved relative to this script's directory.
	my_app = gr.Interface(
	    fn=abstract_to_audio,
	    inputs='file',
	    outputs='audio',
	    title="PDF Abstract Summarizer",
	    description="Extracts abstracts from PDFs and generates audio summaries. This app only accepts PDFs with abstracts.",
	    examples=[
	        os.path.join(os.path.dirname(__file__), pdf_name)
	        for pdf_name in (
	            "Hidden_Technical_Debt_in_MLSystems.pdf",
	            "Productivity_Effects_of_GenAI.pdf",
	        )
	    ],
	    cache_examples=True,
	)
	my_app.launch()