Spaces:
Runtime error
Runtime error
| # -*- coding: utf-8 -*- | |
| """app.py.ipynb | |
| Automatically generated by Colaboratory. | |
| Original file is located at | |
| https://colab.research.google.com/drive/1zk7xuWSf7ii7zowOqNVLy0FwXYVHYE2V | |
| """ | |
| # https://huggingface.co/spaces/eHemink/assessment3_part2 | |
| # Here are the imports | |
| import PyPDF2 | |
| import re | |
| import transformers | |
| import scipy | |
| from transformers import pipeline | |
| from bark import SAMPLE_RATE, generate_audio, preload_models | |
| import gradio as gr | |
| import os | |
| # Here is the code | |
# Lazily-built summarization pipeline, shared across calls so the model is
# loaded once per process instead of once per request.
_SUMMARIZER = None


def _get_summarizer():
    """Return the shared summarization pipeline, creating it on first use."""
    global _SUMMARIZER
    if _SUMMARIZER is None:
        _SUMMARIZER = pipeline(
            task="summarization", model="lidiya/bart-large-xsum-samsum"
        )
    return _SUMMARIZER


def _extract_abstract(pdf_path):
    """Return the abstract text from the PDF at *pdf_path*, or '' if none is found.

    The abstract is taken from the first page whose text contains the word
    "Abstract" (case-insensitive). It runs from just after that word up to
    the next "Introduction" heading on the same page, or to the end of the
    page when no such heading follows.
    """
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        for page in pdf_reader.pages:
            # Guard against a page that yields no text at all.
            text = page.extract_text() or ''
            abstract_match = re.search(r'\bAbstract\b', text, re.IGNORECASE)
            if abstract_match is None:
                continue
            # Everything after the "Abstract" heading on this page.
            remainder = text[abstract_match.end():]
            next_section_match = re.search(r'\bIntroduction\b', remainder)
            if next_section_match:
                remainder = remainder[:next_section_match.start()]
            # Only the first page that mentions "Abstract" is considered.
            return remainder.strip()
    return ''


def abstract_to_audio(insert_pdf):
    """Summarize a PDF's abstract and synthesize the summary as speech.

    Args:
        insert_pdf: The uploaded PDF, either a filesystem path or an object
            exposing the path through a ``name`` attribute (the Gradio
            'file' input may hand over either form -- NOTE(review): confirm
            against the installed Gradio version).

    Returns:
        A ``(SAMPLE_RATE, audio_array)`` tuple suitable for a Gradio audio
        output.

    Raises:
        gr.Error: If no file was provided or no abstract could be located.
    """
    if insert_pdf is None:
        raise gr.Error("Please upload a PDF file.")

    # Accept both a plain path and a temp-file wrapper carrying ``.name``;
    # passing the wrapper itself to open() would raise a TypeError.
    pdf_path = getattr(insert_pdf, "name", insert_pdf)

    abstract = _extract_abstract(pdf_path)
    if not abstract:
        # Summarizing an empty string would only produce meaningless audio.
        raise gr.Error("No abstract could be found in this PDF.")

    # truncation=True keeps over-long abstracts within the model's input
    # limit instead of failing inside the model.
    summarized = _get_summarizer()(abstract, truncation=True)
    tss_prompt = summarized[0]['summary_text']
    print(tss_prompt)

    # Download and load the Bark models, then speak the summary.
    preload_models()
    audio_array = generate_audio(tss_prompt)
    return (SAMPLE_RATE, audio_array)
# Assemble the Gradio UI: a file upload goes in, synthesized audio comes out.
# The two example PDFs are resolved relative to this script's directory.
my_app = gr.Interface(
    fn=abstract_to_audio,
    inputs='file',
    outputs='audio',
    title="PDF Abstract Summarizer",
    description="Extracts abstracts from PDFs and generates audio summaries. This app only accepts PDFs with abstracts.",
    examples=[
        os.path.join(os.path.dirname(__file__), pdf_name)
        for pdf_name in (
            "Hidden_Technical_Debt_in_MLSystems.pdf",
            "Productivity_Effects_of_GenAI.pdf",
        )
    ],
    cache_examples=True,
)

# Start serving the interface.
my_app.launch()