Alberto Primerano committed on
Commit
0e6d852
·
1 Parent(s): fd6c4e0

Final Version

Browse files
Article 11 Hidden Technical Debt in Machine Learning Systems.pdf ADDED
Binary file (166 kB). View file
 
__pycache__/audio_processor.cpython-311.pyc ADDED
Binary file (1.21 kB). View file
 
__pycache__/pdf_exctraction.cpython-311.pyc ADDED
Binary file (3.17 kB). View file
 
__pycache__/pdf_processor.cpython-311.pyc ADDED
Binary file (1.86 kB). View file
 
__pycache__/pdf_read.cpython-311.pyc ADDED
Binary file (1.01 kB). View file
 
app.py CHANGED
@@ -1,15 +1,49 @@
 
 
 
 
 
 
 
1
  import gradio as gr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
- def greet(name):
4
- return "Hello " + name + "!!"
5
-
6
- iface = gr.Interface(
7
- fn=greet,
8
- inputs="file",
9
- outputs="audio",
10
- live=True,
11
- title="Audio Processor",
12
- description="Process audio files and return the processed audio.",
13
- )
14
- if __name__ == "__main__":
15
- iface.launch()
 
 
1
+ """
2
+
3
+ This module implements a Gradio interface to generate an audio summary
4
+ from the abstract of a PDF article.
5
+
6
+ """
7
+
8
  import gradio as gr
9
+ from gradio_pdf import PDF
10
+ from pdf_processor import summarize_abstract
11
+ from audio_processor import generate_audio
12
+
13
+ # Given a PDF file path, this function returns an audio summary of the abstract
14
def abstract_to_audio(pdf_path):
    """Return the spoken summary of the abstract of a PDF article.

    The abstract is summarized with ``summarize_abstract`` and the resulting
    text is synthesized with ``generate_audio``.

    Args:
        pdf_path (str): Path of the PDF file to process.

    Returns:
        tuple: ``(sampling_rate, waveform)`` where ``waveform`` is the
        transposed audio array produced by ``generate_audio``.
    """
    waveform, rate = generate_audio(summarize_abstract(pdf_path))
    # The rate comes first in the returned pair; the array is transposed.
    return rate, waveform.T
22
+
23
+
24
# Help text displayed inside the "Information: " accordion.
_INFO_TEXT = (
    "This application creates an audio summary of the Abstract of the uploaded article. \n"
    "Make sure that the uploaded article is in the expected format......\n"
    "Please note that the summarization is implemented using facebook/bart-large-cnn with maxlength= 50 to provide a meaningful summary and not \n"
    "only the first sentence."
)

# Header shown at the top of the page.
_HEADER_TEXT = (
    "\n"
    "# PDF abstract audio summarize\n"
    "Create an audio summary of the Abstract of the uploaded article."
)

with gr.Blocks() as abstract_audio:
    gr.Markdown(_HEADER_TEXT)

    # The PDF input and the audio output share one row.
    with gr.Row():
        pdf_input = PDF(label="PDF File ...", interactive=True)
        audio_output = gr.Audio(label="Audio Summary ...")

    # Clicking the button runs the full PDF -> summary -> audio chain.
    audio_abstract_button = gr.Button("Generate audio summary")
    audio_abstract_button.click(
        fn=abstract_to_audio,
        inputs=pdf_input,
        outputs=audio_output,
    )

    # Bundled sample article; results are not pre-computed (cache_examples=False).
    gr.Examples(
        examples=["Article 11 Hidden Technical Debt in Machine Learning Systems.pdf"],
        inputs=pdf_input,
        outputs=audio_output,
        fn=abstract_to_audio,
        cache_examples=False,
    )

    with gr.Accordion("Information: "):
        gr.Markdown(_INFO_TEXT)

abstract_audio.launch()
audio_processor.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import scipy
3
+ from transformers import pipeline
4
+
5
+
6
# Lazily created text-to-speech pipeline, shared by every call to generate_audio.
_SYNTHESISER = None


def generate_audio(one_sentence_summary):
    """Generate speech audio for the summary of the abstract of a PDF file.

    Args:
        one_sentence_summary (str): Text to be spoken.

    Returns:
        tuple: ``(audio, sampling_rate)`` taken from the ``"audio"`` and
        ``"sampling_rate"`` entries of the pipeline output.
    """
    global _SYNTHESISER
    if _SYNTHESISER is None:
        # Building the pipeline loads the model, so do it once on first use
        # instead of on every request.
        _SYNTHESISER = pipeline("text-to-speech", "suno/bark-small")
    speech = _SYNTHESISER(one_sentence_summary, forward_params={"do_sample": True})
    return speech["audio"], speech["sampling_rate"]
11
+
12
def convert_to_16_bit_wav(data):
    """Convert float32 audio samples to peak-normalized 16-bit integers.

    Based on:
    https://docs.scipy.org/doc/scipy/reference/generated/scipy.io.wavfile.write.html

    Args:
        data (numpy.ndarray): Audio samples.

    Returns:
        numpy.ndarray: For ``float32`` input, a new ``int16`` array scaled so
        that the largest absolute sample maps to 32767. Input of any other
        dtype is returned unchanged.
    """
    if data.dtype != np.float32:
        return data

    # Silent (all-zero) or empty input has no peak to normalize by; dividing
    # by it would produce NaNs (or make ``max`` raise on an empty array).
    peak = np.abs(data).max() if data.size else 0.0
    if peak > 0:
        data = data / peak
    return (data * 32767).astype(np.int16)
flagged/PDF File.../9c658cc31a3d3aca9f3f/Article 11 Hidden Technical Debt in Machine Learning Systems.pdf ADDED
Binary file (166 kB). View file
 
flagged/log.csv ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ name,output,flag,username,timestamp
2
+ ,,,,2023-12-01 08:46:01.539926
3
+ ,,,,2023-12-04 18:19:49.008896
4
+ "{""path"":""flagged/PDF File.../9c658cc31a3d3aca9f3f/Article 11 Hidden Technical Debt in Machine Learning Systems.pdf"",""url"":""http://127.0.0.1:7860/file=/private/var/folders/ch/cnzx1kvd1d9_f94xydxjcr3m0000gq/T/gradio/1eb131a34fbb508a9dd8b646950c65901d6f1a5b/Article 11 Hidden Technical Debt in Machine Learning Systems.pdf"",""size"":165614,""orig_name"":""Article 11 Hidden Technical Debt in Machine Learning Systems.pdf"",""mime_type"":""""}",,,,2023-12-05 12:14:20.913438
gradio_cached_examples/10/Audio Summary/b5d3212345fc9c05bd0b/audio.wav ADDED
Binary file (669 kB). View file
 
gradio_cached_examples/10/log.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ Audio Summary,flag,username,timestamp
2
+ "{""path"":""gradio_cached_examples/10/Audio Summary/b5d3212345fc9c05bd0b/audio.wav"",""url"":null,""size"":null,""orig_name"":""audio.wav"",""mime_type"":null}",,,2023-12-05 15:32:45.408395
gradio_cached_examples/5/Audio Summary/0ecec320413aacb22ba4/audio.wav ADDED
Binary file (664 kB). View file
 
gradio_cached_examples/5/log.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ Audio Summary,flag,username,timestamp
2
+ "{""path"":""gradio_cached_examples/5/Audio Summary/0ecec320413aacb22ba4/audio.wav"",""url"":null,""size"":null,""orig_name"":""audio.wav"",""mime_type"":null}",,,2023-12-05 15:36:16.937479
gradio_cached_examples/9/Audio Summary/f9814d212607b3ca1d85/audio.wav ADDED
Binary file (658 kB). View file
 
gradio_cached_examples/9/log.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ Audio Summary,flag,username,timestamp
2
+ "{""path"":""gradio_cached_examples/9/Audio Summary/f9814d212607b3ca1d85/audio.wav"",""url"":null,""size"":null,""orig_name"":""audio.wav"",""mime_type"":null}",,,2023-12-05 15:39:46.300924
pdf_exctraction.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This module contains the code provided for extracting content from a PDF file
2
+
3
+ import gradio as gr
4
+ import PyPDF2
5
+ import pdfplumber
6
+ from pdfminer.high_level import extract_pages
7
+ from pdfminer.layout import LTTextContainer, LTChar
8
+
9
def text_extraction(element):
    """Return the text of a layout element together with the formats it uses.

    Args:
        element: Layout element exposing ``get_text()`` and iterable over its
            text lines.

    Returns:
        tuple: ``(line_text, format_per_line)`` where ``format_per_line`` is a
        list of the distinct font names and font sizes (mixed in one list, in
        no particular order) of the characters found in the element.
    """
    line_text = element.get_text()

    # Collect every font name and font size seen, without duplicates.
    formats_seen = set()
    for text_line in element:
        if not isinstance(text_line, LTTextContainer):
            continue
        for character in text_line:
            if isinstance(character, LTChar):
                formats_seen.update((character.fontname, character.size))

    return (line_text, list(formats_seen))
30
+
31
def read_pdf(pdf_path):
    """Extract the text of every page of a PDF file.

    Args:
        pdf_path (str): Path of the PDF file to read.

    Returns:
        dict: Maps ``"Page_<n>"`` (``n`` starting at 0) to the five-item list
        ``[page_text, line_format, text_from_images, text_from_tables,
        page_content]``. ``page_text`` and ``page_content`` hold the text of
        each text element from the top of the page to the bottom,
        ``line_format`` holds the formats of each of those elements, and the
        image/table lists are always empty (nothing is extracted for them).

    Raises:
        gr.Error: If ``pdf_path`` is ``None``.
    """
    if pdf_path is None:
        raise gr.Error("A PDF file must be specified!")

    # NOTE: the previous implementation also opened the file with PyPDF2 and
    # re-opened it with pdfplumber for every page without using or closing
    # those handles; only pdfminer is actually needed to read the text.
    text_per_page = {}
    for pagenum, page in enumerate(extract_pages(pdf_path)):
        page_text = []
        line_format = []
        page_content = []
        # Kept only to preserve the shape of the returned structure.
        text_from_images = []
        text_from_tables = []

        # Order the elements as they appear on the page: highest top edge
        # (largest y1) first.
        elements = sorted(page, key=lambda item: item.y1, reverse=True)

        for element in elements:
            if not isinstance(element, LTTextContainer):
                continue
            line_text, format_per_line = text_extraction(element)
            page_text.append(line_text)
            line_format.append(format_per_line)
            page_content.append(line_text)

        text_per_page['Page_' + str(pagenum)] = [
            page_text,
            line_format,
            text_from_images,
            text_from_tables,
            page_content,
        ]
    return text_per_page
pdf_processor.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from pdf_exctraction import read_pdf
3
+ from transformers import pipeline
4
+
5
+ # Extract the Abstract from the content of the document
6
def extract_abstract(pdf_path):
    """Return the abstract of a PDF article.

    The abstract is taken to be the text element that directly follows the
    first ``"Abstract\n"`` heading found in the document.

    Args:
        pdf_path (str): Path of the PDF file to read.

    Returns:
        str: Text of the element following the abstract heading.

    Raises:
        gr.Error: If no heading followed by non-empty text is found.
    """
    for page_content in read_pdf(pdf_path).values():
        page_text = page_content[0]
        try:
            heading_index = page_text.index("Abstract\n")
        except ValueError:
            # No abstract heading on this page.
            continue
        # The slice is empty when the heading is the last element on the page.
        following = page_text[heading_index + 1:heading_index + 2]
        if following and following[0] != "":
            # Stop at the first abstract so that a later page cannot
            # overwrite it.
            return following[0]
    raise gr.Error("The article does not contains an Abstract or it is not in the expected format")
17
+
18
+ # Summarize the abstract extracted from the PDF
19
# Lazily created summarization pipeline, shared by every call to summarize_abstract.
_SUMMARIZER = None


def summarize_abstract(pdf_path):
    """Summarize the abstract of a PDF article.

    Args:
        pdf_path (str): Path of the PDF file to read.

    Returns:
        str: The ``"summary_text"`` of the first summarization result.

    Raises:
        gr.Error: Propagated from ``extract_abstract`` when no abstract is
            found.
    """
    global _SUMMARIZER

    abstract = extract_abstract(pdf_path)
    # Join the lines, then drop "- " so that words hyphenated across a line
    # break are glued back together.
    abstract = abstract.replace("\n", " ").replace("- ", "")
    # Collapse the whitespace runs left behind by the joins above.
    abstract = " ".join(abstract.split())

    if _SUMMARIZER is None:
        # Building the pipeline loads the model, so do it once on first use
        # instead of on every request.
        _SUMMARIZER = pipeline("summarization", model="facebook/bart-large-cnn")
    result = _SUMMARIZER(
        abstract,
        max_length=50,
        min_length=10,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    return result[0]['summary_text']