rashisinghal committed on
Commit
a3908d4
·
1 Parent(s): 201ca69

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +201 -0
app.py ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # https://huggingface.co/spaces/rashisinghal/ai_speech_application
2
+
3
+ # Here are the imports
4
+ """
5
+ !pip install pymupdf
6
+ !pip install git+https://github.com/huggingface/transformers.git
7
+ !pip install datasets sentencepiece
8
+ !pip install unidecode
9
+ !pip install transformers
10
+ !pip install gradio
11
+ """
12
+ import gradio as gr
13
+ import fitz
14
+ import torch
15
+ from unidecode import unidecode
16
+ import pandas as pd
17
+ import numpy as np
18
+ import re
19
+ import soundfile as sf
20
+ from IPython.display import Audio
21
+ from datasets import load_dataset
22
+ from transformers import pipeline
23
+ from transformers import SpeechT5HifiGan
24
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech
25
+
26
+
27
+ # Here is the code
28
+
29
+
30
# Sample rate paired with the synthesized waveform in the return value.
_SAMPLE_RATE = 16000

# Parenthesised / bracketed fragments are removed before the upper-case test.
_BRACKETED = re.compile(r"[\(\[].*?[\)\]]")

# Spans containing any of these characters never receive the bold/upper-case
# score bonus.
_SPECIAL_CHARS = re.compile(r"[(_:/,#%\=@)]")

# Filled on first use so the models are loaded once per process rather than
# once per request.
_MODEL_CACHE = {}


def _extract_spans(doc):
    """Return ``(text, is_upper, is_bold, font_size)`` for every non-blank span.

    Walks the pages of *doc*, then the blocks, lines and spans of each page's
    ``get_text("dict")`` result. Only blocks whose ``type`` is 0 are read, and
    span text is passed through ``unidecode`` before any further processing.
    """
    spans = []
    for page in doc:
        for block in page.get_text("dict")["blocks"]:
            if block["type"] != 0:
                continue
            for line in block["lines"]:
                for span in line["spans"]:
                    text = unidecode(span["text"])
                    if text.replace(" ", "") == "":
                        continue  # whitespace-only span carries no content
                    is_bold = "bold" in span["font"].lower()
                    is_upper = _BRACKETED.sub("", text).isupper()
                    spans.append((text, is_upper, is_bold, span["size"]))
    return spans


def _score_spans(spans):
    """Return one integer style score per span.

    The score is the rounded font size, plus one if the span is bold and plus
    one if it is upper-case; the bonuses are skipped when the text contains a
    character from ``_SPECIAL_CHARS``.
    """
    scores = []
    for text, is_upper, is_bold, font_size in spans:
        score = round(font_size)
        if not _SPECIAL_CHARS.search(text):
            score += int(is_bold) + int(is_upper)
        scores.append(score)
    return scores


def _tags_by_score(scores):
    """Map every distinct score to a tag: ``'p'``, ``'h<n>'`` or ``'s<n>'``.

    The most frequent score is treated as paragraph text (``'p'``). Larger
    scores become headings numbered from the largest (``h1``, ``h2``, ...);
    smaller scores become ``s1``, ``s2``, ... in descending order.
    """
    values, counts = np.unique(scores, return_counts=True)
    # ``values`` is sorted ascending and argmax returns the first maximum, so
    # ties resolve to the smallest score.
    p_size = values[np.argmax(counts)]

    tags = {}
    idx = 0
    for size in sorted(values, reverse=True):
        idx += 1
        if size == p_size:
            idx = 0  # restart numbering for the sub-paragraph styles
            tags[size] = "p"
        elif size > p_size:
            tags[size] = "h{0}".format(idx)
        else:
            tags[size] = "s{0}".format(idx)
    return tags


def _group_by_heading(spans, scores, tags):
    """Return ``(heading, content)`` pairs in document order.

    Every heading-tagged span opens a new section; the following non-heading
    spans are joined with newlines as that section's content. Text that
    appears before the first heading is dropped.
    """
    sections = []
    for (text, _, _, _), score in zip(spans, scores):
        if tags[score].startswith("h"):
            sections.append((text, []))
        elif sections:
            sections[-1][1].append(text)
    return [(heading, "\n".join(lines)) for heading, lines in sections]


def _find_abstract(sections):
    """Return the content of the section headed "Abstract".

    An exact ``'Abstract'`` heading is preferred; if none exists, headings are
    compared ignoring case and surrounding whitespace. The first match wins.

    Raises:
        ValueError: if no such heading exists or its content is blank.
    """
    matches = [content for heading, content in sections if heading == "Abstract"]
    if not matches:
        matches = [
            content
            for heading, content in sections
            if heading.strip().casefold() == "abstract"
        ]
    if not matches:
        raise ValueError("No 'Abstract' heading was found in the PDF.")
    abstract = matches[0]
    if not abstract.strip():
        raise ValueError("The 'Abstract' section of the PDF is empty.")
    return abstract


def _load_models():
    """Return the summarizer, TTS processor/model, vocoder and speaker vector.

    Everything is loaded on the first call and kept in ``_MODEL_CACHE``.
    NOTE(review): no lock is taken, so two simultaneous first requests may
    both load the models; the result is still correct.
    """
    if not _MODEL_CACHE:
        # Build into a local dict first so a failed load never leaves a
        # partially filled cache behind.
        loaded = {
            "summarizer": pipeline(
                task="summarization",
                model="pszemraj/long-t5-tglobal-base-sci-simplify",
            ),
            "processor": SpeechT5Processor.from_pretrained("microsoft/speecht5_tts"),
            "tts": SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts"),
            "vocoder": SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan"),
        }
        embeddings = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
        loaded["speaker"] = torch.tensor(embeddings[7306]["xvector"]).unsqueeze(0)
        _MODEL_CACHE.update(loaded)
    return _MODEL_CACHE


def pdf_to_speech(pdf_path):
    """Summarize the abstract of a PDF and synthesize the summary as speech.

    The text spans of the PDF are scored by font size, boldness and case; the
    most common score is taken as body text and larger scores as headings.
    The text under the "Abstract" heading is summarized and the summary is
    converted to audio with SpeechT5.

    Args:
        pdf_path: Path to the PDF, or an object exposing the path as ``.name``
            (for example an uploaded-file wrapper).

    Returns:
        A ``(sample_rate, waveform)`` tuple where ``sample_rate`` is 16000 and
        ``waveform`` is the NumPy array produced by ``speech.numpy()``.

    Raises:
        ValueError: if no file is given, the PDF contains no text, or no
            non-empty "Abstract" section can be found.
    """
    if pdf_path is None:
        raise ValueError("Please upload a PDF file.")
    path = getattr(pdf_path, "name", pdf_path)

    # The context manager closes the document as soon as the spans are read.
    with fitz.open(path) as doc:
        spans = _extract_spans(doc)
    if not spans:
        raise ValueError("No text could be extracted from the PDF.")

    scores = _score_spans(spans)
    tags = _tags_by_score(scores)
    abstract = _find_abstract(_group_by_heading(spans, scores, tags))

    models = _load_models()
    summary_items = models["summarizer"](abstract)
    # The pipeline returns a list of dicts; join their summaries into one string.
    summary = ",".join([item["summary_text"] for item in summary_items])

    inputs = models["processor"](text=summary, return_tensors="pt")
    # Passing the vocoder makes generate_speech return the waveform directly,
    # so no separate spectrogram pass is needed.
    speech = models["tts"].generate_speech(
        inputs["input_ids"], models["speaker"], vocoder=models["vocoder"]
    )
    return (_SAMPLE_RATE, speech.numpy())
191
+
192
+
193
+ # Using Gradio Interface to specify the function name, inputs and outputs
194
# Gradio UI: a single file upload goes in, an audio player comes out.
app = gr.Interface(
    fn=pdf_to_speech,
    inputs="file",
    outputs="audio",
    title="PDF Abstract to Audio Application",
    description="This App accepts PDF which has Abstract , summarises it and converts into Speech. Click to upload PDF with abstract.",
    theme="soft",
)

# Start the server only when this file is run as a script, so importing the
# module does not launch it.
if __name__ == "__main__":
    app.launch()