Commit a3908d4 · Parent: 201ca69
Upload app.py
app.py ADDED
@@ -0,0 +1,201 @@
# https://huggingface.co/spaces/rashisinghal/ai_speech_application

# Here are the imports
"""
!pip install pymupdf
!pip install git+https://github.com/huggingface/transformers.git
!pip install datasets sentencepiece
!pip install unidecode
!pip install transformers
!pip install gradio
"""
import gradio as gr
import fitz  # PyMuPDF
import torch
from unidecode import unidecode
import pandas as pd
import numpy as np
import re
import soundfile as sf
from IPython.display import Audio
from datasets import load_dataset
from transformers import pipeline
from transformers import SpeechT5HifiGan
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech


# Here is the code

def pdf_to_speech(pdf_path):
    # Gradio's "file" input passes a temporary-file object rather than a plain
    # path string, so unwrap it before handing it to PyMuPDF.
    if hasattr(pdf_path, "name"):
        pdf_path = pdf_path.name

    # "doc" is a PyMuPDF Document representing the whole file. We will get all
    # the necessary information from it, including the text.
    doc = fitz.open(pdf_path)

    # We need to isolate the sections of a page in order to search for the
    # Abstract paragraph. This can be done by passing "blocks" to get_text().
    # The output is a list of tuples, one per block, each of the form:
    # (x0, y0, x1, y1, "lines in the block", block_no, block_type)

    # Since a PDF is usually a multipage document, we loop over the pages to
    # pull out the plain text and the block layout.
    for page in doc:
        text = page.get_text()
        output = page.get_text("blocks")

    # ANALYZING THE TEXT TO EXTRACT THE ABSTRACT

    # A span is an inline container that marks up part of a text or document;
    # in short, a span is a small chunk of text. To get the spans from the PDF
    # we pass "dict" to the page's get_text() method. "block_dict" ends up as
    # a dictionary holding the detailed span information for every page.

    block_dict = {}
    page_num = 1
    for page in doc:                       # Iterate over all pages in the document
        file_dict = page.get_text('dict')  # Get the page dictionary
        block = file_dict['blocks']        # Get the block information
        block_dict[page_num] = block       # Store it under the page number
        page_num += 1                      # Increase the page number by 1

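    # For orientation, a heavily truncated sketch of what get_text('dict')
    # returns (keys follow PyMuPDF's TextPage dictionary; values are made up):
    #
    # {'width': 612.0, 'height': 792.0,
    #  'blocks': [{'type': 0, 'bbox': (56.7, 74.9, 555.3, 86.1),
    #              'lines': [{'spans': [{'size': 11.96, 'font': 'Times-Bold',
    #                                    'text': 'Abstract',
    #                                    'bbox': (56.7, 74.9, 130.2, 86.1)}]}]}]}
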
    # Next we retrieve the spans and store them in a DataFrame. The code loops
    # over the pages, blocks, and lines of the document, and then visits every
    # span in each line. A span has several properties, but we only care about
    # the bbox (bounding box), size, font, and text.

    rows = []
    for page_num, blocks in block_dict.items():
        for block in blocks:
            if block['type'] == 0:  # type 0 is a text block (type 1 is an image block)
                for line in block['lines']:
                    for span in line['spans']:
                        xmin, ymin, xmax, ymax = list(span['bbox'])
                        font_size = span['size']
                        text = unidecode(span['text'])
                        span_font = span['font']
                        is_upper = False
                        is_bold = False
                        if "bold" in span_font.lower():
                            is_bold = True
                        # Ignore bracketed material when testing for upper case
                        if re.sub(r"[\(\[].*?[\)\]]", "", text).isupper():
                            is_upper = True
                        if text.replace(" ", "") != "":
                            rows.append((xmin, ymin, xmax, ymax, text, is_upper, is_bold, span_font, font_size))
    span_df = pd.DataFrame(rows, columns=['xmin', 'ymin', 'xmax', 'ymax', 'text', 'is_upper', 'is_bold', 'span_font', 'font_size'])
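    # One row of span_df might now look like this (values are illustrative):
    # xmin=56.7, ymin=74.9, xmax=130.2, ymax=86.1, text='Abstract',
    # is_upper=False, is_bold=True, span_font='Times-Bold', font_size=11.96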

    # Score every span: the score starts from the rounded font size and gets a
    # bonus for bold or upper-case text, so headings score higher than body text.
    span_scores = []
    special = r'[(_:/,#%\=@)]'
    for index, span_row in span_df.iterrows():
        score = round(span_row.font_size)
        text = span_row.text
        if not re.search(special, text):
            if span_row.is_bold:
                score += 1
            if span_row.is_upper:
                score += 1
        span_scores.append(score)

    # From this, we want to know the number of unique text styles in the
    # document and the number of occurrences of each.
    values, counts = np.unique(span_scores, return_counts=True)
    style_dict = {}
    for value, count in zip(values, counts):
        style_dict[value] = count

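    # For example (made-up counts), style_dict might be {10: 312, 12: 14, 9: 41}:
    # the style with by far the most occurrences (10) is the body-text style.
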
    # From this we can create a tag column in the span DataFrame. The style
    # with the most occurrences is the paragraph style, not a heading: 'p' for
    # the paragraph size, 'h1', 'h2', ... for larger sizes, and 's1', 's2', ...
    # for smaller ones.
    p_size = max(style_dict, key=style_dict.get)
    idx = 0
    tag = {}
    for size in sorted(values, reverse=True):
        idx += 1
        if size == p_size:
            idx = 0
            tag[size] = 'p'
        if size > p_size:
            tag[size] = 'h{0}'.format(idx)
        if size < p_size:
            tag[size] = 's{0}'.format(idx)

    span_tags = [tag[score] for score in span_scores]
    span_df['tag'] = span_tags
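    # With the made-up counts above, sorted(values, reverse=True) is [12, 10, 9]
    # and the loop produces tag = {12: 'h1', 10: 'p', 9: 's1'}: one heading
    # level above the body text and one sub-text level below it.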

    # We now know which text is a heading and which is content. This is very
    # useful for extraction, since all the paragraphs below a heading should
    # be grouped under it. We build a new DataFrame that stores the text by
    # heading, so information can be pulled out by heading name.

    headings_list = []
    text_list = []
    tmp = []
    heading = ''

    for index, span_row in span_df.iterrows():
        text = span_row.text
        tag = span_row.tag
        if 'h' in tag:
            headings_list.append(text)
            text_list.append('\n'.join(tmp))
            tmp = []
            heading = text
        else:
            tmp.append(text)
    text_list.append('\n'.join(tmp))
    text_list = text_list[1:]  # Drop whatever precedes the first heading
    text_df = pd.DataFrame(zip(headings_list, text_list), columns=['heading', 'content'])
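    # text_df now looks roughly like this (rows depend on the input paper):
    #      heading          content
    # 0    Abstract         'We propose a method for ...'
    # 1    1. Introduction  'Recent advances in ...'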

    # Extract the 'content' cell whose 'heading' is 'Abstract', i.e. the text
    # of the abstract paragraph. Note that .item() assumes the document has
    # exactly one heading called 'Abstract'.
    str_abstract = text_df.loc[text_df['heading'] == 'Abstract', 'content'].item()

    # Use the summarization model pszemraj/long-t5-tglobal-base-sci-simplify
    # in a pipeline to generate a summary of the abstract.
    new_summarized_pipeline = pipeline(task="summarization", model="pszemraj/long-t5-tglobal-base-sci-simplify")
    summarized_text = new_summarized_pipeline(str_abstract)

    # The pipeline returns a list of dictionaries; join them into one string.
    str_summary = ",".join([item['summary_text'] for item in summarized_text])
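    # The pipeline output has the shape [{'summary_text': '...'}], so for a
    # single input str_summary is simply that one summary string.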

    # We tokenize the input with the processor. The input is the summary
    # string that we just generated.
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")

    inputs = processor(text=str_summary, return_tensors="pt")

    # SpeechT5 conditions the voice on a speaker embedding; we take an
    # x-vector from the CMU ARCTIC speaker-embedding dataset.
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
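    # Each x-vector is a 512-dimensional vector characterising one speaker, and
    # unsqueeze(0) adds the batch dimension, giving a (1, 512) tensor. Index
    # 7306 selects one particular voice; picking a different row of the dataset
    # changes the voice of the generated speech.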

    # The HiFi-GAN vocoder converts the model's mel spectrogram into a
    # waveform. Passing it to generate_speech() via the vocoder argument runs
    # spectrogram generation and vocoding in a single call.
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

    # Generating the speech of the summarized one-liner abstract
    with torch.no_grad():
        speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)

    # SpeechT5 produces audio at a 16 kHz sampling rate. (In a notebook you
    # could listen directly with Audio(speech.numpy(), rate=16000).)
    sr = 16000
    return (sr, speech.numpy())


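# A quick local smoke test, assuming some paper.pdf with an 'Abstract' heading
# (hypothetical file name), could look like:
#
#   sr, wav = pdf_to_speech("paper.pdf")
#   sf.write("abstract.wav", wav, sr)
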
# Using a Gradio Interface to wire up the function, its input, and its output
app = gr.Interface(fn=pdf_to_speech,
                   inputs="file",
                   outputs="audio",
                   title="PDF Abstract to Audio Application",
                   description="This app accepts a PDF that has an Abstract, summarises it, and converts the summary into speech. Click to upload a PDF with an abstract.",
                   theme="soft")

app.launch()