Spaces:

rashisinghal
/

ai_speech_application

Runtime error

App Files Files Community

rashisinghal committed on Dec 11, 2023

Commit

a3908d4

1 Parent(s): 201ca69

Upload app.py

Browse files

Files changed (1) hide show

app.py +201 -0

app.py ADDED Viewed

	@@ -0,0 +1,201 @@

+# https://huggingface.co/spaces/rashisinghal/ai_speech_application
+# Dependencies to install first (notebook-style commands), followed by the imports
+"""
+!pip install pymupdf
+!pip install git+https://github.com/huggingface/transformers.git
+!pip install datasets sentencepiece
+!pip install unidecode
+!pip install transformers
+!pip install gradio
+"""
+import functools
+import re
+from collections import Counter
+
+import fitz
+import gradio as gr
+import numpy as np
+import pandas as pd
+import soundfile as sf
+import torch
+from datasets import load_dataset
+from IPython.display import Audio
+from transformers import pipeline
+from transformers import SpeechT5HifiGan
+from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech
+from unidecode import unidecode
+# Here is the code
+def pdf_to_speech(pdf_path):
+# The “doc” is a PyMuPDF’s Document class representing the whole document. We will get every necessary information from it, including the text.
+    doc = fitz.open(pdf_path)
+# We need to isolate various sections of the page in order to search for Abstract Paragraph. It can be done by passing the parameter “blocks” to the get_text() method.
+# The output is a list of tuple items, each item will look like this:
+# (x0, yo, x1, y1, "lines in the block", block_no, block_type)
+# Since our PDF is a multipage document we will using a loop to get the plain text from the document
+    for page in doc:
+        text = page.get_text()
+    output = page.get_text("blocks")
+# ANALYZING THE TEXT TO EXTRACT ABSTRACT
+# A span is an inline container that helps mark up a part of a text or a part of a document. In short, span is a small chunk of text.
+# To get the spans from the PDF file, we have passed the parameter “dict” into the get_text() method of the doc object.
+# The “block_dict” is a dictionary containing detailed information of all spans in a document.
+    block_dict = {}
+    page_num = 1
+    for page in doc: # Iterate all pages in the document
+        file_dict = page.get_text('dict') # Get the page dictionary
+        block = file_dict['blocks'] # Get the block information
+        block_dict[page_num] = block # Store in block dictionary
+        page_num += 1 # Increase the page value by 1
+# In this we will retrieve the spans and store them in a DataFrame as follow:
+# The code tries to loop over the page, blocks, and lines in a document. Then we will get every span in a line.
+# Although there are some properties in the spans, we care about the bbox (the bounding box), size, font, and text only.
+    spans = pd.DataFrame(columns=['xmin', 'ymin', 'xmax', 'ymax', 'text', 'tag'])
+    rows = []
+    for page_num, blocks in block_dict.items():
+        for block in blocks:
+            if block['type'] == 0:
+                for line in block['lines']:
+                    for span in line['spans']:
+                        xmin, ymin, xmax, ymax = list(span['bbox'])
+                        font_size = span['size']
+                        text = unidecode(span['text'])
+                        span_font = span['font']
+                        is_upper = False
+                        is_bold = False
+                        if "bold" in span_font.lower():
+                            is_bold = True
+                        if re.sub("[\(\[].*?[\)\]]", "", text).isupper():
+                            is_upper = True
+                        if text.replace(" ","") !=  "":
+                            rows.append((xmin, ymin, xmax, ymax, text, is_upper, is_bold, span_font, font_size))
+    span_df = pd.DataFrame(rows, columns=['xmin','ymin','xmax','ymax', 'text', 'is_upper','is_bold','span_font', 'font_size'])
+    span_scores=[]
+    span_num_occur={}
+    special = '[(_:/,#%\=@)]'
+    for index, span_row in span_df.iterrows():
+        score = round(span_row.font_size)
+        text = span_row.text
+        if not re.search(special, text):
+            if span_row.is_bold:
+                score +=1
+            if span_row.is_upper:
+                score +=1
+        span_scores.append(score)
+    values, counts = np.unique(span_scores, return_counts=True)
+# From this, we want to know the numer of unique text styles in the document, and the number of its occurrences.
+    values, counts = np.unique(span_scores, return_counts=True)
+    style_dict = {}
+    for value, count in zip(values, counts):
+        style_dict[value] = count
+    sorted(style_dict.items(), key=lambda x: x[1])
+# From this, we will be able to create a new column in our span dataframe for the tag information.
+# More the occurances means its a Paragraph and not the heading
+    p_size = max(style_dict, key=style_dict.get)
+    idx = 0
+    tag = {}
+    for size in sorted(values, reverse = True):
+        idx += 1
+        if size == p_size:
+            idx = 0
+            tag[size] = 'p'
+        if size > p_size:
+            tag[size] = 'h{0}'.format(idx)
+        if size < p_size:
+            tag[size] = 's{0}'.format(idx)
+    span_tags = [tag[score] for score in span_scores]
+    span_df['tag'] = span_tags
+# We’re now clear on which text is the headings and which one is the content in the document. This is very useful when extracting information
+# since we want all paragraphs below a heading will be grouped. We will create a new dataframe where we can store the text by headings.
+# Thus we can easily extract information based on headings.
+    headings_list = []
+    text_list = []
+    tmp = []
+    heading = ''
+    for index, span_row in span_df.iterrows():
+        text = span_row.text
+        tag = span_row.tag
+        if 'h' in tag:
+            headings_list.append(text)
+            text_list.append('\n'.join(tmp))
+            tmp = []
+            heading = text
+        else:
+            tmp.append(text)
+    text_list.append('\n'.join(tmp))
+    text_list = text_list[1:]
+    text_df = pd.DataFrame(zip(headings_list, text_list),columns=['heading', 'content'] )
+    # Extracting the content of the column of the dataframe where the another column named heading is Abstract.
+    # Basically, extracting the content of the paragraph abstract
+    str_abstract=text_df.loc[text_df['heading'] == 'Abstract', 'content'].item()
+    # Using the Summarization model pszemraj/long-t5-tglobal-base-sci-simplify in the pipeline in order to generate summary of text
+    new_summarized_pipeline= pipeline(task="summarization", model="pszemraj/long-t5-tglobal-base-sci-simplify")
+    summarized_text=new_summarized_pipeline(str_abstract)
+    # Creating string from the list of dictionary
+    str_summary = ",".join([item['summary_text'] for item in summarized_text])
+    # We tokenize the input with the processor. The input is the string that we generated of the summary
+    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+    model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
+    inputs = processor(text=str_summary, return_tensors="pt")
+    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+    speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
+    spectrogram = model.generate_speech(inputs["input_ids"], speaker_embeddings)
+    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+    with torch.no_grad():
+        speech = vocoder(spectrogram)
+# Generating the speech of the summarized one liner Abstract
+    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
+    sr=16000
+    return (sr,speech.numpy())
+    # Audio(speech, rate=16000)
+# Using Gradio Interface to specify the function name, inputs and outputs
+app = gr.Interface(fn=pdf_to_speech,
+                     inputs="file",
+                     outputs="audio",
+                     title="PDF Abstract to Audio Application",
+                     description="This App accepts PDF which has Abstract , summarises it and converts into Speech. Click to upload PDF with abstract.",
+                     theme="soft")
+app.launch()