Linseypass committed on
Commit
5001698
·
1 Parent(s): c2ce80d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +117 -87
app.py CHANGED
@@ -27,113 +27,143 @@ stop_token_ids = [0]
27
  print('Guanaco model loaded into memory.')
28
 
29
 
30
def generate(title, abstract):
    """Produce keyphrases, a keyphrase elaboration, and a plain-language
    summary for a paper, given its title and abstract.

    Parameters:
        title: paper title string.
        abstract: abstract text; a leading "abstract"/"abstract." marker
            (any case) is stripped before sentence tokenization.

    Returns:
        (keyword_string, responseTwo, response) where
        keyword_string -- comma-joined extracted keyphrases ("" when the
            abstract has fewer than two sentences),
        responseTwo    -- LM elaboration on those keyphrases ("" when skipped),
        response       -- plain-language summary of the title plus the first
            two abstract sentences.

    NOTE(review): relies on module-level globals `tok` (tokenizer), `m`
    (causal LM), and the `sent_tokenize` / `KeyBERT` /
    `KeyphraseCountVectorizer` imports — confirm against the file header.
    (The previous docstring claimed this wrote `sample-data.jsonl`; no file
    is ever written, so that claim has been removed.)
    """
    print("Started running.")
    # Strip a leading "abstract." / "abstract" marker (case-insensitive)
    # that often precedes pasted abstracts.
    text = abstract
    if text.lower()[0:9] == "abstract.":
        text = text[9:]
    elif text.lower()[0:8] == "abstract":
        text = text[8:]
    sentences = sent_tokenize(text)
    print("Tokenized abstract to sentences.")

    '''
    This is for summarization
    '''
    # Document = title + up to the first two abstract sentences. With one
    # or zero sentences the keyphrase step below is skipped as meaningless.
    # (Removed the redundant `newline`/`obj` dict aliasing and the dead
    # `text = doc` store from the original.)
    tooShortForKeyword = len(sentences) <= 1
    if len(sentences) > 1:
        doc = title + ". " + sentences[0] + " " + sentences[1]
    elif len(sentences) == 1:
        doc = title + ". " + sentences[0]
    else:
        doc = title
    prompt = """
    Can you explain the main idea of what is being studied in the following paragraph for someone who is not familiar with the topic. Comment on areas of application.:
    """
    formatted_prompt = (
        f"A chat between a curious human and an artificial intelligence assistant."
        f"The assistant gives helpful, detailed, and polite answers to the user's questions.\n"
        f"### Human: {prompt + doc} \n"
        f"### Assistant:"
    )
    inputs = tok(formatted_prompt, return_tensors="pt")  # .to("cuda:1")
    outputs = m.generate(inputs=inputs.input_ids, max_new_tokens=300)
    output = tok.decode(outputs[0], skip_special_tokens=True)
    index_response = output.find("### Assistant: ") + 15
    # Drop a leading "Certainly!" filler phrase if the model emitted one.
    if output[index_response:index_response + 10] == "Certainly!":
        index_response += 10
    end_response = output.rfind('.') + 1
    response = output[index_response:end_response]
    print('Plain Language Summary Created.')

    '''
    Keyphrase extraction.
    '''
    # The document is the title and first two sentences of the abstract.
    my_keywords = []
    if not tooShortForKeyword:
        kw_model = KeyBERT(model="all-MiniLM-L6-v2")
        vectorizer = KeyphraseCountVectorizer()
        top_n = 2
        keywords = kw_model.extract_keywords(doc, stop_words="english", top_n=top_n, vectorizer=vectorizer, use_mmr=True)
        # Keep only keyphrases that are not substrings of another extracted
        # keyphrase (prefer the longer, more specific phrase).
        for i in range(top_n):
            if not any(i != j and keywords[i][0] in keywords[j][0] for j in range(top_n)):
                my_keywords.append(keywords[i][0])
        for entry in my_keywords:
            print(entry)

    '''
    This is for feeding the keyphrases into Guanaco.
    '''
    responseTwo = ""
    keyword_string = ""
    if not tooShortForKeyword:
        keyword_string = ', '.join(my_keywords)
        prompt = "What is the purpose of studying " + keyword_string + "? Comment on areas of application."
        formatted_prompt = (
            f"A chat between a curious human and an artificial intelligence assistant."
            f"The assistant gives helpful, detailed, and polite answers to the user's questions.\n"
            f"### Human: {prompt} \n"
            f"### Assistant:"
        )
        inputs = tok(formatted_prompt, return_tensors="pt")  # .to("cuda:2")
        outputs = m.generate(inputs=inputs.input_ids, max_new_tokens=300)
        output = tok.decode(outputs[0], skip_special_tokens=True)
        index_response = output.find("### Assistant: ") + 15
        end_response = output.rfind('.') + 1
        responseTwo = output[index_response:end_response]
        print('Keyphrase elaboration ran.')
    return keyword_string, responseTwo, response
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
 
132
# Gradio UI: two text inputs (title, abstract) fed through `generate`,
# producing three text outputs.
input_boxes = [gr.Textbox(label="Title"), gr.Textbox(label="Abstract")]
output_boxes = [
    gr.Textbox(label="Keyphrases"),
    gr.Textbox(label="Keyphrase Elaboration"),
    gr.Textbox(label="Plain Language Summary"),
]
demo = gr.Interface(fn=generate, inputs=input_boxes, outputs=output_boxes)
demo.launch()
138
 
 
139
 
 
27
  print('Guanaco model loaded into memory.')
28
 
29
 
30
def keyphraseElaboration(title, abstract, userGivenKeyphrases, maxTokensElaboration, numAbstractSentencesKeyphrase):
    """Extract keyphrases (or accept user-supplied ones) and ask the LM to
    elaborate on them.

    Parameters:
        title: paper title.
        abstract: abstract text; a leading "abstract"/"abstract." marker
            (any case) is stripped before sentence tokenization.
        userGivenKeyphrases: user-provided keyphrase string; when non-empty,
            KeyBERT extraction is skipped and this is used verbatim.
        maxTokensElaboration: max_new_tokens for the generation call.
        numAbstractSentencesKeyphrase: how many leading abstract sentences
            (plus the title) feed the keyphrase extractor.

    Returns:
        (keywordString, response): keyphrases actually used (comma-joined;
        "" when the abstract was too short and none were given) and the
        model's elaboration ("" when generation was skipped).

    NOTE(review): relies on module-level globals `tok`, `model`,
    `deviceElaboration` and the `sent_tokenize` / `KeyBERT` /
    `KeyphraseCountVectorizer` imports — confirm against the file header.
    """
    numKeywordsToExtract = 2
    # BUG FIX: previously `response` was assigned only inside the
    # `if keywordString != "":` branch, so a short abstract with no
    # user-given keyphrases raised NameError at the return statement.
    # Initialize the outputs up front so every path returns cleanly.
    response = ""
    keyBERTKeywords = []
    # One or fewer abstract sentences -> extraction cannot give a
    # meaningful output.
    tooShort = True
    if userGivenKeyphrases == "":
        '''
        Process Abstract (eliminate word abstract at front and put into sentences)
        '''
        # eliminate word lowercase "abstract" or "abstract." at beginning of abstract text
        if abstract.lower()[0:9] == "abstract.":
            abstract = abstract[9:]
        elif abstract.lower()[0:8] == "abstract":
            abstract = abstract[8:]
        abstractSentences = sent_tokenize(abstract)
        numAbstractSentences = len(abstractSentences)
        if numAbstractSentences > 1:
            tooShort = False
            # Document = title + the first N sentences of the abstract.
            numAbstractSentencesKeyphrase = min(numAbstractSentences, numAbstractSentencesKeyphrase)
            doc = f"{title}. {' '.join(abstractSentences[:numAbstractSentencesKeyphrase])}"
            kw_model = KeyBERT(model="all-MiniLM-L6-v2")
            vectorizer = KeyphraseCountVectorizer()
            keywordsOut = kw_model.extract_keywords(doc, stop_words="english", top_n=numKeywordsToExtract, vectorizer=vectorizer, use_mmr=True)
            keyBERTKeywords = [x[0] for x in keywordsOut]
            for entry in keyBERTKeywords:
                print(entry)

    keywordString = ""
    if userGivenKeyphrases != "":
        keywordString = userGivenKeyphrases
    elif not tooShort:
        keywordString = ', '.join(keyBERTKeywords)
    prompt = "What is the purpose of studying " + keywordString + "? Comment on areas of application."
    if keywordString != "":
        formatted_prompt = (
            f"A chat between a curious human and an artificial intelligence assistant."
            f"The assistant gives helpful, detailed, and polite answers to the user's questions.\n"
            f"### Human: {prompt} \n"
            f"### Assistant:"
        )
        inputs = tok(formatted_prompt, return_tensors="pt").to(deviceElaboration)
        outputs = model.generate(inputs=inputs.input_ids, max_new_tokens=maxTokensElaboration)
        output = tok.decode(outputs[0], skip_special_tokens=True)
        index_response = output.find("### Assistant: ") + 15
        end_response = output.rfind('.') + 1
        response = output[index_response:end_response]
    return keywordString, response
76
+
77
def plainLanguageSummary(title, abstract, maxTokensSummary, numAbstractSentencesSummary):
    """Ask the LM for a plain-language summary of the paper.

    Parameters:
        title: paper title.
        abstract: abstract text; a leading "abstract"/"abstract." marker
            (any case) is stripped before sentence tokenization.
        maxTokensSummary: max_new_tokens for the generation call.
        numAbstractSentencesSummary: how many leading abstract sentences
            (plus the title) are summarized.

    Returns:
        The summary: the model output after "### Assistant: " up to the
        last period, with a leading "Certainly!"/"Certainly," filler removed.

    NOTE(review): relies on module-level globals `tok`, `model`,
    `deviceSummary` and the `sent_tokenize` import — confirm against the
    file header.
    """
    '''
    Process Abstract (eliminate word abstract at front and put into sentences)
    '''
    # eliminate word lowercase "abstract" or "abstract." at beginning of abstract text
    if abstract.lower()[0:9] == "abstract.":
        abstract = abstract[9:]
    elif abstract.lower()[0:8] == "abstract":
        abstract = abstract[8:]
    abstractSentences = sent_tokenize(abstract)
    '''
    This is for summarization
    '''
    prompt = """
    Can you explain the main idea of what is being studied in the following paragraph for someone who is not familiar with the topic. Comment on areas of application.:
    """
    # FIX: removed the vestigial `text = ""` followed by `if text == "":` —
    # the condition was always true, so the document is built unconditionally.
    numAbstractSentencesSummary = min(len(abstractSentences), numAbstractSentencesSummary)
    text = f"{title}. {' '.join(abstractSentences[:numAbstractSentencesSummary])}"

    formatted_prompt = (
        f"A chat between a curious human and an artificial intelligence assistant."
        f"The assistant gives helpful, detailed, and polite answers to the user's questions.\n"
        f"### Human: {prompt + text} \n"
        f"### Assistant:"
    )
    inputs = tok(formatted_prompt, return_tensors="pt").to(deviceSummary)
    outputs = model.generate(inputs=inputs.input_ids, max_new_tokens=maxTokensSummary)
    output = tok.decode(outputs[0], skip_special_tokens=True)
    index_response = output.find("### Assistant: ") + 15
    # Skip a leading "Certainly!" / "Certainly," filler from the model.
    if output[index_response:index_response + 10] in ("Certainly!", "Certainly,"):
        index_response += 10
    end_response = output.rfind('.') + 1
    response = output[index_response:end_response]
    return response
114
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
 
116
def _param_slider(label, value, info, maximum=2048, step=10):
    # Helper: all parameter sliders share minimum=0 and interactive=True.
    return gr.Slider(
        label=label,
        value=value,
        minimum=0,
        maximum=maximum,
        step=step,
        interactive=True,
        info=info,
    )


# Two-column Gradio Blocks UI: inputs and parameter sliders on the left,
# generated text on the right; two buttons trigger the two pipelines.
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            titleBox = gr.Textbox(label="Title")
            abstractBox = gr.Textbox(label="Abstract")
            userKeyphrasesBox = gr.Textbox(
                label="Your keyphrases (Optional - Model will elaborate on these keyphrases without using the title or abstract)"
            )
            keyphraseButton = gr.Button("Generate Keyphrase Elaboration")
            summaryButton = gr.Button("Generate Plain Language Summary")
            with gr.Accordion(label="Parameters", open=False):
                elaborationTokens = _param_slider(
                    "Maximum Number of Elaboration Tokens", 500,
                    "Length of Keyphrase Elaboration",
                )
                summaryTokens = _param_slider(
                    "Maximum Number of Summary Tokens", 300,
                    "Length of Plain Language Summary",
                )
                keyphraseSentences = _param_slider(
                    "Number of Abstract Sentences to use for Keyphrase Extraction", 2,
                    "Default: use first two sentences of abstract.",
                    maximum=20, step=1,
                )
                summarySentences = _param_slider(
                    "Number of Abstract Sentences to use for Plain Language Summary", 2,
                    "Default: use first two sentences of abstract.",
                    maximum=20, step=1,
                )
        with gr.Column():
            keyphraseOutputs = [gr.Textbox(label="Keyphrases"), gr.Textbox(label="Keyphrase Elaboration")]
            summaryOutput = gr.Textbox(label="Plain Language Summary")

    keyphraseButton.click(
        fn=keyphraseElaboration,
        inputs=[titleBox, abstractBox, userKeyphrasesBox, elaborationTokens, keyphraseSentences],
        outputs=keyphraseOutputs,
    )
    summaryButton.click(
        fn=plainLanguageSummary,
        inputs=[titleBox, abstractBox, summaryTokens, summarySentences],
        outputs=summaryOutput,
    )

demo.launch(share=True)
169