Linseypass committed on
Commit
5001698
·
1 Parent(s): c2ce80d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +117 -87
app.py CHANGED
@@ -27,113 +27,143 @@ stop_token_ids = [0]
27
  print('Guanaco model loaded into memory.')
28
 
29
 
30
def generate(title, abstract):
    """Produce keyphrases, a keyphrase elaboration, and a plain-language
    summary for a paper, given its title and abstract.

    Parameters:
        title: paper title string.
        abstract: abstract text; a leading "abstract"/"abstract." marker
            (any case) is stripped before sentence tokenization.

    Returns:
        (keyword_string, responseTwo, response) where
        keyword_string -- comma-joined extracted keyphrases ("" when the
            abstract has fewer than two sentences),
        responseTwo    -- LM elaboration on those keyphrases ("" when skipped),
        response       -- plain-language summary of the title plus the first
            two abstract sentences.

    NOTE(review): relies on module-level globals `tok` (tokenizer), `m`
    (causal LM), and the `sent_tokenize` / `KeyBERT` /
    `KeyphraseCountVectorizer` imports — confirm against the file header.
    (The previous docstring claimed this wrote `sample-data.jsonl`; no file
    is ever written, so that claim has been removed.)
    """
    print("Started running.")
    # Strip a leading "abstract." / "abstract" marker (case-insensitive)
    # that often precedes pasted abstracts.
    text = abstract
    if text.lower()[0:9] == "abstract.":
        text = text[9:]
    elif text.lower()[0:8] == "abstract":
        text = text[8:]
    sentences = sent_tokenize(text)
    print("Tokenized abstract to sentences.")

    '''
    This is for summarization
    '''
    # Document = title + up to the first two abstract sentences. With one
    # or zero sentences the keyphrase step below is skipped as meaningless.
    # (Removed the redundant `newline`/`obj` dict aliasing and the dead
    # `text = doc` store from the original.)
    tooShortForKeyword = len(sentences) <= 1
    if len(sentences) > 1:
        doc = title + ". " + sentences[0] + " " + sentences[1]
    elif len(sentences) == 1:
        doc = title + ". " + sentences[0]
    else:
        doc = title
    prompt = """
    Can you explain the main idea of what is being studied in the following paragraph for someone who is not familiar with the topic. Comment on areas of application.:
    """
    formatted_prompt = (
        f"A chat between a curious human and an artificial intelligence assistant."
        f"The assistant gives helpful, detailed, and polite answers to the user's questions.\n"
        f"### Human: {prompt + doc} \n"
        f"### Assistant:"
    )
    inputs = tok(formatted_prompt, return_tensors="pt")  # .to("cuda:1")
    outputs = m.generate(inputs=inputs.input_ids, max_new_tokens=300)
    output = tok.decode(outputs[0], skip_special_tokens=True)
    index_response = output.find("### Assistant: ") + 15
    # Drop a leading "Certainly!" filler phrase if the model emitted one.
    if output[index_response:index_response + 10] == "Certainly!":
        index_response += 10
    end_response = output.rfind('.') + 1
    response = output[index_response:end_response]
    print('Plain Language Summary Created.')

    '''
    Keyphrase extraction.
    '''
    # The document is the title and first two sentences of the abstract.
    my_keywords = []
    if not tooShortForKeyword:
        kw_model = KeyBERT(model="all-MiniLM-L6-v2")
        vectorizer = KeyphraseCountVectorizer()
        top_n = 2
        keywords = kw_model.extract_keywords(doc, stop_words="english", top_n=top_n, vectorizer=vectorizer, use_mmr=True)
        # Keep only keyphrases that are not substrings of another extracted
        # keyphrase (prefer the longer, more specific phrase).
        for i in range(top_n):
            if not any(i != j and keywords[i][0] in keywords[j][0] for j in range(top_n)):
                my_keywords.append(keywords[i][0])
        for entry in my_keywords:
            print(entry)

    '''
    This is for feeding the keyphrases into Guanaco.
    '''
    responseTwo = ""
    keyword_string = ""
    if not tooShortForKeyword:
        keyword_string = ', '.join(my_keywords)
        prompt = "What is the purpose of studying " + keyword_string + "? Comment on areas of application."
        formatted_prompt = (
            f"A chat between a curious human and an artificial intelligence assistant."
            f"The assistant gives helpful, detailed, and polite answers to the user's questions.\n"
            f"### Human: {prompt} \n"
            f"### Assistant:"
        )
        inputs = tok(formatted_prompt, return_tensors="pt")  # .to("cuda:2")
        outputs = m.generate(inputs=inputs.input_ids, max_new_tokens=300)
        output = tok.decode(outputs[0], skip_special_tokens=True)
        index_response = output.find("### Assistant: ") + 15
        end_response = output.rfind('.') + 1
        responseTwo = output[index_response:end_response]
        print('Keyphrase elaboration ran.')
    return keyword_string, responseTwo, response
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
 
132
# Gradio UI: two text inputs (title, abstract) fed through `generate`,
# producing three text outputs.
input_boxes = [gr.Textbox(label="Title"), gr.Textbox(label="Abstract")]
output_boxes = [
    gr.Textbox(label="Keyphrases"),
    gr.Textbox(label="Keyphrase Elaboration"),
    gr.Textbox(label="Plain Language Summary"),
]
demo = gr.Interface(fn=generate, inputs=input_boxes, outputs=output_boxes)
demo.launch()
138
 
 
139
 
 
27
  print('Guanaco model loaded into memory.')
28
 
29
 
30
def keyphraseElaboration(title, abstract, userGivenKeyphrases, maxTokensElaboration, numAbstractSentencesKeyphrase):
    """Extract keyphrases (or accept user-supplied ones) and ask the LM to
    elaborate on them.

    Parameters:
        title: paper title.
        abstract: abstract text; a leading "abstract"/"abstract." marker
            (any case) is stripped before sentence tokenization.
        userGivenKeyphrases: user-provided keyphrase string; when non-empty,
            KeyBERT extraction is skipped and this is used verbatim.
        maxTokensElaboration: max_new_tokens for the generation call.
        numAbstractSentencesKeyphrase: how many leading abstract sentences
            (plus the title) feed the keyphrase extractor.

    Returns:
        (keywordString, response): keyphrases actually used (comma-joined;
        "" when the abstract was too short and none were given) and the
        model's elaboration ("" when generation was skipped).

    NOTE(review): relies on module-level globals `tok`, `model`,
    `deviceElaboration` and the `sent_tokenize` / `KeyBERT` /
    `KeyphraseCountVectorizer` imports — confirm against the file header.
    """
    numKeywordsToExtract = 2
    # BUG FIX: previously `response` was assigned only inside the
    # `if keywordString != "":` branch, so a short abstract with no
    # user-given keyphrases raised NameError at the return statement.
    # Initialize the outputs up front so every path returns cleanly.
    response = ""
    keyBERTKeywords = []
    # One or fewer abstract sentences -> extraction cannot give a
    # meaningful output.
    tooShort = True
    if userGivenKeyphrases == "":
        '''
        Process Abstract (eliminate word abstract at front and put into sentences)
        '''
        # eliminate word lowercase "abstract" or "abstract." at beginning of abstract text
        if abstract.lower()[0:9] == "abstract.":
            abstract = abstract[9:]
        elif abstract.lower()[0:8] == "abstract":
            abstract = abstract[8:]
        abstractSentences = sent_tokenize(abstract)
        numAbstractSentences = len(abstractSentences)
        if numAbstractSentences > 1:
            tooShort = False
            # Document = title + the first N sentences of the abstract.
            numAbstractSentencesKeyphrase = min(numAbstractSentences, numAbstractSentencesKeyphrase)
            doc = f"{title}. {' '.join(abstractSentences[:numAbstractSentencesKeyphrase])}"
            kw_model = KeyBERT(model="all-MiniLM-L6-v2")
            vectorizer = KeyphraseCountVectorizer()
            keywordsOut = kw_model.extract_keywords(doc, stop_words="english", top_n=numKeywordsToExtract, vectorizer=vectorizer, use_mmr=True)
            keyBERTKeywords = [x[0] for x in keywordsOut]
            for entry in keyBERTKeywords:
                print(entry)

    keywordString = ""
    if userGivenKeyphrases != "":
        keywordString = userGivenKeyphrases
    elif not tooShort:
        keywordString = ', '.join(keyBERTKeywords)
    prompt = "What is the purpose of studying " + keywordString + "? Comment on areas of application."
    if keywordString != "":
        formatted_prompt = (
            f"A chat between a curious human and an artificial intelligence assistant."
            f"The assistant gives helpful, detailed, and polite answers to the user's questions.\n"
            f"### Human: {prompt} \n"
            f"### Assistant:"
        )
        inputs = tok(formatted_prompt, return_tensors="pt").to(deviceElaboration)
        outputs = model.generate(inputs=inputs.input_ids, max_new_tokens=maxTokensElaboration)
        output = tok.decode(outputs[0], skip_special_tokens=True)
        index_response = output.find("### Assistant: ") + 15
        end_response = output.rfind('.') + 1
        response = output[index_response:end_response]
    return keywordString, response
76
+
77
def plainLanguageSummary(title, abstract, maxTokensSummary, numAbstractSentencesSummary):
    """Ask the LM for a plain-language summary of the paper.

    Parameters:
        title: paper title.
        abstract: abstract text; a leading "abstract"/"abstract." marker
            (any case) is stripped before sentence tokenization.
        maxTokensSummary: max_new_tokens for the generation call.
        numAbstractSentencesSummary: how many leading abstract sentences
            (plus the title) are summarized.

    Returns:
        The summary: the model output after "### Assistant: " up to the
        last period, with a leading "Certainly!"/"Certainly," filler removed.

    NOTE(review): relies on module-level globals `tok`, `model`,
    `deviceSummary` and the `sent_tokenize` import — confirm against the
    file header.
    """
    '''
    Process Abstract (eliminate word abstract at front and put into sentences)
    '''
    # eliminate word lowercase "abstract" or "abstract." at beginning of abstract text
    if abstract.lower()[0:9] == "abstract.":
        abstract = abstract[9:]
    elif abstract.lower()[0:8] == "abstract":
        abstract = abstract[8:]
    abstractSentences = sent_tokenize(abstract)
    '''
    This is for summarization
    '''
    prompt = """
    Can you explain the main idea of what is being studied in the following paragraph for someone who is not familiar with the topic. Comment on areas of application.:
    """
    # FIX: removed the vestigial `text = ""` followed by `if text == "":` —
    # the condition was always true, so the document is built unconditionally.
    numAbstractSentencesSummary = min(len(abstractSentences), numAbstractSentencesSummary)
    text = f"{title}. {' '.join(abstractSentences[:numAbstractSentencesSummary])}"

    formatted_prompt = (
        f"A chat between a curious human and an artificial intelligence assistant."
        f"The assistant gives helpful, detailed, and polite answers to the user's questions.\n"
        f"### Human: {prompt + text} \n"
        f"### Assistant:"
    )
    inputs = tok(formatted_prompt, return_tensors="pt").to(deviceSummary)
    outputs = model.generate(inputs=inputs.input_ids, max_new_tokens=maxTokensSummary)
    output = tok.decode(outputs[0], skip_special_tokens=True)
    index_response = output.find("### Assistant: ") + 15
    # Skip a leading "Certainly!" / "Certainly," filler from the model.
    if output[index_response:index_response + 10] in ("Certainly!", "Certainly,"):
        index_response += 10
    end_response = output.rfind('.') + 1
    response = output[index_response:end_response]
    return response
114
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
 
116
def _param_slider(label, value, info, maximum=2048, step=10):
    # Helper: all parameter sliders share minimum=0 and interactive=True.
    return gr.Slider(
        label=label,
        value=value,
        minimum=0,
        maximum=maximum,
        step=step,
        interactive=True,
        info=info,
    )


# Two-column Gradio Blocks UI: inputs and parameter sliders on the left,
# generated text on the right; two buttons trigger the two pipelines.
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            titleBox = gr.Textbox(label="Title")
            abstractBox = gr.Textbox(label="Abstract")
            userKeyphrasesBox = gr.Textbox(
                label="Your keyphrases (Optional - Model will elaborate on these keyphrases without using the title or abstract)"
            )
            keyphraseButton = gr.Button("Generate Keyphrase Elaboration")
            summaryButton = gr.Button("Generate Plain Language Summary")
            with gr.Accordion(label="Parameters", open=False):
                elaborationTokens = _param_slider(
                    "Maximum Number of Elaboration Tokens", 500,
                    "Length of Keyphrase Elaboration",
                )
                summaryTokens = _param_slider(
                    "Maximum Number of Summary Tokens", 300,
                    "Length of Plain Language Summary",
                )
                keyphraseSentences = _param_slider(
                    "Number of Abstract Sentences to use for Keyphrase Extraction", 2,
                    "Default: use first two sentences of abstract.",
                    maximum=20, step=1,
                )
                summarySentences = _param_slider(
                    "Number of Abstract Sentences to use for Plain Language Summary", 2,
                    "Default: use first two sentences of abstract.",
                    maximum=20, step=1,
                )
        with gr.Column():
            keyphraseOutputs = [gr.Textbox(label="Keyphrases"), gr.Textbox(label="Keyphrase Elaboration")]
            summaryOutput = gr.Textbox(label="Plain Language Summary")

    keyphraseButton.click(
        fn=keyphraseElaboration,
        inputs=[titleBox, abstractBox, userKeyphrasesBox, elaborationTokens, keyphraseSentences],
        outputs=keyphraseOutputs,
    )
    summaryButton.click(
        fn=plainLanguageSummary,
        inputs=[titleBox, abstractBox, summaryTokens, summarySentences],
        outputs=summaryOutput,
    )

demo.launch(share=True)
169