atrytone committed on
Commit
568e115
·
1 Parent(s): e4cc097

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +203 -0
app.py ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from langchain.vectorstores import FAISS
3
+ from langchain.embeddings import HuggingFaceEmbeddings
4
+ from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig, pipeline
5
+ import textwrap
6
+ import torch
7
+
8
# Instruction prefix for the reviewer-fit prompt. `inference` appends the
# submitted abstract, then '\n Candidate Abstract: ' and the candidate text.
prompt = 'BEGINNING OF CONVERSATION: USER: \
I will provide you with two abstracts, I intend to use the author of the second to review the first. Tell me in a few words why or why not the second author is a good fit to review the first paper.\n\
Abstract To Be Reviewed: '

# Tokenizer and 8-bit quantised causal LM, both loaded from the same
# "samwit/koala-7b" checkpoint at import time.
tokenizer = LlamaTokenizer.from_pretrained("samwit/koala-7b")

base_model = LlamaForCausalLM.from_pretrained(
    "samwit/koala-7b",
    load_in_8bit=True,
    device_map='auto',
)

# Text-generation pipeline used by `inference` to produce the "Reasoning"
# column of the reviewer table.
# NOTE(review): `max_length` presumably counts prompt tokens as well as
# generated ones, so two long abstracts may leave little or no room for the
# answer -- consider `max_new_tokens`; confirm against the transformers docs.
# NOTE(review): `temperature` / `top_p` presumably only take effect when
# sampling is enabled (`do_sample=True`) -- TODO confirm.
pipe = pipeline(
    "text-generation",
    model=base_model,
    tokenizer=tokenizer,
    max_length=1024,
    temperature=0.7,
    top_p=0.95,
    repetition_penalty=1.15
)
29
+
30
+
31
def wrap_text_preserve_newlines(text, width=110):
    """Wrap ``text`` to ``width`` columns while keeping its existing newlines.

    Each newline-separated line is wrapped independently with
    ``textwrap.fill`` and the results are joined back with ``'\\n'``.

    Args:
        text: The string to wrap.
        width: Maximum line width passed to ``textwrap.fill``.

    Returns:
        The wrapped string.
    """
    return '\n'.join(
        textwrap.fill(segment, width=width) for segment in text.split('\n')
    )
39
+
40
+
41
def create_miread_embed(sents, bundle):
    """Embed ``sents`` on the CPU using a (tokenizer, model) bundle.

    Args:
        sents: Input passed straight to the tokenizer.
        bundle: Indexable pair; ``bundle[0]`` is the tokenizer callable and
            ``bundle[1]`` is a model exposing ``.cpu()`` and ``.bert(...)``.

    Returns:
        ``last_hidden_state[:, 0, :]`` of the ``model.bert`` output (the
        first position of every sequence), moved to the CPU.
    """
    tokenizer, model = bundle[0], bundle[1]
    model.cpu()
    encoded = tokenizer(
        sents,
        max_length=512,
        padding=True,
        truncation=True,
        return_tensors="pt",
    ).to(torch.device('cpu'))
    # Inference only: no gradients are needed for the forward pass.
    with torch.no_grad():
        first_token_states = model.bert(**encoded).last_hidden_state[:, 0, :]
    return first_token_states.cpu()
57
+
58
+
59
def get_matches(query, k):
    """Return the ``k`` nearest entries of the module-level ``vecdb`` to ``query``.

    Args:
        query: Text to search for.
        k: Number of results requested from the vector store.

    Returns:
        Whatever ``vecdb.similarity_search_with_score`` returns; ``inference``
        consumes it as a sequence of ``(document, score)`` pairs.
    """
    return vecdb.similarity_search_with_score(query, k=k)
62
+
63
+
64
def inference(query, k=30):
    """Find the closest indexed abstracts to ``query`` and build the result tables.

    For every match this fills the author table, the abstract table and the
    reviewer table (which adds an LLM-generated justification), and it sums
    the normalised scores per journal for the journal table.

    Args:
        query: The submitted abstract (optionally ``title[SEP]abstract``).
        k: Number of nearest matches to consider. Coerced to ``int`` because
            it arrives from a UI slider.

    Returns:
        A list ``[a_output, j_output, n_output, r_output]`` of
        ``gr.Dataframe.update`` objects for the abstract, journal, author and
        reviewer tables, in the order the click handler expects.
    """
    matches = get_matches(query, int(k))
    if not matches:
        # Nothing retrieved: show four empty tables instead of crashing in
        # min()/max() below.
        return [gr.Dataframe.update(value=[], visible=True) for _ in range(4)]

    # Raw scores, rounded once and reused for both the bounds and the rows.
    raw_scores = [round(match[1].item(), 3) for match in matches]
    min_score = min(raw_scores)
    max_score = max(raw_scores)

    def normaliser(x):
        # The best (smallest) raw score maps to 1; larger raw scores map lower.
        if max_score == 0:
            # Guard the division: with a zero maximum the original formula
            # would raise ZeroDivisionError. Presumably all scores are then
            # zero, so every match gets the top score.
            return 1.0
        return round(1 - (x - min_score) / max_score, 3)

    j_bucket = {}
    n_table = []
    a_table = []
    r_table = []
    for rank, (match, raw) in enumerate(zip(matches, raw_scores), start=1):
        doc = match[0]
        score = normaliser(raw)
        title = doc.metadata['title']
        # NOTE(review): eval() executes whatever is stored in the index
        # metadata. If the index can ever hold untrusted data, switch to
        # ast.literal_eval.
        author = eval(doc.metadata['authors'])[0]
        date = doc.metadata.get('date', 'None')
        link = doc.metadata.get('link', 'None')
        submitter = doc.metadata.get('submitter', 'None')
        journal = doc.metadata.get('journal', 'None')

        # For journals: accumulate the normalised score per journal.
        j_bucket[journal] = j_bucket.get(journal, 0) + score

        # For authors
        n_table.append([rank, score, author, title, link, date])

        # For abstracts
        a_table.append(
            [rank, title, author, submitter, journal, date, link, score]
        )

        # For reviewer. `candidate` was previously undefined (NameError on
        # every call); use the matched document's text, presumably its
        # abstract.
        candidate = doc.page_content
        # NOTE(review): 'generated_text' presumably still contains the prompt
        # itself -- consider return_full_text=False; TODO confirm.
        output = pipe(
            prompt + query + '\n Candidate Abstract: ' + candidate + '\n'
        )
        r_table.append([
            rank,
            score,
            author,
            title,
            output[0]['generated_text'],
            link,
            date,
        ])

    # Journals ranked by accumulated score, highest first.
    ranked_journals = sorted(
        j_bucket.items(), key=lambda item: item[1], reverse=True
    )
    j_table = [
        [pos, name, total]
        for pos, (name, total) in enumerate(ranked_journals, start=1)
    ]

    j_output = gr.Dataframe.update(value=j_table, visible=True)
    n_output = gr.Dataframe.update(value=n_table, visible=True)
    a_output = gr.Dataframe.update(value=a_table, visible=True)
    r_output = gr.Dataframe.update(value=r_table, visible=True)

    return [a_output, j_output, n_output, r_output]
132
+
133
+
134
+
135
# Sentence-embedding model used to encode queries for the FAISS index; it
# runs on the CPU and its embeddings are left un-normalised.
model_name = "biodatlab/MIReAD-Neuro"
model_kwargs = dict(device='cpu')
encode_kwargs = dict(normalize_embeddings=False)
faiss_embedder = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

# Vector store loaded from the local "faiss_index" directory; queried by
# `get_matches`.
vecdb = FAISS.load_local("faiss_index", faiss_embedder)
145
+
146
+
147
# Gradio UI: one abstract text box, a slider for the number of matches, and
# four result tabs that `inference` fills when the button is clicked.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# NBDT Recommendation Engine for Editors")
    # Sentences are joined with explicit spaces so the rendered text does not
    # depend on the source file's indentation.
    gr.Markdown(
        "NBDT Recommendation Engine for Editors is a tool for neuroscience "
        "authors/abstracts/journals recommendation built for NBDT journal editors. "
        "It aims to help an editor to find similar reviewers, abstracts, and journals to a given submitted abstract. "
        "To find a recommendation, paste a `title[SEP]abstract` or `abstract` in the text box below and click \"Find Matches\". "
        "Then, you can hover to authors/abstracts/journals tab to find a suggested list. "
        "The data in our current demo is selected from 2018 to 2022. We will update the data monthly for an up-to-date publications."
    )

    abst = gr.Textbox(label="Abstract", lines=10)

    k = gr.Slider(1, 100, step=1, value=50, label="Number of matches to consider")

    action_btn = gr.Button(value="Find Matches")

    # Result tables start hidden; `inference` returns updates that make them
    # visible.
    with gr.Tab("Authors"):
        n_output = gr.Dataframe(
            headers=['No.', 'Score', 'Name', 'Title', 'Link', 'Date'],
            datatype=['number', 'number', 'str', 'str', 'str', 'str'],
            col_count=(6, "fixed"),
            wrap=True,
            visible=False
        )
    with gr.Tab("Abstracts"):
        a_output = gr.Dataframe(
            headers=['No.', 'Title', 'Author', 'Corresponding Author', 'Journal', 'Date', 'Link', 'Score'],
            datatype=['number', 'str', 'str', 'str', 'str', 'str', 'str', 'number'],
            col_count=(8, "fixed"),
            wrap=True,
            visible=False
        )
    with gr.Tab("Journals"):
        j_output = gr.Dataframe(
            headers=['No.', 'Name', 'Score'],
            datatype=['number', 'str', 'number'],
            col_count=(3, "fixed"),
            wrap=True,
            visible=False
        )
    with gr.Tab("Reviewers New"):
        r_output = gr.Dataframe(
            headers=['No.', 'Score', 'Name', 'Title', 'Reasoning', 'Link', 'Date'],
            datatype=['number', 'number', 'str', 'str', 'str', 'str', 'str'],
            col_count=(7, "fixed"),
            wrap=True,
            visible=False
        )
    # The output order must match the list returned by `inference`.
    action_btn.click(fn=inference,
                     inputs=[
                         abst,
                         k,
                     ],
                     outputs=[a_output, j_output, n_output, r_output],
                     api_name="neurojane")

demo.launch(debug=True)