NikosKprl committed on
Commit 8f072d8 · verified · 1 Parent(s): 55b1651

Upload 2 files

Files changed (2)
  1. app.py +507 -0
  2. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,507 @@
+ import pandas as pd
+ import json
+ import numpy as np
+ import re
+ from itertools import combinations as itertools_combinations
+ import os
+ import sys
+ from SPARQLWrapper import SPARQLWrapper, JSON
+ from sentence_transformers import SentenceTransformer
+ import aiohttp
+ import asyncio
+ import streamlit as st
+ import time
+ from openai import OpenAI
+
+ model = SentenceTransformer("Lajavaness/bilingual-embedding-large", trust_remote_code=True)
+
+ # GITHUB_TOKEN must be provided in the environment (e.g. as a Space secret);
+ # os.environ[...] raises a KeyError if it is missing.
+ token = os.environ["GITHUB_TOKEN"]
+ endpoint = "https://models.inference.ai.azure.com"
+ model_name = "gpt-4o"
+
+ client = OpenAI(
+     base_url=endpoint,
+     api_key=token,
+ )
+
+
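+ # The helper functions below write candidate QIDs to qids_folder/ and the
+ # extracted label/description records to info_extraction/. A minimal sketch,
+ # assuming the repository does not already ship these directories, is to
+ # create them at startup:
+ os.makedirs("qids_folder", exist_ok=True)
+ os.makedirs("info_extraction", exist_ok=True)
+
+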
+ async def fetch_url(session, url):
+     # Scrapes the Objective Wikipedia demo page and pulls the page ids out of
+     # the JSON embedded in the returned HTML.
+     pageids_list = []
+     async with session.get(url) as response:
+         x = await response.text()
+         objective_list = x.split('"objectiveResults\\":')[-1].split(',\\"wikipediaResults\\"')[0].replace('\\\\\\"', "").replace("\\", "")
+         wikipedia_list = x.split(',\\"wikipediaResults\\":')[-1].split(',\\"data-sentry-element\\"')[0].replace('\\\\\\"', "").replace("\\", "")
+         data_1 = json.loads(objective_list)
+         data_2 = json.loads(wikipedia_list)
+         for i in data_1:
+             pageids_list.append(i.get("page_id"))
+         for i in data_2:
+             pageids_list.append(i.get("pageid"))
+         print(pageids_list)
+         return pageids_list
+
+
+ async def fetch_json(url, session):
+     async with session.get(url) as response:
+         return await response.json()
+
+ async def combination_method(name, session):
+     async with aiohttp.ClientSession() as session:
+         data = set()
+         new_name = name.replace("+", " ").split()
+         x = itertools_combinations(new_name, 2)
+         for i in x:
+             new_word = (i[0] + " " + i[1]).replace(" ", "+")
+             url = f"https://www.objective.inc/demos/wikipedia?query={new_word}"
+             page_source = await fetch_url(session, url)
+             for i in page_source:
+                 data.add(i)
+         return data
+
+ async def single_method(name, session):
+     async with aiohttp.ClientSession() as session:
+         data = set()
+         new_name = name.replace("+", " ").replace("-", " ").replace("/", " ").split()
+         for i in new_name:
+             new_word = i.replace(" ", "+")
+             url = f"https://www.objective.inc/demos/wikipedia?query={new_word}"
+             page_source = await fetch_url(session, url)
+             for i in page_source:
+                 data.add(i)
+         return data
+
+ async def mains(name, single, combi):
+     data = set()
+     disam_data = set()
+     qids = set()
+
+     async with aiohttp.ClientSession() as session:
+         url = f"https://www.objective.inc/demos/wikipedia?query={name}"
+         page_source = await fetch_url(session, url)
+         for i in page_source:
+             data.add(i)
+
+         wikipedia_url = f"https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch={name}&srlimit=1&srprop=&srenablerewrites=True&srinfo=suggestion&format=json"
+         json_data = await fetch_json(wikipedia_url, session)
+         suggestion = json_data.get('query', {}).get('searchinfo', {}).get('suggestion')
+
+         if suggestion:
+             suggested_url = f"https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch={suggestion}&srlimit=10&srprop=&srenablerewrites=True&srinfo=suggestion&format=json"
+             json_suggestion = await fetch_json(suggested_url, session)
+             results = json_suggestion.get('query', {}).get('search')
+             for i in results:
+                 data.add(int(i.get('pageid')))
+
+         # Handle disambiguation links
+         if data != {0}:
+             for ids in data:
+                 titles = set()
+                 wikipedia_disambiguation = f"https://en.wikipedia.org/w/api.php?action=query&generator=links&format=json&redirects=1&pageids={ids}&prop=pageprops&gpllimit=50&ppprop=wikibase_item"
+                 json_id = await fetch_json(wikipedia_disambiguation, session)
+                 try:
+                     title = json_id.get('query').get('pages')
+                     for k, v in title.items():
+                         titles.add(v.get("title"))
+                 except Exception:
+                     pass
+
+                 if "Help:Disambiguation" in titles:
+                     for i in titles:
+                         if ":" not in i:
+                             wikipedia_disamb = f"https://en.wikipedia.org/w/api.php?action=query&format=json&titles={i}&indexpageids"
+                             json_id = await fetch_json(wikipedia_disamb, session)
+                             real_title = json_id.get('query').get('pageids')
+                             disam_data.add(int(real_title[0]))
+                 else:
+                     disam_data.add(ids)
+
+         # Makes combinations of the name
+         if combi == "Yes":
+             if len(name.replace("+", " ").replace("-", " ").split()) >= 3:
+                 combination_names = await combination_method(name, session)
+                 for i in combination_names:
+                     disam_data.add(i)
+
+         # Checks every word alone
+         if single == "Yes":
+             if len(name.replace("+", " ").replace("-", " ").replace("/", " ").split()) >= 2:
+                 singles = await single_method(name, session)
+                 for i in singles:
+                     disam_data.add(i)
+
+         for ids in disam_data:
+             try:
+                 wikibase_url = f"https://en.wikipedia.org/w/api.php?action=query&pageids={ids}&prop=pageprops&format=json"
+                 json_qid = await fetch_json(wikibase_url, session)
+                 wikidata_qid = json_qid.get('query', {}).get('pages', {}).get(str(ids), {}).get('pageprops', {}).get('wikibase_item', {})
+                 if wikidata_qid:
+                     qids.add(wikidata_qid)
+             except Exception:
+                 pass
+
+     # Save QIDs to file
+     with open(f"qids_folder/{name}.json", "w") as f:
+         json.dump(list(qids), f)
+
+
+ async def get_results(query):
+     user_agent = "WDQS-example Python/%s.%s" % (sys.version_info[0], sys.version_info[1])
+     url = "https://query.wikidata.org/sparql"
+     sparql = SPARQLWrapper(url, agent=user_agent)
+     sparql.setQuery(query)
+     sparql.setReturnFormat(JSON)
+     return sparql.query().convert()
+
+ def get_resultss(query):
+     user_agent = "WDQS-example Python/%s.%s" % (sys.version_info[0], sys.version_info[1])
+     url = "https://query.wikidata.org/sparql"
+     sparql = SPARQLWrapper(url, agent=user_agent)
+     sparql.setQuery(query)
+     sparql.setReturnFormat(JSON)
+     return sparql.query().convert()
+
+
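+ # Note: get_results() and get_resultss() are identical except that the first is
+ # declared async; both run a blocking SPARQL request. A minimal alternative
+ # (a sketch, not used below) would off-load the blocking call, e.g.:
+ #     results = await asyncio.to_thread(get_resultss, query)
+
+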
+ def cleaner(text):
+     text = text.replace('\\', '').replace('\n', ' ')
+     text = re.sub(r'\{.*?\}', '', text)
+     text = re.sub(' +', ' ', text).strip()
+     return text
+
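+ # Illustrative example (input made up for documentation) of what cleaner() does:
+ #     cleaner("Lake Baikal {pronunciation} is a rift lake\nin Russia.")
+ #     -> "Lake Baikal is a rift lake in Russia."
+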
+ async def retriever(qid):
+     async with aiohttp.ClientSession() as session:
+         list_with_sent = []
+
+         # English label of the entity
+         query_label = f"""SELECT ?subjectLabel
+         WHERE {{
+             wd:{qid} rdfs:label ?subjectLabel .
+             FILTER(LANG(?subjectLabel) = "en")
+         }}
+         """
+
+         results = await get_results(query_label)
+
+         label = None
+         if results["results"]["bindings"]:
+             for result in results["results"]["bindings"]:
+                 for key, value in result.items():
+                     label = value.get("value", {}).lower()  # Get label and convert to lower case
+
+         # English aliases (alternative labels)
+         query_alias = f"""SELECT ?alias
+         WHERE {{
+             wd:{qid} skos:altLabel ?alias
+             FILTER(LANG(?alias) = "en")
+         }}
+         """
+
+         alias_list = []
+         results = await get_results(query_alias)
+
+         for result in results["results"]["bindings"]:
+             for key, value in result.items():
+                 alias = value.get("value", "None")
+                 alias_list.append(alias)
+
+         # English Wikipedia article (sitelink) describing the entity
+         query_desci = f"""SELECT ?subjectLabel
+         WHERE {{
+             ?subjectLabel schema:about wd:{qid} ;
+                           schema:inLanguage "en" ;
+                           schema:isPartOf <https://en.wikipedia.org/> .
+         }}
+         """
+
+         results = await get_results(query_desci)
+         cleaned_first_para = "None"
+
+         if results["results"]["bindings"]:
+             for result in results["results"]["bindings"]:
+                 for key, value in result.items():
+                     desc = value.get("value", "None")
+
+             # Fetch the first two sentences of the Wikipedia article as the description
+             title = desc.split("/wiki/")[1]
+
+             url = f"https://en.wikipedia.org/w/api.php?action=query&prop=extracts&titles={title}&exintro=&exsentences=2&explaintext=&redirects=&formatversion=2&format=json"
+
+             json_data = await fetch_json(url, session)
+             cleaned_first_para = cleaner(json_data.get('query', {}).get('pages', [{}])[0].get('extract', 'None'))
+         else:
+             # Fall back to the Wikidata description when no Wikipedia article exists
+             query_desc = f"""SELECT ?subjectLabel
+             WHERE {{
+                 wd:{qid} schema:description ?subjectLabel .
+                 FILTER(LANG(?subjectLabel) = "en")
+             }}
+             """
+
+             results = await get_results(query_desc)
+             if results["results"]["bindings"]:
+                 for result in results["results"]["bindings"]:
+                     for key, value in result.items():
+                         cleaned_first_para = value.get("value", "None")
+
+         list_with_sent.append({"qid": qid, "label": label, "description": cleaned_first_para})
+
+         if alias_list:
+             for alias in alias_list:
+                 list_with_sent.append({"qid": qid, "label": alias.lower(), "description": cleaned_first_para})
+
+         return list_with_sent
+
+ # Main async function to handle multiple QIDs with batching
+ async def main(name):
+     with open(f"qids_folder/{name}.json", "r") as f:
+         final_list = []
+         qids = json.load(f)
+         for q in qids:
+             returned_list = await retriever(q)
+             if returned_list:
+                 final_list.extend(returned_list)
+
+         with open(f"info_extraction/{name}.json", "w", encoding="utf-8") as flast:
+             json.dump(final_list, flast)
+
+ def check_sentence(sentence):
+     two_consecutive_uppercase = r"[A-Z]{2}"
+     uppercase_followed_by_fullstop = r"[A-Z]\."
+
+     if re.search(two_consecutive_uppercase, sentence):
+         return True
+
+     if re.search(uppercase_followed_by_fullstop, sentence):
+         return True
+
+     return False
+
+ # Note: the two paths below are not referenced anywhere else in this file.
+ chrome_driver_path = "chromedriver.exe"
+ chrome_path = r'"C:\Program Files\Google\Chrome\Application\chrome.exe"'
+
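+ # check_sentence() flags strings that look like acronyms or abbreviations, e.g.
+ #     check_sentence("DIC microscopy")   -> True   (two consecutive capitals)
+ #     check_sentence("phase microscopy") -> False
+ # It is defined here but not called elsewhere in this file.
+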
+ def main_cli():
+     st.title("✨ Entity Linking Application ✨")
+     st.caption("This Web Application is part of my master's dissertation.")
+
+
+     input_sentence_user = st.text_input("Enter the sentence:", "")
+     input_mention_user = st.text_input("Enter the mention:", "")
+     single = st.selectbox("Search each word individually?", ['Yes', 'No'], index=1)
+     combi = st.selectbox("Make combinations of each word?", ['Yes', 'No'], index=1)
+     disambi = st.selectbox("Run acronym disambiguation? (Enable it if the mention is nested)", ['Yes', 'No'], index=0)
+
+
+     if st.button("Run Entity Linking"):
+         start_time = time.time()  # started here so the timing readout below is always defined
+         if input_sentence_user and input_mention_user:
+             # Check if the mention is in the sentence
+             if input_mention_user in input_sentence_user:
+                 st.write("Applying Data Normalization module... (1/5)")
+                 # Data Normalization
+
+                 list_with_full_names = []
+                 list_with_names_to_show = []
+
+                 if disambi == "Yes":
+                     # Ask the LLM to expand acronyms and split nested labels
+                     response = client.chat.completions.create(
+                         messages=[
+                             {
+                                 "role": "system",
+                                 "content": """
+ I will give you one or more labels within a sentence. Your task is as follows:
+
+ Identify each label in the sentence, and check if it is an acronym.
+
+ If the label is an acronym, respond with the full name of the acronym.
+ If the label is not an acronym, respond with the label exactly as it was given to you.
+ If a label contains multiple terms (e.g., 'phase and DIC microscopy'), treat each term within the label as a separate label.
+
+ This means you should identify and explain each part of the label individually.
+ Each part should be on its own line in the response.
+ Context-Specific Terms: If the sentence context suggests a relevant term that applies to each label (such as "study" in 'morphological, sedimentological, and stratigraphical study'), add that term to each label’s explanation.
+
+ Use context clues to determine the appropriate term to add (e.g., 'study' or 'microscopy').
+ Output Format: Your response should contain only the explanations, formatted as follows:
+
+ Each label or part of a label should be on a new line.
+ Do not include any additional text, and do not repeat the original sentence.
+ Example 1:
+
+ Input:
+
+ label: phase and DIC microscopy
+ context: Tardigrades have been extracted from samples using centrifugation with Ludox AM™ and mounted on individual microscope slides in Hoyer's medium for identification under phase and DIC microscopy.
+ Expected response:
+
+ phase: phase microscopy
+ DIC microscopy: Differential interference contrast microscopy
+ Example 2:
+
+ Input:
+
+ label: morphological, sedimentological, and stratigraphical study
+ context: This paper presents results of a morphological, sedimentological, and stratigraphical study of relict beach ridges formed on a prograded coastal barrier in Bream Bay, North Island New Zealand.
+ Expected response:
+
+ morphological: morphological study
+ sedimentological: sedimentological study
+ stratigraphical: stratigraphical study
+ IMPORTANT:
+
+ Each label, even if nested within another, should be treated as an individual item.
+ Each individual label or acronym should be output on a separate line.
+ """
+                             },
+                             {
+                                 "role": "user",
+                                 "content": f"label:{input_mention_user}, context:{input_sentence_user}"
+                             }
+                         ],
+                         temperature=1.0,
+                         top_p=1.0,
+                         max_tokens=1000,
+                         model=model_name
+                     )
+
+                     print(response.choices[0].message.content)
+
+                     kati = response.choices[0].message.content.splitlines()
+
+                     for i in kati:
+                         context = i.split(":")[-1].strip()
+                         original_name = i.split(":")[0].strip()
+                         list_with_full_names.append(context)
+                         list_with_names_to_show.append(original_name)
+
+                     name = ",".join(list_with_full_names)
+
+                 else:
+                     name = input_mention_user
+                     list_with_full_names.append(name)
+                     list_with_names_to_show.append(name)
+
+                 input_sentence_user = input_sentence_user.replace(input_mention_user, name)  # Changing the mention to the correct one
+
+                 # Ask the LLM for a short, Wikipedia-style description of each label
+                 response = client.chat.completions.create(
+                     messages=[
+                         {
+                             "role": "system",
+                             "content": "Given a label or labels within a sentence, provide a brief description (2-3 sentences) explaining what the label represents, similar to how a Wikipedia entry would. Format your response as follows: label: description. I want only the description of the label, not the role in the context. Include the label in the description as well. For example: Sentiment analysis: Sentiment analysis is the use of natural language processing, text analysis, computational linguistics, and biometrics to systematically identify, extract, quantify, and study affective states and subjective information.\nText analysis: Text mining, text data mining (TDM) or text analytics is the process of deriving high-quality information from text. It involves the discovery by computer of new, previously unknown information, by automatically extracting information from different written resources.",
+                         },
+                         {
+                             "role": "user",
+                             "content": f"label:{name}, context:{input_sentence_user}"
+                         }
+                     ],
+                     temperature=1.0,
+                     top_p=1.0,
+                     max_tokens=1000,
+                     model=model_name
+                 )
+
+                 print(response.choices[0].message.content)
+
+                 z = response.choices[0].message.content.splitlines()
+                 list_with_contexts = []
+                 for i in z:
+                     context = i.split(":")[-1].strip()
+                     list_with_contexts.append(context)
+
+                 # Candidate Generation & Information Extraction
+                 async def big_main(mention, single, combi):
+                     mention = mention.split(",")
+                     st.write("Applying Candidate Generation module... (2/5)")
+                     for i in mention:
+                         await mains(i, single, combi)
+                     st.write("Applying Information Extraction module... (3/5)")
+                     for i in mention:
+                         await main(i)
+
+                 asyncio.run(big_main(name, single, combi))
+
+                 number = 0
+                 for i, j, o in zip(list_with_full_names, list_with_contexts, list_with_names_to_show):
+                     number += 1
+                     st.write(f"Applying Candidate Selection module... (4/5) [{number}/{len(list_with_full_names)}]")
+                     with open(f"info_extraction/{i}.json", "r") as f:
+                         json_file = json.load(f)
+                         lista = []
+                         lista_1 = []
+                         for element in json_file:
+                             qid = element.get("qid")
+                             link = f"https://www.wikidata.org/wiki/{qid}"
+                             label = element.get("label")
+                             description = element.get("description")
+
+                             label_emb = model.encode([label])
+                             desc_emb = model.encode([description])
+
+                             lista.append({link: [label_emb, desc_emb]})
+
+                         label_dataset_emb = model.encode([i])
+                         desc_dataset_emb = model.encode([j])
+
+                         # Rank candidates by the mean cosine similarity of label and description embeddings
+                         for emb in lista:
+                             for k, v in emb.items():
+                                 cossim_label = model.similarity(label_dataset_emb, v[0][0])
+                                 desc_label = model.similarity(desc_dataset_emb, v[1][0])
+                                 emb_mean = np.mean([cossim_label, desc_label])
+                                 lista_1.append({k: emb_mean})
+
+                         sorted_data = sorted(lista_1, key=lambda x: list(x.values())[0], reverse=True)
+                         st.write(f"Applying Entity Linking module... (5/5) [{number}/{len(list_with_full_names)}]")
+                         if sorted_data:
+                             sorted_top = sorted_data[0]
+                             for k, v in sorted_top.items():
+                                 qid = k.split("/")[-1]
+
+                                 wikidata2wikipedia = f"""
+                                 SELECT ?wikipedia
+                                 WHERE {{
+                                     ?wikipedia schema:about wd:{qid} .
+                                     ?wikipedia schema:isPartOf <https://en.wikipedia.org/> .
+                                 }}
+                                 """
+                                 results = get_resultss(wikidata2wikipedia)
+
+                                 wikipedia = "None"  # fallback if the query returns no binding
+                                 for result in results["results"]["bindings"]:
+                                     for key, value in result.items():
+                                         wikipedia = value.get("value", "None")
+
+                                 sparql = SPARQLWrapper("http://dbpedia.org/sparql")
+                                 wikidata2dbpedia = f"""
+                                 SELECT ?dbpedia
+                                 WHERE {{
+                                     ?dbpedia owl:sameAs <http://www.wikidata.org/entity/{qid}>.
+                                 }}
+                                 """
+                                 sparql.setQuery(wikidata2dbpedia)
+                                 sparql.setReturnFormat(JSON)
+                                 results = sparql.query().convert()
+                                 dbpedia = "None"  # fallback if DBpedia has no owl:sameAs link
+                                 for result in results["results"]["bindings"]:
+                                     dbpedia = result["dbpedia"]["value"]
+
+                                 st.text(f"The correct entity for '{o}' is:")
+                                 st.success(f"Wikipedia: {wikipedia}")
+                                 st.success(f"Wikidata: {k}")
+                                 st.success(f"DBpedia: {dbpedia}")
+                         else:
+                             st.warning(f"The entity: {o} is NIL.")
+             else:
+                 st.warning(f"The mention '{input_mention_user}' was NOT found in the sentence.")
+         else:
+             st.warning("Please fill in both fields.")
+
+         end_time = time.time()
+         execution_time = end_time - start_time
+         ETA = time.strftime("%H:%M:%S", time.gmtime(execution_time))
+         st.write(f"Execution time: {ETA}")
+
+         folder_path = "qids_folder"
+         for filename in os.listdir(folder_path):
+             file_path = os.path.join(folder_path, filename)
+             os.remove(file_path)
+
+         folder_path_1 = "info_extraction"
+         for filename in os.listdir(folder_path_1):
+             file_path = os.path.join(folder_path_1, filename)
+             os.remove(file_path)
+
+ if __name__ == "__main__":
+     main_cli()
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ SPARQLWrapper
+ sentence_transformers
+ aiohttp
+ asyncio
+ openai
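Note: asyncio is part of the Python standard library, so it does not need to be installed from PyPI. app.py also imports streamlit, pandas and numpy; assuming the runtime image does not already provide them, a fuller requirements.txt would additionally list:

streamlit
pandas
numpy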