NikosKprl committed
Commit 9d37057 · verified · 1 Parent(s): 16f867f

Update app.py

Files changed (1)
  1. app.py +510 -506
app.py CHANGED
@@ -1,507 +1,511 @@
1
- import pandas as pd
2
- import json
3
- import numpy as np
4
- import re
5
- from itertools import combinations as itertools_combinations
6
- import os
7
- import sys
8
- from SPARQLWrapper import SPARQLWrapper, JSON
9
- from sentence_transformers import SentenceTransformer
10
- import aiohttp
11
- import asyncio
12
- import streamlit as st
13
- import time
14
- from openai import OpenAI
15
- import sys
16
-
17
- model = SentenceTransformer("Lajavaness/bilingual-embedding-large", trust_remote_code=True)
18
-
19
- token = os.environ["GITHUB_TOKEN"]
20
- endpoint = "https://models.inference.ai.azure.com"
21
- model_name = "gpt-4o"
22
-
23
- client = OpenAI(
24
- base_url=endpoint,
25
- api_key=token,
26
- )
27
-
28
-
29
- async def fetch_url(session, url):
30
- pageids_list = []
31
- async with session.get(url) as response:
32
- x = await response.text()
33
- objective_list = x.split('"objectiveResults\\":')[-1].split(',\\"wikipediaResults\\"')[0].replace('\\\\\\"', "").replace("\\", "")
34
- wikipedia_list = x.split(',\\"wikipediaResults\\":')[-1].split(',\\"data-sentry-element\\"')[0].replace('\\\\\\"', "").replace("\\", "")
35
- data_1 = json.loads(objective_list)
36
- data_2 = json.loads(wikipedia_list)
37
- for i in data_1:
38
- pageids_list.append(i.get("page_id"))
39
- for i in data_2:
40
- pageids_list.append(i.get("pageid"))
41
- print(pageids_list)
42
- return pageids_list
43
-
44
-
45
- async def fetch_json(url, session):
46
- async with session.get(url) as response:
47
- return await response.json()
48
-
49
- async def combination_method(name, session):
50
- async with aiohttp.ClientSession() as session:
51
- data = set()
52
- new_name = name.replace("+", " ").split()
53
- x = itertools_combinations(new_name, 2)
54
- for i in x:
55
- new_word = (i[0] + " " + i[1]).replace(" ", "+")
56
- url = f"https://www.objective.inc/demos/wikipedia?query={new_word}"
57
- page_source = await fetch_url(session, url)
58
- for i in page_source:
59
- data.add(i)
60
- return data
61
-
62
- async def single_method(name, session):
63
- async with aiohttp.ClientSession() as session:
64
- data = set()
65
- new_name = name.replace("+", " ").replace("-", " ").replace("/", " ").split()
66
- for i in new_name:
67
- new_word = i.replace(" ", "+")
68
- url = f"https://www.objective.inc/demos/wikipedia?query={new_word}"
69
- page_source = await fetch_url(session, url)
70
- for i in page_source:
71
- data.add(i)
72
- return data
73
-
74
- async def mains(name, single, combi):
75
- data = set()
76
- disam_data = set()
77
- qids = set()
78
-
79
- async with aiohttp.ClientSession() as session:
80
- url = f"https://www.objective.inc/demos/wikipedia?query={name}"
81
- page_source = await fetch_url(session, url)
82
- for i in page_source:
83
- data.add(i)
84
-
85
- wikipedia_url = f"https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch={name}&srlimit=1&srprop=&srenablerewrites=True&srinfo=suggestion&format=json"
86
- json_data = await fetch_json(wikipedia_url, session)
87
- suggestion = json_data.get('query', {}).get('searchinfo', {}).get('suggestion')
88
-
89
- if suggestion:
90
- suggested_url = f"https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch={suggestion}&srlimit=10&srprop=&srenablerewrites=True&srinfo=suggestion&format=json"
91
- json_suggestion = await fetch_json(suggested_url, session)
92
- results = json_suggestion.get('query', {}).get('search')
93
- for i in results:
94
- data.add(int(i.get('pageid')))
95
-
96
- # Handle disambiguation links
97
- if data != {0}:
98
- for ids in data:
99
- titles = set()
100
- wikipedia_disambiguation = f"https://en.wikipedia.org/w/api.php?action=query&generator=links&format=json&redirects=1&pageids={ids}&prop=pageprops&gpllimit=50&ppprop=wikibase_item"
101
- json_id = await fetch_json(wikipedia_disambiguation, session)
102
- try:
103
- title = json_id.get('query').get('pages')
104
- for k, v in title.items():
105
- titles.add(v.get("title"))
106
- except:
107
- pass
108
-
109
- if "Help:Disambiguation" in titles:
110
- for i in titles:
111
- if ":" not in i:
112
- wikipedia_disamb = f"https://en.wikipedia.org/w/api.php?action=query&format=json&titles={i}&indexpageids"
113
- json_id = await fetch_json(wikipedia_disamb, session)
114
- real_title = json_id.get('query').get('pageids')
115
- disam_data.add(int(real_title[0]))
116
- else:
117
- disam_data.add(ids)
118
-
119
- # Makes combinations of the name
120
- if combi == "Yes":
121
- if len(name.replace("+", " ").replace("-", " ").split()) >= 3:
122
- combination_names = await combination_method(name, session)
123
- for i in combination_names:
124
- disam_data.add(i)
125
-
126
- # Checks every word alone
127
- if single == "Yes":
128
- if len(name.replace("+", " ").replace("-", " ").replace("/", " ").split()) >= 2:
129
- singles = await single_method(name, session)
130
- for i in singles:
131
- disam_data.add(i)
132
-
133
- for ids in disam_data:
134
- try:
135
- wikibase_url = f"https://en.wikipedia.org/w/api.php?action=query&pageids={ids}&prop=pageprops&format=json"
136
- json_qid = await fetch_json(wikibase_url, session)
137
- wikidata_qid = json_qid.get('query', {}).get('pages', {}).get(str(ids), {}).get('pageprops', {}).get('wikibase_item', {})
138
- if wikidata_qid:
139
- qids.add(wikidata_qid)
140
- except:
141
- pass
142
-
143
- # Save QIDs to file
144
- with open(f"qids_folder/{name}.json", "w") as f:
145
- json.dump(list(qids), f)
146
-
147
-
148
- async def get_results(query):
149
- user_agent = "WDQS-example Python/%s.%s" % (sys.version_info[0], sys.version_info[1])
150
- url = "https://query.wikidata.org/sparql"
151
- sparql = SPARQLWrapper(url, agent=user_agent)
152
- sparql.setQuery(query)
153
- sparql.setReturnFormat(JSON)
154
- return sparql.query().convert()
155
-
156
- def get_resultss(query):
157
- user_agent = "WDQS-example Python/%s.%s" % (sys.version_info[0], sys.version_info[1])
158
- url = "https://query.wikidata.org/sparql"
159
- sparql = SPARQLWrapper(url, agent=user_agent)
160
- sparql.setQuery(query)
161
- sparql.setReturnFormat(JSON)
162
- return sparql.query().convert()
163
-
164
-
165
- def cleaner(text):
166
- text = text.replace('\\', '').replace('\n', ' ')
167
- text = re.sub(r'\{.*?\}', '', text)
168
- text = re.sub(' +', ' ', text).strip()
169
- return text
170
-
171
- async def retriever(qid):
172
- async with aiohttp.ClientSession() as session:
173
- list_with_sent = []
174
-
175
- query_label = f"""SELECT ?subjectLabel
176
- WHERE {{
177
- wd:{qid} rdfs:label ?subjectLabel .
178
- FILTER(LANG(?subjectLabel) = "en")
179
- }}
180
- """
181
-
182
- results = await get_results(query_label)
183
-
184
- label = None
185
- if results["results"]["bindings"]:
186
- for result in results["results"]["bindings"]:
187
- for key, value in result.items():
188
- label = value.get("value", {}).lower() # Get label and convert to lower case
189
-
190
- query_alias = f"""SELECT ?alias
191
- WHERE {{
192
- wd:{qid} skos:altLabel ?alias
193
- FILTER(LANG(?alias) = "en")
194
- }}
195
- """
196
-
197
- alias_list = []
198
- results = await get_results(query_alias)
199
-
200
- for result in results["results"]["bindings"]:
201
- for key, value in result.items():
202
- alias = value.get("value", "None")
203
- alias_list.append(alias)
204
-
205
- query_desci = f"""SELECT ?subjectLabel
206
- WHERE {{
207
- ?subjectLabel schema:about wd:{qid} ;
208
- schema:inLanguage "en" ;
209
- schema:isPartOf <https://en.wikipedia.org/> .
210
- }}
211
- """
212
-
213
- results = await get_results(query_desci)
214
- cleaned_first_para = "None"
215
-
216
- if results["results"]["bindings"]:
217
- for result in results["results"]["bindings"]:
218
- for key, value in result.items():
219
- desc = value.get("value", "None")
220
-
221
- title = desc.split("/wiki/")[1]
222
-
223
- url = f"https://en.wikipedia.org/w/api.php?action=query&prop=extracts&titles={title}&exintro=&exsentences=2&explaintext=&redirects=&formatversion=2&format=json"
224
-
225
-
226
- json_data = await fetch_json(url, session)
227
- cleaned_first_para = cleaner(json_data.get('query', {}).get('pages', [{}])[0].get('extract', 'None'))
228
- else:
229
- query_desc = f"""SELECT ?subjectLabel
230
- WHERE {{
231
- wd:{qid} schema:description ?subjectLabel .
232
- FILTER(LANG(?subjectLabel) = "en")
233
- }}
234
- """
235
-
236
- results = await get_results(query_desc)
237
- if results["results"]["bindings"]:
238
- for result in results["results"]["bindings"]:
239
- for key, value in result.items():
240
- cleaned_first_para = value.get("value", "None")
241
-
242
- list_with_sent.append({"qid": qid, "label": label, "description": cleaned_first_para})
243
-
244
- if alias_list:
245
- for alias in alias_list:
246
- list_with_sent.append({"qid": qid, "label": alias.lower(), "description": cleaned_first_para})
247
-
248
- return list_with_sent
249
-
250
- # Main async function to handle multiple QIDs with batching
251
- async def main(name):
252
- with open(f"qids_folder/{name}.json", "r") as f:
253
- final_list = []
254
- qids = json.load(f)
255
- for q in qids:
256
- returned_list = await retriever(q)
257
- if returned_list:
258
- final_list.extend(returned_list)
259
-
260
- with open(f"info_extraction/{name}.json", "w", encoding="utf-8") as flast:
261
- json.dump(final_list, flast)
262
-
263
- def check_sentence(sentence):
264
- two_consecutive_uppercase = r"[A-Z]{2}"
265
- uppercase_followed_by_fullstop = r"[A-Z]\."
266
-
267
- if re.search(two_consecutive_uppercase, sentence):
268
- return True
269
-
270
- if re.search(uppercase_followed_by_fullstop, sentence):
271
- return True
272
-
273
- return False
274
-
275
- chrome_driver_path = "chromedriver.exe"
276
- chrome_path = r'"C:\Program Files\Google\Chrome\Application\chrome.exe"'
277
-
278
- def main_cli():
279
- st.title("✨ Entity Linking Application ✨")
280
- st.caption("This Web Application is part of my master dissertation.")
281
-
282
-
283
- input_sentence_user = st.text_input("Enter the sentence:", "")
284
- input_mention_user = st.text_input("Enter the mention:", "")
285
- single = st.selectbox("Search each word individually?", ['Yes', 'No'], index=1)
286
- combi = st.selectbox("Make combinations of each word?", ['Yes', 'No'], index=1)
287
- disambi = st.selectbox("Run acronym disambiguation? (Enable it if the mention is nested)", ['Yes', 'No'], index=0)
288
-
289
-
290
- if st.button("Run Entity Linking"):
291
- if input_sentence_user and input_mention_user:
292
- # Example logic: check if the mention is in the sentence
293
- if input_mention_user in input_sentence_user:
294
- st.write("Applying Data Normalization module... (1/5)")
295
- # Data Normalization
296
-
297
- start_time = time.time()
298
-
299
- list_with_full_names = []
300
- list_with_names_to_show = []
301
-
302
- if disambi == "Yes":
303
- response = client.chat.completions.create(
304
- messages=[
305
- {
306
- "role": "system",
307
- "content": """
308
- I will give you one or more labels within a sentence. Your task is as follows:
309
-
310
- Identify each label in the sentence, and check if it is an acronym.
311
-
312
- If the label is an acronym, respond with the full name of the acronym.
313
- If the label is not an acronym, respond with the label exactly as it was given to you.
314
- If a label contains multiple terms (e.g., 'phase and DIC microscopy'), treat each term within the label as a separate label.
315
-
316
- This means you should identify and explain each part of the label individually.
317
- Each part should be on its own line in the response.
318
- Context-Specific Terms: If the sentence context suggests a relevant term that applies to each label (such as "study" in 'morphological, sedimentological, and stratigraphical study'), add that term to each label’s explanation.
319
-
320
- Use context clues to determine the appropriate term to add (e.g., 'study' or 'microscopy').
321
- Output Format: Your response should contain only the explanations, formatted as follows:
322
-
323
- Each label or part of a label should be on a new line.
324
- Do not include any additional text, and do not repeat the original sentence.
325
- Example 1:
326
-
327
- Input:
328
-
329
- label: phase and DIC microscopy
330
- context: Tardigrades have been extracted from samples using centrifugation with Ludox AM™ and mounted on individual microscope slides in Hoyer's medium for identification under phase and DIC microscopy.
331
- Expected response:
332
-
333
- phase: phase microscopy
334
- DIC microscopy: Differential interference contrast microscopy
335
- Example 2:
336
-
337
- Input:
338
-
339
- label: morphological, sedimentological, and stratigraphical study
340
- context: This paper presents results of a morphological, sedimentological, and stratigraphical study of relict beach ridges formed on a prograded coastal barrier in Bream Bay, North Island New Zealand.
341
- Expected response:
342
-
343
- morphological: morphological study
344
- sedimentological: sedimentological study
345
- stratigraphical: stratigraphical study
346
- IMPORTANT:
347
-
348
- Each label, even if nested within another, should be treated as an individual item.
349
- Each individual label or acronym should be output on a separate line.
350
- """
351
- },
352
- {
353
- "role": "user",
354
- "content": f"label:{input_mention_user}, context:{input_sentence_user}"
355
- }
356
- ],
357
- temperature=1.0,
358
- top_p=1.0,
359
- max_tokens=1000,
360
- model=model_name
361
- )
362
-
363
- print(response.choices[0].message.content)
364
-
365
- kati = response.choices[0].message.content.splitlines()
366
-
367
- for i in kati:
368
- context = i.split(":")[-1].strip()
369
- original_name = i.split(":")[0].strip()
370
- list_with_full_names.append(context)
371
- list_with_names_to_show.append(original_name)
372
-
373
- name = ",".join(list_with_full_names)
374
-
375
- else:
376
- name = input_mention_user
377
- list_with_full_names.append(name)
378
- list_with_names_to_show.append(name)
379
-
380
- input_sentence_user = input_sentence_user.replace(input_mention_user, name) # Changing the mention to the correct one
381
-
382
- response = client.chat.completions.create(
383
- messages=[
384
- {
385
- "role": "system",
386
- "content": "Given a label or labels within a sentence, provide a brief description (2-3 sentences) explaining what the label represents, similar to how a Wikipedia entry would. Format your response as follows: label: description. I want only the description of the label, not the role in the context. Include the label in the description as well. For example: Sentiment analysis: Sentiment analysis is the use of natural language processing, text analysis, computational linguistics, and biometrics to systematically identify, extract, quantify, and study affective states and subjective information.\nText analysis: Text mining, text data mining (TDM) or text analytics is the process of deriving high-quality information from text. It involves the discovery by computer of new, previously unknown information, by automatically extracting information from different written resources.",
387
- },
388
- {
389
- "role": "user",
390
- "content": f"label:{name}, context:{input_sentence_user}"
391
- }
392
- ],
393
- temperature=1.0,
394
- top_p=1.0,
395
- max_tokens=1000,
396
- model=model_name
397
- )
398
-
399
- print(response.choices[0].message.content)
400
-
401
- z = response.choices[0].message.content.splitlines()
402
- list_with_contexts = []
403
- for i in z:
404
- context = i.split(":")[-1].strip()
405
- list_with_contexts.append(context)
406
-
407
- # Candidate Generation & Information Extraction
408
- async def big_main(mention, single, combi):
409
- mention = mention.split(",")
410
- st.write("Applying Candidate Generation module... (2/5)")
411
- for i in mention:
412
- await mains(i, single, combi)
413
- st.write("Applying Information Extraction module... (3/5)")
414
- for i in mention:
415
- await main(i)
416
-
417
- asyncio.run(big_main(name, single, combi))
418
-
419
- number = 0
420
- for i,j,o in zip(list_with_full_names,list_with_contexts,list_with_names_to_show):
421
- number += 1
422
- st.write(f"Applying Candidate Selection module... (4/5) [{number}/{len(list_with_full_names)}]")
423
- with open(f"info_extraction/{i}.json", "r") as f:
424
- json_file = json.load(f)
425
- lista = []
426
- lista_1 = []
427
- for element in json_file:
428
- qid = element.get("qid")
429
- link = f"https://www.wikidata.org/wiki/{qid}"
430
- label = element.get("label")
431
- description = element.get("description")
432
-
433
- label_emb = model.encode([label])
434
- desc_emb = model.encode([description])
435
-
436
- lista.append({link: [label_emb, desc_emb]})
437
-
438
- label_dataset_emb = model.encode([i])
439
- desc_dataset_emb = model.encode([j])
440
-
441
- for emb in lista:
442
- for k, v in emb.items():
443
- cossim_label = model.similarity(label_dataset_emb, v[0][0])
444
- desc_label = model.similarity(desc_dataset_emb, v[1][0])
445
- emb_mean = np.mean([cossim_label, desc_label])
446
- lista_1.append({k: emb_mean})
447
-
448
- sorted_data = sorted(lista_1, key=lambda x: list(x.values())[0], reverse=True)
449
- st.write(f"Applying Entity Linking module... (4/5) [{number}/{len(list_with_full_names)}]")
450
- if sorted_data:
451
- sorted_top = sorted_data[0]
452
- for k, v in sorted_top.items():
453
- qid = k.split("/")[-1]
454
-
455
- wikidata2wikipedia = f"""
456
- SELECT ?wikipedia
457
- WHERE {{
458
- ?wikipedia schema:about wd:{qid} .
459
- ?wikipedia schema:isPartOf <https://en.wikipedia.org/> .
460
- }}
461
- """
462
- results = get_resultss(wikidata2wikipedia)
463
-
464
- for result in results["results"]["bindings"]:
465
- for key, value in result.items():
466
- wikipedia = value.get("value", "None")
467
-
468
- sparql = SPARQLWrapper("http://dbpedia.org/sparql")
469
- wikidata2dbpedia = f"""
470
- SELECT ?dbpedia
471
- WHERE {{
472
- ?dbpedia owl:sameAs <http://www.wikidata.org/entity/{qid}>.
473
- }}
474
- """
475
- sparql.setQuery(wikidata2dbpedia)
476
- sparql.setReturnFormat(JSON)
477
- results = sparql.query().convert()
478
- for result in results["results"]["bindings"]:
479
- dbpedia = result["dbpedia"]["value"]
480
-
481
- st.text(f"The correct entity for '{o}' is:")
482
- st.success(f"Wikipedia: {wikipedia}")
483
- st.success(f"Wikidata: {k}")
484
- st.success(f"DBpedia: {dbpedia}")
485
- else:
486
- st.warning(f"The entity: {o} is NIL.")
487
- else:
488
- st.warning(f"The mention '{input_mention_user}' was NOT found in the sentence.")
489
- else:
490
- st.warning("Please fill in both fields.")
491
- end_time = time.time()
492
- execution_time = end_time - start_time
493
- ETA = time.strftime("%H:%M:%S", time.gmtime(execution_time))
494
- st.write(f"Execution time: {ETA}")
495
-
496
- folder_path = "qids_folder"
497
- for filename in os.listdir(folder_path):
498
- file_path = os.path.join(folder_path, filename)
499
- os.remove(file_path)
500
-
501
- folder_path_1 = "info_extraction"
502
- for filename in os.listdir(folder_path_1):
503
- file_path = os.path.join(folder_path_1, filename)
504
- os.remove(file_path)
505
-
506
- if __name__ == "__main__":
507
  main_cli()
 
1
+ import pandas as pd
2
+ import json
3
+ import numpy as np
4
+ import re
5
+ from itertools import combinations as itertools_combinations
6
+ import os
7
+ import sys
8
+ from SPARQLWrapper import SPARQLWrapper, JSON
9
+ from sentence_transformers import SentenceTransformer
10
+ import aiohttp
11
+ import asyncio
12
+ import streamlit as st
13
+ import time
14
+ from openai import OpenAI
15
+ import sys
16
+
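+ # Make sure the folder used by the Information Extraction step exists before anything is written into it.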
17
+ new_directory = "info_extraction"
18
+ if not os.path.exists(new_directory):
19
+ os.makedirs(new_directory)
20
+
21
+ model = SentenceTransformer("Lajavaness/bilingual-embedding-large", trust_remote_code=True)
22
+
23
+ token = os.environ["GITHUB_TOKEN"]
24
+ endpoint = "https://models.inference.ai.azure.com"
25
+ model_name = "gpt-4o"
26
+
27
+ client = OpenAI(
28
+ base_url=endpoint,
29
+ api_key=token,
30
+ )
31
+
32
+
33
+ async def fetch_url(session, url):
34
+ pageids_list = []
35
+ async with session.get(url) as response:
36
+ x = await response.text()
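+ # Carve the escaped "objectiveResults" and "wikipediaResults" JSON arrays out of the raw page source,
+ # strip the escaping, and collect the page ids from both result lists.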
37
+ objective_list = x.split('"objectiveResults\\":')[-1].split(',\\"wikipediaResults\\"')[0].replace('\\\\\\"', "").replace("\\", "")
38
+ wikipedia_list = x.split(',\\"wikipediaResults\\":')[-1].split(',\\"data-sentry-element\\"')[0].replace('\\\\\\"', "").replace("\\", "")
39
+ data_1 = json.loads(objective_list)
40
+ data_2 = json.loads(wikipedia_list)
41
+ for i in data_1:
42
+ pageids_list.append(i.get("page_id"))
43
+ for i in data_2:
44
+ pageids_list.append(i.get("pageid"))
45
+ print(pageids_list)
46
+ return pageids_list
47
+
48
+
49
+ async def fetch_json(url, session):
50
+ async with session.get(url) as response:
51
+ return await response.json()
52
+
53
+ async def combination_method(name, session):
54
+ async with aiohttp.ClientSession() as session:
55
+ data = set()
56
+ new_name = name.replace("+", " ").split()
57
+ x = itertools_combinations(new_name, 2)
58
+ for i in x:
59
+ new_word = (i[0] + " " + i[1]).replace(" ", "+")
60
+ url = f"https://www.objective.inc/demos/wikipedia?query={new_word}"
61
+ page_source = await fetch_url(session, url)
62
+ for i in page_source:
63
+ data.add(i)
64
+ return data
65
+
66
+ async def single_method(name, session):
67
+ async with aiohttp.ClientSession() as session:
68
+ data = set()
69
+ new_name = name.replace("+", " ").replace("-", " ").replace("/", " ").split()
70
+ for i in new_name:
71
+ new_word = i.replace(" ", "+")
72
+ url = f"https://www.objective.inc/demos/wikipedia?query={new_word}"
73
+ page_source = await fetch_url(session, url)
74
+ for i in page_source:
75
+ data.add(i)
76
+ return data
77
+
78
+ async def mains(name, single, combi):
79
+ data = set()
80
+ disam_data = set()
81
+ qids = set()
82
+
83
+ async with aiohttp.ClientSession() as session:
84
+ url = f"https://www.objective.inc/demos/wikipedia?query={name}"
85
+ page_source = await fetch_url(session, url)
86
+ for i in page_source:
87
+ data.add(i)
88
+
89
+ wikipedia_url = f"https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch={name}&srlimit=1&srprop=&srenablerewrites=True&srinfo=suggestion&format=json"
90
+ json_data = await fetch_json(wikipedia_url, session)
91
+ suggestion = json_data.get('query', {}).get('searchinfo', {}).get('suggestion')
92
+
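+ # If Wikipedia's search API returned a spelling suggestion, search again with it and add those page ids too.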
93
+ if suggestion:
94
+ suggested_url = f"https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch={suggestion}&srlimit=10&srprop=&srenablerewrites=True&srinfo=suggestion&format=json"
95
+ json_suggestion = await fetch_json(suggested_url, session)
96
+ results = json_suggestion.get('query', {}).get('search')
97
+ for i in results:
98
+ data.add(int(i.get('pageid')))
99
+
100
+ # Handle disambiguation links
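+ # A candidate page that links to "Help:Disambiguation" is treated as a disambiguation page:
+ # each of its non-namespaced link titles is resolved to its own page id; other pages are kept as-is.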
101
+ if data != {0}:
102
+ for ids in data:
103
+ titles = set()
104
+ wikipedia_disambiguation = f"https://en.wikipedia.org/w/api.php?action=query&generator=links&format=json&redirects=1&pageids={ids}&prop=pageprops&gpllimit=50&ppprop=wikibase_item"
105
+ json_id = await fetch_json(wikipedia_disambiguation, session)
106
+ try:
107
+ title = json_id.get('query').get('pages')
108
+ for k, v in title.items():
109
+ titles.add(v.get("title"))
110
+ except:
111
+ pass
112
+
113
+ if "Help:Disambiguation" in titles:
114
+ for i in titles:
115
+ if ":" not in i:
116
+ wikipedia_disamb = f"https://en.wikipedia.org/w/api.php?action=query&format=json&titles={i}&indexpageids"
117
+ json_id = await fetch_json(wikipedia_disamb, session)
118
+ real_title = json_id.get('query').get('pageids')
119
+ disam_data.add(int(real_title[0]))
120
+ else:
121
+ disam_data.add(ids)
122
+
123
+ # Makes combinations of the name
124
+ if combi == "Yes":
125
+ if len(name.replace("+", " ").replace("-", " ").split()) >= 3:
126
+ combination_names = await combination_method(name, session)
127
+ for i in combination_names:
128
+ disam_data.add(i)
129
+
130
+ # Checks every word alone
131
+ if single == "Yes":
132
+ if len(name.replace("+", " ").replace("-", " ").replace("/", " ").split()) >= 2:
133
+ singles = await single_method(name, session)
134
+ for i in singles:
135
+ disam_data.add(i)
136
+
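+ # Map every collected Wikipedia page id to its Wikidata QID through the pageprops API.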
137
+ for ids in disam_data:
138
+ try:
139
+ wikibase_url = f"https://en.wikipedia.org/w/api.php?action=query&pageids={ids}&prop=pageprops&format=json"
140
+ json_qid = await fetch_json(wikibase_url, session)
141
+ wikidata_qid = json_qid.get('query', {}).get('pages', {}).get(str(ids), {}).get('pageprops', {}).get('wikibase_item', {})
142
+ if wikidata_qid:
143
+ qids.add(wikidata_qid)
144
+ except:
145
+ pass
146
+
147
+ # Save QIDs to file
148
+ with open(f"qids_folder/{name}.json", "w") as f:
149
+ json.dump(list(qids), f)
150
+
151
+
152
+ async def get_results(query):
153
+ user_agent = "WDQS-example Python/%s.%s" % (sys.version_info[0], sys.version_info[1])
154
+ url = "https://query.wikidata.org/sparql"
155
+ sparql = SPARQLWrapper(url, agent=user_agent)
156
+ sparql.setQuery(query)
157
+ sparql.setReturnFormat(JSON)
158
+ return sparql.query().convert()
159
+
160
+ def get_resultss(query):
161
+ user_agent = "WDQS-example Python/%s.%s" % (sys.version_info[0], sys.version_info[1])
162
+ url = "https://query.wikidata.org/sparql"
163
+ sparql = SPARQLWrapper(url, agent=user_agent)
164
+ sparql.setQuery(query)
165
+ sparql.setReturnFormat(JSON)
166
+ return sparql.query().convert()
167
+
168
+
169
+ def cleaner(text):
170
+ text = text.replace('\\', '').replace('\n', ' ')
171
+ text = re.sub(r'\{.*?\}', '', text)
172
+ text = re.sub(' +', ' ', text).strip()
173
+ return text
174
+
175
+ async def retriever(qid):
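+ # Gather the English label, aliases, and a short description (the Wikipedia intro if a sitelink exists,
+ # otherwise the Wikidata description) for a single QID.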
176
+ async with aiohttp.ClientSession() as session:
177
+ list_with_sent = []
178
+
179
+ query_label = f"""SELECT ?subjectLabel
180
+ WHERE {{
181
+ wd:{qid} rdfs:label ?subjectLabel .
182
+ FILTER(LANG(?subjectLabel) = "en")
183
+ }}
184
+ """
185
+
186
+ results = await get_results(query_label)
187
+
188
+ label = None
189
+ if results["results"]["bindings"]:
190
+ for result in results["results"]["bindings"]:
191
+ for key, value in result.items():
192
+ label = value.get("value", {}).lower() # Get label and convert to lower case
193
+
194
+ query_alias = f"""SELECT ?alias
195
+ WHERE {{
196
+ wd:{qid} skos:altLabel ?alias
197
+ FILTER(LANG(?alias) = "en")
198
+ }}
199
+ """
200
+
201
+ alias_list = []
202
+ results = await get_results(query_alias)
203
+
204
+ for result in results["results"]["bindings"]:
205
+ for key, value in result.items():
206
+ alias = value.get("value", "None")
207
+ alias_list.append(alias)
208
+
209
+ query_desci = f"""SELECT ?subjectLabel
210
+ WHERE {{
211
+ ?subjectLabel schema:about wd:{qid} ;
212
+ schema:inLanguage "en" ;
213
+ schema:isPartOf <https://en.wikipedia.org/> .
214
+ }}
215
+ """
216
+
217
+ results = await get_results(query_desci)
218
+ cleaned_first_para = "None"
219
+
220
+ if results["results"]["bindings"]:
221
+ for result in results["results"]["bindings"]:
222
+ for key, value in result.items():
223
+ desc = value.get("value", "None")
224
+
225
+ title = desc.split("/wiki/")[1]
226
+
227
+ url = f"https://en.wikipedia.org/w/api.php?action=query&prop=extracts&titles={title}&exintro=&exsentences=2&explaintext=&redirects=&formatversion=2&format=json"
228
+
229
+
230
+ json_data = await fetch_json(url, session)
231
+ cleaned_first_para = cleaner(json_data.get('query', {}).get('pages', [{}])[0].get('extract', 'None'))
232
+ else:
233
+ query_desc = f"""SELECT ?subjectLabel
234
+ WHERE {{
235
+ wd:{qid} schema:description ?subjectLabel .
236
+ FILTER(LANG(?subjectLabel) = "en")
237
+ }}
238
+ """
239
+
240
+ results = await get_results(query_desc)
241
+ if results["results"]["bindings"]:
242
+ for result in results["results"]["bindings"]:
243
+ for key, value in result.items():
244
+ cleaned_first_para = value.get("value", "None")
245
+
246
+ list_with_sent.append({"qid": qid, "label": label, "description": cleaned_first_para})
247
+
248
+ if alias_list:
249
+ for alias in alias_list:
250
+ list_with_sent.append({"qid": qid, "label": alias.lower(), "description": cleaned_first_para})
251
+
252
+ return list_with_sent
253
+
254
+ # Main async function to handle multiple QIDs with batching
255
+ async def main(name):
256
+ with open(f"qids_folder/{name}.json", "r") as f:
257
+ final_list = []
258
+ qids = json.load(f)
259
+ for q in qids:
260
+ returned_list = await retriever(q)
261
+ if returned_list:
262
+ final_list.extend(returned_list)
263
+
264
+ with open(f"info_extraction/{name}.json", "w", encoding="utf-8") as flast:
265
+ json.dump(final_list, flast)
266
+
267
+ def check_sentence(sentence):
268
+ two_consecutive_uppercase = r"[A-Z]{2}"
269
+ uppercase_followed_by_fullstop = r"[A-Z]\."
270
+
271
+ if re.search(two_consecutive_uppercase, sentence):
272
+ return True
273
+
274
+ if re.search(uppercase_followed_by_fullstop, sentence):
275
+ return True
276
+
277
+ return False
278
+
279
+ chrome_driver_path = "chromedriver.exe"
280
+ chrome_path = r'"C:\Program Files\Google\Chrome\Application\chrome.exe"'
281
+
282
+ def main_cli():
283
+ st.title(" Entity Linking Application ✨")
284
+ st.caption("This Web Application is part of my master dissertation.")
285
+
286
+
287
+ input_sentence_user = st.text_input("Enter the sentence:", "")
288
+ input_mention_user = st.text_input("Enter the mention:", "")
289
+ single = st.selectbox("Search each word individually?", ['Yes', 'No'], index=1)
290
+ combi = st.selectbox("Make combinations of each word?", ['Yes', 'No'], index=1)
291
+ disambi = st.selectbox("Run acronym disambiguation? (Enable it if the mention is nested)", ['Yes', 'No'], index=0)
292
+
293
+
294
+ if st.button("Run Entity Linking"):
295
+ if input_sentence_user and input_mention_user:
296
+ # Example logic: check if the mention is in the sentence
297
+ if input_mention_user in input_sentence_user:
298
+ st.write("Applying Data Normalization module... (1/5)")
299
+ # Data Normalization
300
+
301
+ start_time = time.time()
302
+
303
+ list_with_full_names = []
304
+ list_with_names_to_show = []
305
+
306
+ if disambi == "Yes":
307
+ response = client.chat.completions.create(
308
+ messages=[
309
+ {
310
+ "role": "system",
311
+ "content": """
312
+ I will give you one or more labels within a sentence. Your task is as follows:
313
+
314
+ Identify each label in the sentence, and check if it is an acronym.
315
+
316
+ If the label is an acronym, respond with the full name of the acronym.
317
+ If the label is not an acronym, respond with the label exactly as it was given to you.
318
+ If a label contains multiple terms (e.g., 'phase and DIC microscopy'), treat each term within the label as a separate label.
319
+
320
+ This means you should identify and explain each part of the label individually.
321
+ Each part should be on its own line in the response.
322
+ Context-Specific Terms: If the sentence context suggests a relevant term that applies to each label (such as "study" in 'morphological, sedimentological, and stratigraphical study'), add that term to each label’s explanation.
323
+
324
+ Use context clues to determine the appropriate term to add (e.g., 'study' or 'microscopy').
325
+ Output Format: Your response should contain only the explanations, formatted as follows:
326
+
327
+ Each label or part of a label should be on a new line.
328
+ Do not include any additional text, and do not repeat the original sentence.
329
+ Example 1:
330
+
331
+ Input:
332
+
333
+ label: phase and DIC microscopy
334
+ context: Tardigrades have been extracted from samples using centrifugation with Ludox AM™ and mounted on individual microscope slides in Hoyer's medium for identification under phase and DIC microscopy.
335
+ Expected response:
336
+
337
+ phase: phase microscopy
338
+ DIC microscopy: Differential interference contrast microscopy
339
+ Example 2:
340
+
341
+ Input:
342
+
343
+ label: morphological, sedimentological, and stratigraphical study
344
+ context: This paper presents results of a morphological, sedimentological, and stratigraphical study of relict beach ridges formed on a prograded coastal barrier in Bream Bay, North Island New Zealand.
345
+ Expected response:
346
+
347
+ morphological: morphological study
348
+ sedimentological: sedimentological study
349
+ stratigraphical: stratigraphical study
350
+ IMPORTANT:
351
+
352
+ Each label, even if nested within another, should be treated as an individual item.
353
+ Each individual label or acronym should be output on a separate line.
354
+ """
355
+ },
356
+ {
357
+ "role": "user",
358
+ "content": f"label:{input_mention_user}, context:{input_sentence_user}"
359
+ }
360
+ ],
361
+ temperature=1.0,
362
+ top_p=1.0,
363
+ max_tokens=1000,
364
+ model=model_name
365
+ )
366
+
367
+ print(response.choices[0].message.content)
368
+
369
+ kati = response.choices[0].message.content.splitlines()
370
+
371
+ for i in kati:
372
+ context = i.split(":")[-1].strip()
373
+ original_name = i.split(":")[0].strip()
374
+ list_with_full_names.append(context)
375
+ list_with_names_to_show.append(original_name)
376
+
377
+ name = ",".join(list_with_full_names)
378
+
379
+ else:
380
+ name = input_mention_user
381
+ list_with_full_names.append(name)
382
+ list_with_names_to_show.append(name)
383
+
384
+ input_sentence_user = input_sentence_user.replace(input_mention_user, name) # Changing the mention to the correct one
385
+
386
+ response = client.chat.completions.create(
387
+ messages=[
388
+ {
389
+ "role": "system",
390
+ "content": "Given a label or labels within a sentence, provide a brief description (2-3 sentences) explaining what the label represents, similar to how a Wikipedia entry would. Format your response as follows: label: description. I want only the description of the label, not the role in the context. Include the label in the description as well. For example: Sentiment analysis: Sentiment analysis is the use of natural language processing, text analysis, computational linguistics, and biometrics to systematically identify, extract, quantify, and study affective states and subjective information.\nText analysis: Text mining, text data mining (TDM) or text analytics is the process of deriving high-quality information from text. It involves the discovery by computer of new, previously unknown information, by automatically extracting information from different written resources.",
391
+ },
392
+ {
393
+ "role": "user",
394
+ "content": f"label:{name}, context:{input_sentence_user}"
395
+ }
396
+ ],
397
+ temperature=1.0,
398
+ top_p=1.0,
399
+ max_tokens=1000,
400
+ model=model_name
401
+ )
402
+
403
+ print(response.choices[0].message.content)
404
+
405
+ z = response.choices[0].message.content.splitlines()
406
+ list_with_contexts = []
407
+ for i in z:
408
+ context = i.split(":")[-1].strip()
409
+ list_with_contexts.append(context)
410
+
411
+ # Candidate Generation & Information Extraction
412
+ async def big_main(mention, single, combi):
413
+ mention = mention.split(",")
414
+ st.write("Applying Candidate Generation module... (2/5)")
415
+ for i in mention:
416
+ await mains(i, single, combi)
417
+ st.write("Applying Information Extraction module... (3/5)")
418
+ for i in mention:
419
+ await main(i)
420
+
421
+ asyncio.run(big_main(name, single, combi))
422
+
423
+ number = 0
424
+ for i,j,o in zip(list_with_full_names,list_with_contexts,list_with_names_to_show):
425
+ number += 1
426
+ st.write(f"Applying Candidate Selection module... (4/5) [{number}/{len(list_with_full_names)}]")
427
+ with open(f"info_extraction/{i}.json", "r") as f:
428
+ json_file = json.load(f)
429
+ lista = []
430
+ lista_1 = []
431
+ for element in json_file:
432
+ qid = element.get("qid")
433
+ link = f"https://www.wikidata.org/wiki/{qid}"
434
+ label = element.get("label")
435
+ description = element.get("description")
436
+
437
+ label_emb = model.encode([label])
438
+ desc_emb = model.encode([description])
439
+
440
+ lista.append({link: [label_emb, desc_emb]})
441
+
442
+ label_dataset_emb = model.encode([i])
443
+ desc_dataset_emb = model.encode([j])
444
+
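+ # Score each candidate as the mean of two cosine similarities: mention label vs. candidate label,
+ # and the generated description vs. the candidate description.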
445
+ for emb in lista:
446
+ for k, v in emb.items():
447
+ cossim_label = model.similarity(label_dataset_emb, v[0][0])
448
+ desc_label = model.similarity(desc_dataset_emb, v[1][0])
449
+ emb_mean = np.mean([cossim_label, desc_label])
450
+ lista_1.append({k: emb_mean})
451
+
452
+ sorted_data = sorted(lista_1, key=lambda x: list(x.values())[0], reverse=True)
453
+ st.write(f"Applying Entity Linking module... (4/5) [{number}/{len(list_with_full_names)}]")
454
+ if sorted_data:
455
+ sorted_top = sorted_data[0]
456
+ for k, v in sorted_top.items():
457
+ qid = k.split("/")[-1]
458
+
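+ # Resolve the winning QID to its English Wikipedia article and its DBpedia resource via SPARQL.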
459
+ wikidata2wikipedia = f"""
460
+ SELECT ?wikipedia
461
+ WHERE {{
462
+ ?wikipedia schema:about wd:{qid} .
463
+ ?wikipedia schema:isPartOf <https://en.wikipedia.org/> .
464
+ }}
465
+ """
466
+ results = get_resultss(wikidata2wikipedia)
467
+
468
+ for result in results["results"]["bindings"]:
469
+ for key, value in result.items():
470
+ wikipedia = value.get("value", "None")
471
+
472
+ sparql = SPARQLWrapper("http://dbpedia.org/sparql")
473
+ wikidata2dbpedia = f"""
474
+ SELECT ?dbpedia
475
+ WHERE {{
476
+ ?dbpedia owl:sameAs <http://www.wikidata.org/entity/{qid}>.
477
+ }}
478
+ """
479
+ sparql.setQuery(wikidata2dbpedia)
480
+ sparql.setReturnFormat(JSON)
481
+ results = sparql.query().convert()
482
+ for result in results["results"]["bindings"]:
483
+ dbpedia = result["dbpedia"]["value"]
484
+
485
+ st.text(f"The correct entity for '{o}' is:")
486
+ st.success(f"Wikipedia: {wikipedia}")
487
+ st.success(f"Wikidata: {k}")
488
+ st.success(f"DBpedia: {dbpedia}")
489
+ else:
490
+ st.warning(f"The entity: {o} is NIL.")
491
+ else:
492
+ st.warning(f"The mention '{input_mention_user}' was NOT found in the sentence.")
493
+ else:
494
+ st.warning("Please fill in both fields.")
495
+ end_time = time.time()
496
+ execution_time = end_time - start_time
497
+ ETA = time.strftime("%H:%M:%S", time.gmtime(execution_time))
498
+ st.write(f"Execution time: {ETA}")
499
+
500
+ folder_path = "qids_folder"
501
+ for filename in os.listdir(folder_path):
502
+ file_path = os.path.join(folder_path, filename)
503
+ os.remove(file_path)
504
+
505
+ folder_path_1 = "info_extraction"
506
+ for filename in os.listdir(folder_path_1):
507
+ file_path = os.path.join(folder_path_1, filename)
508
+ os.remove(file_path)
509
+
510
+ if __name__ == "__main__":
511
  main_cli()