Upload 2 files
Browse files- app.py +507 -0
- requirements.txt +5 -0
@@ -0,0 +1,507 @@
1 |
import pandas as pd
2 |
import json
3 |
import numpy as np
4 |
import re
5 |
from itertools import combinations as itertools_combinations
6 |
import os
7 |
import sys
8 |
from SPARQLWrapper import SPARQLWrapper, JSON
9 |
from sentence_transformers import SentenceTransformer
10 |
import aiohttp
11 |
import asyncio
12 |
import streamlit as st
13 |
import time
14 |
from openai import OpenAI
15 |
import sys
16 |
17 |
model = SentenceTransformer("Lajavaness/bilingual-embedding-large", trust_remote_code=True)
18 |
19 |
token = os.environ["GITHUB_TOKEN"]
20 |
endpoint = "https://models.inference.ai.azure.com"
21 |
model_name = "gpt-4o"
22 |
23 |
client = OpenAI(
24 |
25 |
26 |
27 |
28 |
29 |
async def fetch_url(session, url):
30 |
pageids_list = []
31 |
async with session.get(url) as response:
32 |
x = await response.text()
33 |
objective_list = x.split('"objectiveResults\\":')[-1].split(',\\"wikipediaResults\\"')[0].replace('\\\\\\"', "").replace("\\", "")
34 |
wikipedia_list = x.split(',\\"wikipediaResults\\":')[-1].split(',\\"data-sentry-element\\"')[0].replace('\\\\\\"', "").replace("\\", "")
35 |
data_1 = json.loads(objective_list)
36 |
data_2 = json.loads(wikipedia_list)
37 |
for i in data_1:
38 |
39 |
for i in data_2:
40 |
41 |
42 |
return pageids_list
43 |
44 |
45 |
async def fetch_json(url, session):
    """Fetch *url* with an HTTP GET and return the decoded JSON body.

    Args:
        url: Address to request.
        session: Object whose ``get(url)`` returns an async context manager
            yielding a response with an awaitable ``json()`` method
            (the callers in this file pass an ``aiohttp.ClientSession``).

    Returns:
        Whatever ``response.json()`` produces for the response.
    """
    # The response is used as an async context manager so that it is
    # released as soon as the body has been decoded.
    async with session.get(url) as response:
        return await response.json()
48 |
49 |
async def combination_method(name, session):
50 |
async with aiohttp.ClientSession() as session:
51 |
data = set()
52 |
new_name = name.replace("+", " ").split()
53 |
x = itertools_combinations(new_name, 2)
54 |
for i in x:
55 |
new_word = (i[0] + " " + i[1]).replace(" ", "+")
56 |
url = f"https://www.objective.inc/demos/wikipedia?query={new_word}"
57 |
page_source = await fetch_url(session, url)
58 |
for i in page_source:
59 |
60 |
return data
61 |
62 |
async def single_method(name, session):
63 |
async with aiohttp.ClientSession() as session:
64 |
data = set()
65 |
new_name = name.replace("+", " ").replace("-", " ").replace("/", " ").split()
66 |
for i in new_name:
67 |
new_word = i.replace(" ", "+")
68 |
url = f"https://www.objective.inc/demos/wikipedia?query={new_word}"
69 |
page_source = await fetch_url(session, url)
70 |
for i in page_source:
71 |
72 |
return data
73 |
74 |
async def mains(name, single, combi):
75 |
data = set()
76 |
disam_data = set()
77 |
qids = set()
78 |
79 |
async with aiohttp.ClientSession() as session:
80 |
url = f"https://www.objective.inc/demos/wikipedia?query={name}"
81 |
page_source = await fetch_url(session, url)
82 |
for i in page_source:
83 |
84 |
85 |
wikipedia_url = f"https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch={name}&srlimit=1&srprop=&srenablerewrites=True&srinfo=suggestion&format=json"
86 |
json_data = await fetch_json(wikipedia_url, session)
87 |
suggestion = json_data.get('query', {}).get('searchinfo', {}).get('suggestion')
88 |
89 |
if suggestion:
90 |
suggested_url = f"https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch={suggestion}&srlimit=10&srprop=&srenablerewrites=True&srinfo=suggestion&format=json"
91 |
json_suggestion = await fetch_json(suggested_url, session)
92 |
results = json_suggestion.get('query', {}).get('search')
93 |
for i in results:
94 |
95 |
96 |
# Handle disambiguation links
97 |
if data != {0}:
98 |
for ids in data:
99 |
titles = set()
100 |
wikipedia_disambiguation = f"https://en.wikipedia.org/w/api.php?action=query&generator=links&format=json&redirects=1&pageids={ids}&prop=pageprops&gpllimit=50&ppprop=wikibase_item"
101 |
json_id = await fetch_json(wikipedia_disambiguation, session)
102 |
103 |
title = json_id.get('query').get('pages')
104 |
for k, v in title.items():
105 |
106 |
107 |
108 |
109 |
if "Help:Disambiguation" in titles:
110 |
for i in titles:
111 |
if ":" not in i:
112 |
wikipedia_disamb = f"https://en.wikipedia.org/w/api.php?action=query&format=json&titles={i}&indexpageids"
113 |
json_id = await fetch_json(wikipedia_disamb, session)
114 |
real_title = json_id.get('query').get('pageids')
115 |
116 |
117 |
118 |
119 |
# Makes combinations of the name
120 |
if combi == "Yes":
121 |
if len(name.replace("+", " ").replace("-", " ").split()) >= 3:
122 |
combination_names = await combination_method(name, session)
123 |
for i in combination_names:
124 |
125 |
126 |
# Checks every word alone
127 |
if single == "Yes":
128 |
if len(name.replace("+", " ").replace("-", " ").replace("/", " ").split()) >= 2:
129 |
singles = await single_method(name, session)
130 |
for i in singles:
131 |
132 |
133 |
for ids in disam_data:
134 |
135 |
wikibase_url = f"https://en.wikipedia.org/w/api.php?action=query&pageids={ids}&prop=pageprops&format=json"
136 |
json_qid = await fetch_json(wikibase_url, session)
137 |
wikidata_qid = json_qid.get('query', {}).get('pages', {}).get(str(ids), {}).get('pageprops', {}).get('wikibase_item', {})
138 |
if wikidata_qid:
139 |
140 |
141 |
142 |
143 |
# Save QIDs to file
144 |
with open(f"qids_folder/{name}.json", "w") as f:
145 |
json.dump(list(qids), f)
146 |
147 |
148 |
async def get_results(query):
    """Run a SPARQL *query* against the Wikidata Query Service.

    Declared ``async`` so callers can ``await`` it; the request itself is
    issued through the synchronous ``SPARQLWrapper`` call below.

    Args:
        query: SPARQL query text.

    Returns:
        The converted query result, requested in JSON format (callers in this
        file index it as ``results["results"]["bindings"]``).
    """
    user_agent = "WDQS-example Python/%s.%s" % (sys.version_info[0], sys.version_info[1])
    url = "https://query.wikidata.org/sparql"
    sparql = SPARQLWrapper(url, agent=user_agent)
    # Without these two calls the wrapper never receives the query text and
    # the result is not converted from JSON.
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    return sparql.query().convert()
155 |
156 |
def get_resultss(query):
    """Synchronous variant: run a SPARQL *query* against the Wikidata Query Service.

    Args:
        query: SPARQL query text.

    Returns:
        The converted query result, requested in JSON format (callers in this
        file index it as ``results["results"]["bindings"]``).
    """
    user_agent = "WDQS-example Python/%s.%s" % (sys.version_info[0], sys.version_info[1])
    url = "https://query.wikidata.org/sparql"
    sparql = SPARQLWrapper(url, agent=user_agent)
    # Without these two calls the wrapper never receives the query text and
    # the result is not converted from JSON.
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    return sparql.query().convert()
163 |
164 |
165 |
def cleaner(text):
    """Return *text* stripped of backslashes, newlines and ``{...}`` spans.

    Steps, in order:
      1. delete every backslash and turn each newline into a space;
      2. delete every shortest ``{...}`` span (non-greedy, single line);
      3. collapse runs of spaces to one space and trim both ends.

    Args:
        text: String to clean.

    Returns:
        The cleaned string.
    """
    text = text.replace('\\', '').replace('\n', ' ')
    # Non-greedy so that two separate brace groups are removed individually
    # rather than everything between the first "{" and the last "}".
    text = re.sub(r'\{.*?\}', '', text)
    text = re.sub(' +', ' ', text).strip()
    return text
170 |
171 |
async def retriever(qid):
172 |
async with aiohttp.ClientSession() as session:
173 |
list_with_sent = []
174 |
175 |
query_label = f"""SELECT ?subjectLabel
176 |
177 |
wd:{qid} rdfs:label ?subjectLabel .
178 |
FILTER(LANG(?subjectLabel) = "en")
179 |
180 |
181 |
182 |
results = await get_results(query_label)
183 |
184 |
label = None
185 |
if results["results"]["bindings"]:
186 |
for result in results["results"]["bindings"]:
187 |
for key, value in result.items():
188 |
label = value.get("value", {}).lower() # Get label and convert to lower case
189 |
190 |
query_alias = f"""SELECT ?alias
191 |
192 |
wd:{qid} skos:altLabel ?alias
193 |
FILTER(LANG(?alias) = "en")
194 |
195 |
196 |
197 |
alias_list = []
198 |
results = await get_results(query_alias)
199 |
200 |
for result in results["results"]["bindings"]:
201 |
for key, value in result.items():
202 |
alias = value.get("value", "None")
203 |
204 |
205 |
query_desci = f"""SELECT ?subjectLabel
206 |
207 |
?subjectLabel schema:about wd:{qid} ;
208 |
schema:inLanguage "en" ;
209 |
schema:isPartOf <https://en.wikipedia.org/> .
210 |
211 |
212 |
213 |
results = await get_results(query_desci)
214 |
cleaned_first_para = "None"
215 |
216 |
if results["results"]["bindings"]:
217 |
for result in results["results"]["bindings"]:
218 |
for key, value in result.items():
219 |
desc = value.get("value", "None")
220 |
221 |
title = desc.split("/wiki/")[1]
222 |
223 |
url = f"https://en.wikipedia.org/w/api.php?action=query&prop=extracts&titles={title}&exintro=&exsentences=2&explaintext=&redirects=&formatversion=2&format=json"
224 |
225 |
226 |
json_data = await fetch_json(url, session)
227 |
cleaned_first_para = cleaner(json_data.get('query', {}).get('pages', [{}])[0].get('extract', 'None'))
228 |
229 |
query_desc = f"""SELECT ?subjectLabel
230 |
231 |
wd:{qid} schema:description ?subjectLabel .
232 |
FILTER(LANG(?subjectLabel) = "en")
233 |
234 |
235 |
236 |
results = await get_results(query_desc)
237 |
if results["results"]["bindings"]:
238 |
for result in results["results"]["bindings"]:
239 |
for key, value in result.items():
240 |
cleaned_first_para = value.get("value", "None")
241 |
242 |
list_with_sent.append({"qid": qid, "label": label, "description": cleaned_first_para})
243 |
244 |
if alias_list:
245 |
for alias in alias_list:
246 |
list_with_sent.append({"qid": qid, "label": alias.lower(), "description": cleaned_first_para})
247 |
248 |
return list_with_sent
249 |
250 |
# Main async function to handle multiple QIDs with batching
251 |
async def main(name):
252 |
with open(f"qids_folder/{name}.json", "r") as f:
253 |
final_list = []
254 |
qids = json.load(f)
255 |
for q in qids:
256 |
returned_list = await retriever(q)
257 |
if returned_list:
258 |
259 |
260 |
with open(f"info_extraction/{name}.json", "w", encoding="utf-8") as flast:
261 |
json.dump(final_list, flast)
262 |
263 |
def check_sentence(sentence):
    """Report whether *sentence* contains an uppercase pattern.

    Two patterns are checked anywhere in the string:
      * two consecutive ASCII uppercase letters (e.g. ``"DIC"``);
      * one ASCII uppercase letter immediately followed by a full stop
        (e.g. ``"J."``).

    Args:
        sentence: String to inspect.

    Returns:
        ``True`` if either pattern occurs, otherwise ``False``.
    """
    two_consecutive_uppercase = r"[A-Z]{2}"
    uppercase_followed_by_fullstop = r"[A-Z]\."
    patterns = (two_consecutive_uppercase, uppercase_followed_by_fullstop)
    return any(re.search(pattern, sentence) for pattern in patterns)
274 |
275 |
chrome_driver_path = "chromedriver.exe"
276 |
chrome_path = r'"C:\Program Files\Google\Chrome\Application\chrome.exe"'
277 |
278 |
def main_cli():
279 |
st.title("✨ Entity Linking Application ✨")
280 |
st.caption("This Web Application is part of my master dissertation.")
281 |
282 |
283 |
input_sentence_user = st.text_input("Enter the sentence:", "")
284 |
input_mention_user = st.text_input("Enter the mention:", "")
285 |
single = st.selectbox("Search each word individually?", ['Yes', 'No'], index=1)
286 |
combi = st.selectbox("Make combinations of each word?", ['Yes', 'No'], index=1)
287 |
disambi = st.selectbox("Run acronym disambiguation? (Enable it if the mention is nested)", ['Yes', 'No'], index=0)
288 |
289 |
290 |
if st.button("Run Entity Linking"):
291 |
if input_sentence_user and input_mention_user:
292 |
# Example logic: check if the mention is in the sentence
293 |
if input_mention_user in input_sentence_user:
294 |
st.write("Applying Data Normalization module... (1/5)")
295 |
# Data Normalization
296 |
297 |
start_time = time.time()
298 |
299 |
list_with_full_names = []
300 |
list_with_names_to_show = []
301 |
302 |
if disambi == "Yes":
303 |
response = client.chat.completions.create(
304 |
305 |
306 |
"role": "system",
307 |
"content": """
308 |
I will give you one or more labels within a sentence. Your task is as follows:
309 |
310 |
Identify each label in the sentence, and check if it is an acronym.
311 |
312 |
If the label is an acronym, respond with the full name of the acronym.
313 |
If the label is not an acronym, respond with the label exactly as it was given to you.
314 |
If a label contains multiple terms (e.g., 'phase and DIC microscopy'), treat each term within the label as a separate label.
315 |
316 |
This means you should identify and explain each part of the label individually.
317 |
Each part should be on its own line in the response.
318 |
Context-Specific Terms: If the sentence context suggests a relevant term that applies to each label (such as "study" in 'morphological, sedimentological, and stratigraphical study'), add that term to each label’s explanation.
319 |
320 |
Use context clues to determine the appropriate term to add (e.g., 'study' or 'microscopy').
321 |
Output Format: Your response should contain only the explanations, formatted as follows:
322 |
323 |
Each label or part of a label should be on a new line.
324 |
Do not include any additional text, and do not repeat the original sentence.
325 |
Example 1:
326 |
327 |
328 |
329 |
label: phase and DIC microscopy
330 |
context: Tardigrades have been extracted from samples using centrifugation with Ludox AM™ and mounted on individual microscope slides in Hoyer's medium for identification under phase and DIC microscopy.
331 |
Expected response:
332 |
333 |
phase: phase microscopy
334 |
DIC microscopy: Differential interference contrast microscopy
335 |
Example 2:
336 |
337 |
338 |
339 |
label: morphological, sedimentological, and stratigraphical study
340 |
context: This paper presents results of a morphological, sedimentological, and stratigraphical study of relict beach ridges formed on a prograded coastal barrier in Bream Bay, North Island New Zealand.
341 |
Expected response:
342 |
343 |
morphological: morphological study
344 |
sedimentological: sedimentological study
345 |
stratigraphical: stratigraphical study
346 |
347 |
348 |
Each label, even if nested within another, should be treated as an individual item.
349 |
Each individual label or acronym should be output on a separate line.
350 |
351 |
352 |
353 |
"role": "user",
354 |
"content": f"label:{input_mention_user}, context:{input_sentence_user}"
355 |
356 |
357 |
358 |
359 |
360 |
361 |
362 |
363 |
364 |
365 |
kati = response.choices[0].message.content.splitlines()
366 |
367 |
for i in kati:
368 |
context = i.split(":")[-1].strip()
369 |
original_name = i.split(":")[0].strip()
370 |
371 |
372 |
373 |
name = ",".join(list_with_full_names)
374 |
375 |
376 |
name = input_mention_user
377 |
378 |
379 |
380 |
input_sentence_user = input_sentence_user.replace(input_mention_user, name) # Changing the mention to the correct one
381 |
382 |
response = client.chat.completions.create(
383 |
384 |
385 |
"role": "system",
386 |
"content": "Given a label or labels within a sentence, provide a brief description (2-3 sentences) explaining what the label represents, similar to how a Wikipedia entry would. Format your response as follows: label: description. I want only the description of the label, not the role in the context. Include the label in the description as well. For example: Sentiment analysis: Sentiment analysis is the use of natural language processing, text analysis, computational linguistics, and biometrics to systematically identify, extract, quantify, and study affective states and subjective information.\nText analysis: Text mining, text data mining (TDM) or text analytics is the process of deriving high-quality information from text. It involves the discovery by computer of new, previously unknown information, by automatically extracting information from different written resources.",
387 |
388 |
389 |
"role": "user",
390 |
"content": f"label:{name}, context:{input_sentence_user}"
391 |
392 |
393 |
394 |
395 |
396 |
397 |
398 |
399 |
400 |
401 |
z = response.choices[0].message.content.splitlines()
402 |
list_with_contexts = []
403 |
for i in z:
404 |
context = i.split(":")[-1].strip()
405 |
406 |
407 |
# Candidate Generation & Information Extraction
408 |
async def big_main(mention, single, combi):
409 |
mention = mention.split(",")
410 |
st.write("Applying Candidate Generation module... (2/5)")
411 |
for i in mention:
412 |
await mains(i, single, combi)
413 |
st.write("Applying Information Extraction module... (3/5)")
414 |
for i in mention:
415 |
await main(i)
416 |
417 |
asyncio.run(big_main(name, single, combi))
418 |
419 |
number = 0
420 |
for i,j,o in zip(list_with_full_names,list_with_contexts,list_with_names_to_show):
421 |
number += 1
422 |
st.write(f"Applying Candidate Selection module... (4/5) [{number}/{len(list_with_full_names)}]")
423 |
with open(f"info_extraction/{i}.json", "r") as f:
424 |
json_file = json.load(f)
425 |
lista = []
426 |
lista_1 = []
427 |
for element in json_file:
428 |
qid = element.get("qid")
429 |
link = f"https://www.wikidata.org/wiki/{qid}"
430 |
label = element.get("label")
431 |
description = element.get("description")
432 |
433 |
label_emb = model.encode([label])
434 |
desc_emb = model.encode([description])
435 |
436 |
lista.append({link: [label_emb, desc_emb]})
437 |
438 |
label_dataset_emb = model.encode([i])
439 |
desc_dataset_emb = model.encode([j])
440 |
441 |
for emb in lista:
442 |
for k, v in emb.items():
443 |
cossim_label = model.similarity(label_dataset_emb, v[0][0])
444 |
desc_label = model.similarity(desc_dataset_emb, v[1][0])
445 |
emb_mean = np.mean([cossim_label, desc_label])
446 |
lista_1.append({k: emb_mean})
447 |
448 |
sorted_data = sorted(lista_1, key=lambda x: list(x.values())[0], reverse=True)
449 |
st.write(f"Applying Entity Linking module... (4/5) [{number}/{len(list_with_full_names)}]")
450 |
if sorted_data:
451 |
sorted_top = sorted_data[0]
452 |
for k, v in sorted_top.items():
453 |
qid = k.split("/")[-1]
454 |
455 |
wikidata2wikipedia = f"""
456 |
SELECT ?wikipedia
457 |
458 |
?wikipedia schema:about wd:{qid} .
459 |
?wikipedia schema:isPartOf <https://en.wikipedia.org/> .
460 |
461 |
462 |
results = get_resultss(wikidata2wikipedia)
463 |
464 |
for result in results["results"]["bindings"]:
465 |
for key, value in result.items():
466 |
wikipedia = value.get("value", "None")
467 |
468 |
sparql = SPARQLWrapper("http://dbpedia.org/sparql")
469 |
wikidata2dbpedia = f"""
470 |
SELECT ?dbpedia
471 |
472 |
?dbpedia owl:sameAs <http://www.wikidata.org/entity/{qid}>.
473 |
474 |
475 |
476 |
477 |
results = sparql.query().convert()
478 |
for result in results["results"]["bindings"]:
479 |
dbpedia = result["dbpedia"]["value"]
480 |
481 |
st.text(f"The correct entity for '{o}' is:")
482 |
st.success(f"Wikipedia: {wikipedia}")
483 |
st.success(f"Wikidata: {k}")
484 |
st.success(f"DBpedia: {dbpedia}")
485 |
486 |
st.warning(f"The entity: {o} is NIL.")
487 |
488 |
st.warning(f"The mention '{input_mention_user}' was NOT found in the sentence.")
489 |
490 |
st.warning("Please fill in both fields.")
491 |
end_time = time.time()
492 |
execution_time = end_time - start_time
493 |
ETA = time.strftime("%H:%M:%S", time.gmtime(execution_time))
494 |
st.write(f"Execution time: {ETA}")
495 |
496 |
folder_path = "qids_folder"
497 |
for filename in os.listdir(folder_path):
498 |
file_path = os.path.join(folder_path, filename)
499 |
500 |
501 |
folder_path_1 = "info_extraction"
502 |
for filename in os.listdir(folder_path_1):
503 |
file_path = os.path.join(folder_path_1, filename)
504 |
505 |
506 |
if __name__ == "__main__":
507 |
@@ -0,0 +1,5 @@
1 |
2 |
3 |
4 |
5 |