NikosKprl committed on
Commit
2d2d25b
·
verified ·
1 Parent(s): 07c6579

Update ✨Entity Linking Application✨.py

Browse files
Files changed (1) hide show
  1. ✨Entity Linking Application✨.py +180 -176
✨Entity Linking Application✨.py CHANGED
@@ -298,198 +298,202 @@ def main_cli():
298
  if input_sentence_user and input_mention_user:
299
  # check if the mention is in the sentence
300
  if input_mention_user in input_sentence_user:
301
- st.write("Applying Data Normalization module... (1/5)")
302
- # Data Normalization
303
-
304
- start_time = time.time()
305
-
306
- list_with_full_names = []
307
- list_with_names_to_show = []
308
-
309
- if disambi == "Yes":
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
310
  response = client.chat.completions.create(
311
- messages=[
312
- {
313
- "role": "system",
314
- "content": """
315
- I will give you one or more labels within a sentence. Your task is as follows:
316
-
317
- Identify each label in the sentence, and check if it is an acronym.
318
-
319
- If the label is an acronym, respond with the full name of the acronym.
320
- If the label is not an acronym, respond with the label exactly as it was given to you.
321
- If a label contains multiple terms (e.g., 'phase and DIC microscopy'), treat each term within the label as a separate label.
322
-
323
- This means you should identify and explain each part of the label individually.
324
- Each part should be on its own line in the response.
325
- Context-Specific Terms: If the sentence context suggests a relevant term that applies to each label (such as "study" in 'morphological, sedimentological, and stratigraphical study'), add that term to each label’s explanation.
326
-
327
- Use context clues to determine the appropriate term to add (e.g., 'study' or 'microscopy').
328
- Output Format: Your response should contain only the explanations, formatted as follows:
329
-
330
- Each label or part of a label should be on a new line.
331
- Do not include any additional text, and do not repeat the original sentence.
332
- Example 1:
333
-
334
- Input:
335
-
336
- label: phase and DIC microscopy
337
- context: Tardigrades have been extracted from samples using centrifugation with Ludox AM™ and mounted on individual microscope slides in Hoyer's medium for identification under phase and DIC microscopy.
338
- Expected response:
339
-
340
- phase: phase microscopy
341
- DIC microscopy: Differential interference contrast microscopy
342
- Example 2:
343
-
344
- Input:
345
-
346
- label: morphological, sedimentological, and stratigraphical study
347
- context: This paper presents results of a morphological, sedimentological, and stratigraphical study of relict beach ridges formed on a prograded coastal barrier in Bream Bay, North Island New Zealand.
348
- Expected response:
349
-
350
- morphological: morphological study
351
- sedimentological: sedimentological study
352
- stratigraphical: stratigraphical study
353
- IMPORTANT:
354
-
355
- Each label, even if nested within another, should be treated as an individual item.
356
- Each individual label or acronym should be output on a separate line.
357
- """
358
- },
359
- {
360
- "role": "user",
361
- "content": f"label:{input_mention_user}, context:{input_sentence_user}"
362
- }
363
- ],
364
- temperature=1.0,
365
- top_p=1.0,
366
- max_tokens=1000,
367
- model=model_name
368
- )
369
-
370
 
371
- kati = response.choices[0].message.content.splitlines()
372
  print(response.choices[0].message.content)
373
- for i in kati:
 
374
  context = i.split(":")[-1].strip()
375
- original_name = i.split(":")[0].strip()
376
- list_with_full_names.append(context)
377
- list_with_names_to_show.append(original_name)
378
-
379
- name = ",".join(list_with_full_names)
380
-
381
- else:
382
- name = input_mention_user
383
- list_with_full_names.append(name)
384
- list_with_names_to_show.append(name)
385
-
386
- input_sentence_user = input_sentence_user.replace(input_mention_user, name) # Changing the mention to the correct one
387
-
388
- response = client.chat.completions.create(
389
- messages=[
390
- {
391
- "role": "system",
392
- "content": "Given a label or labels within a sentence, provide a brief description (2-3 sentences) explaining what the label represents, similar to how a Wikipedia entry would. Format your response as follows: label: description. I want only the description of the label, not the role in the context. Include the label in the description as well. For example: Sentiment analysis: Sentiment analysis is the use of natural language processing, text analysis, computational linguistics, and biometrics to systematically identify, extract, quantify, and study affective states and subjective information.\nText analysis: Text mining, text data mining (TDM) or text analytics is the process of deriving high-quality information from text. It involves the discovery by computer of new, previously unknown information, by automatically extracting information from different written resources.",
393
- },
394
- {
395
- "role": "user",
396
- "content": f"label:{name}, context:{input_sentence_user}"
397
- }
398
- ],
399
- temperature=1.0,
400
- top_p=1.0,
401
- max_tokens=1000,
402
- model=model_name
403
- )
404
-
405
-
406
- z = response.choices[0].message.content.splitlines()
407
- print(response.choices[0].message.content)
408
- list_with_contexts = []
409
- for i in z:
410
- context = i.split(":")[-1].strip()
411
- list_with_contexts.append(context)
412
-
413
  # Candidate Retrieval & Information Gathering
414
  async def big_main(mention, single, combi):
415
  mention = mention.split(",")
416
- st.write("Applying Candidate Retrieval module... (2/5)")
417
- for i in mention:
418
- await mains(i, single, combi)
419
- st.write("Applying Information Gathering module... (3/5)")
420
- for i in mention:
421
- await main(i)
 
 
422
 
423
  asyncio.run(big_main(name, single, combi))
424
 
425
  number = 0
426
  for i,j,o in zip(list_with_full_names,list_with_contexts,list_with_names_to_show):
427
  number += 1
428
- st.write(f"Applying Candidate Selection module... (4/5) [{number}/{len(list_with_full_names)}]")
429
- with open(f"/home/user/app/info_extraction/{i}.json", "r") as f:
430
- json_file = json.load(f)
431
- lista = []
432
- lista_1 = []
433
- for element in json_file:
434
- qid = element.get("qid")
435
- link = f"https://www.wikidata.org/wiki/{qid}"
436
- label = element.get("label")
437
- description = element.get("description")
438
-
439
- label_emb = model.encode([label])
440
- desc_emb = model.encode([description])
441
-
442
- lista.append({link: [label_emb, desc_emb]})
443
-
444
- label_dataset_emb = model.encode([i])
445
- desc_dataset_emb = model.encode([j])
446
-
447
- for emb in lista:
448
- for k, v in emb.items():
449
- cossim_label = model.similarity(label_dataset_emb, v[0][0])
450
- desc_label = model.similarity(desc_dataset_emb, v[1][0])
451
- emb_mean = np.mean([cossim_label, desc_label])
452
- lista_1.append({k: emb_mean})
453
-
454
- sorted_data = sorted(lista_1, key=lambda x: list(x.values())[0], reverse=True)
455
- st.write(f"Applying Candidate Matching module... (4/5) [{number}/{len(list_with_full_names)}]")
456
- if sorted_data:
457
- sorted_top = sorted_data[0]
458
- for k, v in sorted_top.items():
459
- qid = k.split("/")[-1]
460
 
461
- wikidata2wikipedia = f"""
462
- SELECT ?wikipedia
463
- WHERE {{
464
- ?wikipedia schema:about wd:{qid} .
465
- ?wikipedia schema:isPartOf <https://en.wikipedia.org/> .
466
- }}
467
- """
468
- results = get_resultss(wikidata2wikipedia)
469
-
470
- for result in results["results"]["bindings"]:
471
- for key, value in result.items():
472
- wikipedia = value.get("value", "None")
473
 
474
- sparql = SPARQLWrapper("http://dbpedia.org/sparql")
475
- wikidata2dbpedia = f"""
476
- SELECT ?dbpedia
477
- WHERE {{
478
- ?dbpedia owl:sameAs <http://www.wikidata.org/entity/{qid}>.
479
- }}
480
- """
481
- sparql.setQuery(wikidata2dbpedia)
482
- sparql.setReturnFormat(JSON)
483
- results = sparql.query().convert()
484
- for result in results["results"]["bindings"]:
485
- dbpedia = result["dbpedia"]["value"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
486
 
487
- st.text(f"The correct entity for '{o}' is:")
488
- st.success(f"Wikipedia: {wikipedia}")
489
- st.success(f"Wikidata: {k}")
490
- st.success(f"DBpedia: {dbpedia}")
491
- else:
492
- st.warning(f"The entity: {o} is NIL.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
493
  else:
494
  st.warning(f"The mention '{input_mention_user}' was NOT found in the sentence.")
495
  else:
 
298
  if input_sentence_user and input_mention_user:
299
  # check if the mention is in the sentence
300
  if input_mention_user in input_sentence_user:
301
+ with st.spinner("Applying Data Normalization module... (1/5)")
302
+ # Data Normalization
303
+ start_time = time.time()
304
+
305
+ list_with_full_names = []
306
+ list_with_names_to_show = []
307
+
308
+ if disambi == "Yes":
309
+ response = client.chat.completions.create(
310
+ messages=[
311
+ {
312
+ "role": "system",
313
+ "content": """
314
+ I will give you one or more labels within a sentence. Your task is as follows:
315
+
316
+ Identify each label in the sentence, and check if it is an acronym.
317
+
318
+ If the label is an acronym, respond with the full name of the acronym.
319
+ If the label is not an acronym, respond with the label exactly as it was given to you.
320
+ If a label contains multiple terms (e.g., 'phase and DIC microscopy'), treat each term within the label as a separate label.
321
+
322
+ This means you should identify and explain each part of the label individually.
323
+ Each part should be on its own line in the response.
324
+ Context-Specific Terms: If the sentence context suggests a relevant term that applies to each label (such as "study" in 'morphological, sedimentological, and stratigraphical study'), add that term to each label’s explanation.
325
+
326
+ Use context clues to determine the appropriate term to add (e.g., 'study' or 'microscopy').
327
+ Output Format: Your response should contain only the explanations, formatted as follows:
328
+
329
+ Each label or part of a label should be on a new line.
330
+ Do not include any additional text, and do not repeat the original sentence.
331
+ Example 1:
332
+
333
+ Input:
334
+
335
+ label: phase and DIC microscopy
336
+ context: Tardigrades have been extracted from samples using centrifugation with Ludox AM™ and mounted on individual microscope slides in Hoyer's medium for identification under phase and DIC microscopy.
337
+ Expected response:
338
+
339
+ phase: phase microscopy
340
+ DIC microscopy: Differential interference contrast microscopy
341
+ Example 2:
342
+
343
+ Input:
344
+
345
+ label: morphological, sedimentological, and stratigraphical study
346
+ context: This paper presents results of a morphological, sedimentological, and stratigraphical study of relict beach ridges formed on a prograded coastal barrier in Bream Bay, North Island New Zealand.
347
+ Expected response:
348
+
349
+ morphological: morphological study
350
+ sedimentological: sedimentological study
351
+ stratigraphical: stratigraphical study
352
+ IMPORTANT:
353
+
354
+ Each label, even if nested within another, should be treated as an individual item.
355
+ Each individual label or acronym should be output on a separate line.
356
+ """
357
+ },
358
+ {
359
+ "role": "user",
360
+ "content": f"label:{input_mention_user}, context:{input_sentence_user}"
361
+ }
362
+ ],
363
+ temperature=1.0,
364
+ top_p=1.0,
365
+ max_tokens=1000,
366
+ model=model_name
367
+ )
368
+
369
+
370
+ kati = response.choices[0].message.content.splitlines()
371
+ print(response.choices[0].message.content)
372
+ for i in kati:
373
+ context = i.split(":")[-1].strip()
374
+ original_name = i.split(":")[0].strip()
375
+ list_with_full_names.append(context)
376
+ list_with_names_to_show.append(original_name)
377
+
378
+ name = ",".join(list_with_full_names)
379
+
380
+ else:
381
+ name = input_mention_user
382
+ list_with_full_names.append(name)
383
+ list_with_names_to_show.append(name)
384
+
385
+ input_sentence_user = input_sentence_user.replace(input_mention_user, name) # Changing the mention to the correct one
386
+
387
  response = client.chat.completions.create(
388
+ messages=[
389
+ {
390
+ "role": "system",
391
+ "content": "Given a label or labels within a sentence, provide a brief description (2-3 sentences) explaining what the label represents, similar to how a Wikipedia entry would. Format your response as follows: label: description. I want only the description of the label, not the role in the context. Include the label in the description as well. For example: Sentiment analysis: Sentiment analysis is the use of natural language processing, text analysis, computational linguistics, and biometrics to systematically identify, extract, quantify, and study affective states and subjective information.\nText analysis: Text mining, text data mining (TDM) or text analytics is the process of deriving high-quality information from text. It involves the discovery by computer of new, previously unknown information, by automatically extracting information from different written resources.",
392
+ },
393
+ {
394
+ "role": "user",
395
+ "content": f"label:{name}, context:{input_sentence_user}"
396
+ }
397
+ ],
398
+ temperature=1.0,
399
+ top_p=1.0,
400
+ max_tokens=1000,
401
+ model=model_name
402
+ )
403
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
404
 
405
+ z = response.choices[0].message.content.splitlines()
406
  print(response.choices[0].message.content)
407
+ list_with_contexts = []
408
+ for i in z:
409
  context = i.split(":")[-1].strip()
410
+ list_with_contexts.append(context)
411
+ st.write("✅ Applied Data Normilzation module (1/5)")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
412
  # Candidate Retrieval & Information Gathering
413
  async def big_main(mention, single, combi):
414
  mention = mention.split(",")
415
+ with st.spinner("Applying Candidate Retrieval module... (2/5)"):
416
+ for i in mention:
417
+ await mains(i, single, combi)
418
+ st.write(" Applied Candidate Retrieval module (2/5)")
419
+ with st.spinner("Applying Information Gathering module... (3/5)"):
420
+ for i in mention:
421
+ await main(i)
422
+ st.write("✅ Applied Information Gathering module (3/5)")
423
 
424
  asyncio.run(big_main(name, single, combi))
425
 
426
  number = 0
427
  for i,j,o in zip(list_with_full_names,list_with_contexts,list_with_names_to_show):
428
  number += 1
429
+ with st.spinner(f"Applying Candidate Selection module... (4/5) [{number}/{len(list_with_full_names)}]")):
430
+ with open(f"/home/user/app/info_extraction/{i}.json", "r") as f:
431
+ json_file = json.load(f)
432
+ lista = []
433
+ lista_1 = []
434
+ for element in json_file:
435
+ qid = element.get("qid")
436
+ link = f"https://www.wikidata.org/wiki/{qid}"
437
+ label = element.get("label")
438
+ description = element.get("description")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
439
 
440
+ label_emb = model.encode([label])
441
+ desc_emb = model.encode([description])
 
 
 
 
 
 
 
 
 
 
442
 
443
+ lista.append({link: [label_emb, desc_emb]})
444
+
445
+ label_dataset_emb = model.encode([i])
446
+ desc_dataset_emb = model.encode([j])
447
+
448
+ for emb in lista:
449
+ for k, v in emb.items():
450
+ cossim_label = model.similarity(label_dataset_emb, v[0][0])
451
+ desc_label = model.similarity(desc_dataset_emb, v[1][0])
452
+ emb_mean = np.mean([cossim_label, desc_label])
453
+ lista_1.append({k: emb_mean})
454
+ print(k)
455
+
456
+ sorted_data = sorted(lista_1, key=lambda x: list(x.values())[0], reverse=True)
457
+ st.write(f"✅ Applined Candidate Selection module (4/5) [{number}/{len(list_with_full_names)}]")
458
+ with st.spinner(f"Applying Candidate Matching module... (5/5) [{number}/{len(list_with_full_names)}]"):
459
+ if sorted_data:
460
+ sorted_top = sorted_data[0]
461
+ for k, v in sorted_top.items():
462
+ qid = k.split("/")[-1]
463
+
464
+ wikidata2wikipedia = f"""
465
+ SELECT ?wikipedia
466
+ WHERE {{
467
+ ?wikipedia schema:about wd:{qid} .
468
+ ?wikipedia schema:isPartOf <https://en.wikipedia.org/> .
469
+ }}
470
+ """
471
+ results = get_resultss(wikidata2wikipedia)
472
+
473
+ for result in results["results"]["bindings"]:
474
+ for key, value in result.items():
475
+ wikipedia = value.get("value", "None")
476
 
477
+ sparql = SPARQLWrapper("http://dbpedia.org/sparql")
478
+ wikidata2dbpedia = f"""
479
+ SELECT ?dbpedia
480
+ WHERE {{
481
+ ?dbpedia owl:sameAs <http://www.wikidata.org/entity/{qid}>.
482
+ }}
483
+ """
484
+ sparql.setQuery(wikidata2dbpedia)
485
+ sparql.setReturnFormat(JSON)
486
+ results = sparql.query().convert()
487
+ for result in results["results"]["bindings"]:
488
+ dbpedia = result["dbpedia"]["value"]
489
+
490
+ st.text(f"The correct entity for '{o}' is:")
491
+ st.success(f"Wikipedia: {wikipedia}")
492
+ st.success(f"Wikidata: {k}")
493
+ st.success(f"DBpedia: {dbpedia}")
494
+ else:
495
+ st.warning(f"The entity: {o} is NIL.")
496
+ st.write(f"✅ Applied Candidate Matching module (5/5) [{number}/{len(list_with_full_names)}]")
497
  else:
498
  st.warning(f"The mention '{input_mention_user}' was NOT found in the sentence.")
499
  else: