alessandro trinca tornidor committed on
Commit
38adeb3
·
1 Parent(s): 91172b4

feat: refactor get_synsets_by_word_and_language() to reduce code duplication

Browse files
Files changed (2) hide show
  1. my_ghost_writer/app.py +16 -5
  2. my_ghost_writer/thesaurus.py +57 -166
my_ghost_writer/app.py CHANGED
@@ -132,16 +132,27 @@ def get_thesaurus_wordnet(body: RequestQueryThesaurusWordsapiBody | str) -> JSON
132
  try:
133
  response = pymongo_operations_rw.get_document_by_word(query=query)
134
  t1 = datetime.now()
135
- duration = (t1 - t0).total_seconds()
136
- app_logger.info(f"found local data, duration: {duration:.3f}s.")
137
- return JSONResponse(status_code=200, content={"duration": duration, "thesaurus": response, "source": "local"})
138
  except (PyMongoError, AssertionError) as pme:
139
  app_logger.info(f"{pme}! Let's try the remote service...")
140
 
141
  response = get_synsets_by_word_and_language(query, lang="eng")
142
  t1 = datetime.now()
143
- duration = (t1 - t0).total_seconds()
144
- app_logger.info(f"response.status_code: {response.status_code}, duration: {duration:.3f}s.")
 
 
 
 
 
 
 
 
 
 
 
145
  return JSONResponse(status_code=200, content={"duration": duration, "thesaurus": response, "source": "wordnet"})
146
 
147
 
 
132
  try:
133
  response = pymongo_operations_rw.get_document_by_word(query=query)
134
  t1 = datetime.now()
135
+ duration_t2t1 = (t1 - t0).total_seconds()
136
+ app_logger.info(f"found local data, duration: {duration_t2t1:.3f}s.")
137
+ return JSONResponse(status_code=200, content={"duration": duration_t2t1, "thesaurus": response, "source": "local"})
138
  except (PyMongoError, AssertionError) as pme:
139
  app_logger.info(f"{pme}! Let's try the remote service...")
140
 
141
  response = get_synsets_by_word_and_language(query, lang="eng")
142
  t1 = datetime.now()
143
+ duration_t1t0 = (t1 - t0).total_seconds()
144
+ n_results = len(response["results"])
145
+ app_logger.error(f"response, n_results: {n_results}; duration: {duration_t1t0:.3f}s.")
146
+ app_logger.info("=============================================================")
147
+ duration = duration_t1t0
148
+ if use_mongo:
149
+ app_logger.debug(f"use_mongo:{use_mongo}, inserting response '{response}' by query '{query}' on db...")
150
+ pymongo_operations_rw.insert_document(response)
151
+ del response["_id"] # since we inserted the wordsapi response on mongodb now it have a bson _id object not serializable by default
152
+ t2 = datetime.now()
153
+ duration_t2t1 = (t2 - t1).total_seconds()
154
+ app_logger.info(f"mongo insert, duration: {duration_t2t1:.3f}s.")
155
+ duration = duration_t1t0 + duration_t2t1
156
  return JSONResponse(status_code=200, content={"duration": duration, "thesaurus": response, "source": "wordnet"})
157
 
158
 
my_ghost_writer/thesaurus.py CHANGED
@@ -1,7 +1,6 @@
1
- from typing import Any
2
-
3
  from nltk.corpus import wordnet31 as wn
4
 
 
5
  from my_ghost_writer.type_hints import ResponseWordsAPI
6
 
7
 
@@ -10,181 +9,73 @@ def get_current_info_wordnet():
10
 
11
 
12
  def get_synsets_by_word_and_language(word: str, lang: str = "eng") -> ResponseWordsAPI:
13
- results = []
14
- for synset in wn.synsets(word, lang=lang):
15
- # Synonyms (lemmas for this synset, excluding the input word)
16
- synonyms = sorted(
17
- set(
18
- l.name().replace('_', ' ')
19
- for l in synset.lemmas(lang=lang)
20
- if l.name().lower() != word.lower()
21
- )
22
- )
23
- # Antonyms (from lemmas)
24
- antonyms = sorted(
25
- set(
26
- ant.name().replace('_', ' ')
27
- for l in synset.lemmas(lang=lang)
28
- for ant in l.antonyms()
29
- )
30
- )
31
- # Derivationally related forms (from lemmas)
32
- derivation = sorted(
33
- set(
34
- d.name().replace('_', ' ')
35
- for l in synset.lemmas(lang=lang)
36
- for d in l.derivationally_related_forms()
37
- )
38
- )
39
- # Pertainyms (from lemmas)
40
- pertains_to = sorted(
41
- set(
42
- p.name().replace('_', ' ')
43
- for l in synset.lemmas(lang=lang)
44
- for p in l.pertainyms()
45
- )
46
- )
47
- # Synset relations
48
- type_of = sorted(
49
- set(
50
- l.name().replace('_', ' ')
51
- for h in synset.hypernyms()
52
- for l in h.lemmas(lang=lang)
53
- )
54
- )
55
- # Hyponyms (hasTypes)
56
- has_types = sorted(
57
- set(
58
- l.name().replace('_', ' ')
59
- for h in synset.hyponyms()
60
- for l in h.lemmas(lang=lang)
61
- )
62
- )
63
- # Holonyms (partOf)
64
- part_of = sorted(
65
- set(
66
- l.name().replace('_', ' ')
67
- for h in synset.member_holonyms() + synset.part_holonyms() + synset.substance_holonyms()
68
- for l in h.lemmas(lang=lang)
69
- )
70
- )
71
- # Meronyms (hasParts)
72
- has_parts = sorted(
73
- set(
74
- l.name().replace('_', ' ')
75
- for h in synset.member_meronyms() + synset.part_meronyms() + synset.substance_meronyms()
76
- for l in h.lemmas(lang=lang)
77
- )
78
- )
79
- instance_of = sorted(
80
- set(
81
- l.name().replace('_', ' ')
82
- for h in synset.instance_hypernyms()
83
- for l in h.lemmas(lang=lang)
84
- )
85
- )
86
- has_instances = sorted(
87
- set(
88
- l.name().replace('_', ' ')
89
- for h in synset.instance_hyponyms()
90
- for l in h.lemmas(lang=lang)
91
- )
92
- )
93
- similar_to = sorted(
94
- set(
95
- l.name().replace('_', ' ')
96
- for h in synset.similar_tos()
97
- for l in h.lemmas(lang=lang)
98
- )
99
- )
100
- also = sorted(
101
  set(
102
- l.name().replace('_', ' ')
103
- for h in synset.also_sees()
104
- for l in h.lemmas(lang=lang)
105
  )
106
  )
107
- entails = sorted(
108
- set(
109
- l.name().replace('_', ' ')
110
- for h in synset.entailments()
111
- for l in h.lemmas(lang=lang)
112
- )
113
- )
114
- causes = sorted(
115
- set(
116
- l.name().replace('_', ' ')
117
- for h in synset.causes()
118
- for l in h.lemmas(lang=lang)
119
- )
120
- )
121
- verb_groups = sorted(
122
- set(
123
- l.name().replace('_', ' ')
124
- for h in synset.verb_groups()
125
- for l in h.lemmas(lang=lang)
126
- )
127
- )
128
- has_substances = sorted(
129
- set(
130
- l.name().replace('_', ' ')
131
- for h in synset.substance_meronyms()
132
- for l in h.lemmas(lang=lang)
133
- )
134
- )
135
- in_category = sorted(
136
  set(
137
- l.name().replace('_', ' ')
138
- for h in synset.topic_domains()
139
- for l in h.lemmas(lang=lang)
140
  )
141
  )
142
- usage_of = sorted(
 
 
 
 
 
 
 
143
  set(
144
- l.name().replace('_', ' ')
145
- for h in synset.usage_domains()
146
- for l in h.lemmas(lang=lang)
147
  )
148
  )
149
- obj = {
150
- "definition": synset.definition(lang=lang),
151
- }
152
  if synonyms:
153
  obj["synonyms"] = synonyms
154
- if type_of:
155
- obj["typeOf"] = type_of
156
- if has_types:
157
- obj["hasTypes"] = has_types
158
- if part_of:
159
- obj["partOf"] = part_of
160
- if has_parts:
161
- obj["hasParts"] = has_parts
162
- if antonyms:
163
- obj["antonyms"] = antonyms
164
- if derivation:
165
- obj["derivation"] = derivation
166
- if pertains_to:
167
- obj["pertainsTo"] = pertains_to
168
- if instance_of:
169
- obj["instanceOf"] = instance_of
170
- if has_instances:
171
- obj["hasInstances"] = has_instances
172
- if similar_to:
173
- obj["similarTo"] = similar_to
174
- if also:
175
- obj["also"] = also
176
- if entails:
177
- obj["entails"] = entails
178
- if has_substances:
179
- obj["hasSubstances"] = has_substances
180
- if in_category:
181
- obj["inCategory"] = in_category
182
- if usage_of:
183
- obj["usageOf"] = usage_of
184
- if causes:
185
- obj["causes"] = causes
186
- if verb_groups:
187
- obj["verbGroups"] = verb_groups
188
  results.append(obj)
189
  return {
190
  "word": word,
 
 
 
1
  from nltk.corpus import wordnet31 as wn
2
 
3
+ from my_ghost_writer.constants import app_logger
4
  from my_ghost_writer.type_hints import ResponseWordsAPI
5
 
6
 
 
9
 
10
 
11
  def get_synsets_by_word_and_language(word: str, lang: str = "eng") -> ResponseWordsAPI:
12
+ app_logger.info("start...")
13
+ def lemma_names(synsets):
14
+ return sorted(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  set(
16
+ lemma.name().replace('_', ' ')
17
+ for syn in synsets
18
+ for lemma in syn.lemmas(lang=lang)
19
  )
20
  )
21
+
22
+ def lemma_related(lemmas_input, lemmas_method):
23
+ return sorted(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  set(
25
+ rel.name().replace('_', ' ')
26
+ for lemma in lemmas_input
27
+ for rel in getattr(lemma, lemmas_method)()
28
  )
29
  )
30
+
31
+ results = []
32
+ for synset in wn.synsets(word, lang=lang):
33
+ lemmas = synset.lemmas(lang=lang)
34
+ obj = {"definition": synset.definition(lang=lang)}
35
+
36
+ # Single-line fields
37
+ synonyms = sorted(
38
  set(
39
+ lemma.name().replace('_', ' ')
40
+ for lemma in lemmas
41
+ if lemma.name().lower() != word.lower()
42
  )
43
  )
 
 
 
44
  if synonyms:
45
  obj["synonyms"] = synonyms
46
+
47
+ # Lemma-based relations
48
+ for field, method in [
49
+ ("antonyms", "antonyms"),
50
+ ("derivation", "derivationally_related_forms"),
51
+ ("pertainsTo", "pertainyms"),
52
+ ]:
53
+ values = lemma_related(lemmas, method)
54
+ if values:
55
+ obj[field] = values
56
+
57
+ # Synset-based relations
58
+ synset_relations = [
59
+ ("typeOf", synset.hypernyms()),
60
+ ("hasTypes", synset.hyponyms()),
61
+ ("partOf", synset.member_holonyms() + synset.part_holonyms() + synset.substance_holonyms()),
62
+ ("hasParts", synset.member_meronyms() + synset.part_meronyms() + synset.substance_meronyms()),
63
+ ("instanceOf", synset.instance_hypernyms()),
64
+ ("hasInstances", synset.instance_hyponyms()),
65
+ ("similarTo", synset.similar_tos()),
66
+ ("also", synset.also_sees()),
67
+ ("entails", synset.entailments()),
68
+ ("hasSubstances", synset.substance_meronyms()),
69
+ ("inCategory", synset.topic_domains()),
70
+ ("usageOf", synset.usage_domains()),
71
+ ("causes", synset.causes()),
72
+ ("verbGroups", synset.verb_groups()),
73
+ ]
74
+ for field, syns in synset_relations:
75
+ values = lemma_names(syns)
76
+ if values:
77
+ obj[field] = values
78
+
 
79
  results.append(obj)
80
  return {
81
  "word": word,