alessandro trinca tornidor committed
Commit 5ffb0e3 · 1 Parent(s): a901fdc
feat: multi-words substitution support, backend part

Files changed:
- .vscode/launch.json (+71, -1)
- my_ghost_writer/app.py (+66, -4)
- my_ghost_writer/constants.py (+2, -1)
- my_ghost_writer/text_parsers2.py (+91, -18)
- my_ghost_writer/type_hints.py (+16, -1)
.vscode/launch.json
CHANGED
@@ -12,7 +12,77 @@
             "env": {
                 "IS_TESTING": "TRUE",
                 "LOG_LEVEL": "DEBUG"
-            }
+            }
+        },
+        {
+            "type": "msedge",
+            "request": "launch",
+            "name": "Launch app port 8000",
+            "url": "http://localhost:8000/#",
+            "runtimeExecutable": "/snap/bin/chromium",
+            "file": "${workspaceFolder}/lite.koboldai.net/index.html",
+            "webRoot": "${workspaceFolder}/lite.koboldai.net",
+            "trace": true,
+            "port": 8000,
+            "runtimeArgs": [
+                "--remote-debugging-port=9222"
+            ]
+        },
+        {
+            "type": "msedge",
+            "name": "Launch Edge (Visible)",
+            "request": "launch",
+            "runtimeArgs": [
+                "--remote-debugging-port=9222"
+            ],
+            "url": "http://localhost:8000", // or your dev server address
+            "presentation": {
+                "hidden": false
+            }
+        },
+        {
+            "type": "chrome",
+            "request": "attach",
+            "name": "Attach to browser",
+            "port": 9222,
+            "address": "localhost"
+        },
+        {
+            "type": "pwa-msedge",
+            "name": "Launch Microsoft Edge",
+            "request": "launch",
+            "runtimeArgs": [
+                "--remote-debugging-port=8000"
+            ],
+            "url": "http://localhost:8000/#",
+            "file": "${workspaceFolder}/lite.koboldai.net/index.html",
+            "webRoot": "${workspaceFolder}/lite.koboldai.net",
+            "trace": true
+        },
+        {
+            "type": "vscode-edge-devtools.debug",
+            "name": "Open Edge DevTools",
+            "request": "attach",
+            "url": "c:\\Users\\trincuz\\.vscode\\extensions\\ms-edgedevtools.vscode-edge-devtools-2.1.9\\out\\startpage\\index.html",
+            "presentation": {
+                "hidden": true
+            }
         }
+    ],
+    "compounds": [
+        {
+            "name": "Launch Edge and attach DevTools",
+            "configurations": [
+                "Launch Microsoft Edge",
+                "Open Edge DevTools"
+            ]
+        },
+        {
+            "name": "Launch Edge (Visible) and attach DevTools",
+            "configurations": [
+                "Launch Edge (Visible)",
+                "Open Edge DevTools"
+            ]
+        },
     ]
 }
my_ghost_writer/app.py
CHANGED
@@ -24,10 +24,11 @@ from my_ghost_writer.constants import (ALLOWED_ORIGIN_LIST, API_MODE, DOMAIN, IS
     ME_CONFIG_MONGODB_HEALTHCHECK_SLEEP, ME_CONFIG_MONGODB_USE_OK, PORT, RAPIDAPI_HOST, STATIC_FOLDER,
     STATIC_FOLDER_LITEKOBOLDAINET, WORDSAPI_KEY, WORDSAPI_URL, app_logger)
 from my_ghost_writer.pymongo_utils import mongodb_health_check
-from my_ghost_writer.text_parsers2 import extract_contextual_info_by_indices, process_synonym_groups
+from my_ghost_writer.text_parsers2 import extract_contextual_info_by_indices, process_synonym_groups, find_synonyms_for_phrase
 from my_ghost_writer.thesaurus import get_current_info_wordnet, get_synsets_by_word_and_language
-from my_ghost_writer.type_hints import RequestQueryThesaurusInflatedBody,
-
+from my_ghost_writer.type_hints import (RequestQueryThesaurusInflatedBody, RequestQueryThesaurusWordsapiBody,
+                                        RequestSplitText, RequestTextFrequencyBody, MultiWordSynonymResponse,
+                                        SingleWordSynonymResponse)
 
 
 async def mongo_health_check_background_task():
@@ -261,7 +262,7 @@ def get_thesaurus_wordsapi(body: RequestQueryThesaurusWordsapiBody | str) -> JSONResponse:
     raise HTTPException(status_code=response.status_code, detail=msg)
 
 
-@app.post("/thesaurus-inflated", response_model=SynonymResponse)
+@app.post("/thesaurus-inflated", response_model=SingleWordSynonymResponse)
 async def get_synonyms(body: RequestQueryThesaurusInflatedBody):
     """
     Get contextually appropriate synonyms for a word at specific indices in text.
@@ -360,6 +361,67 @@ async def get_synonyms(body: RequestQueryThesaurusInflatedBody):
     raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
 
 
+@app.post("/thesaurus-inflated-phrase", response_model=MultiWordSynonymResponse)
+async def get_synonyms_for_phrase(body: RequestQueryThesaurusInflatedBody):
+    """
+    Get contextual synonyms for a selected phrase (one or more words).
+    It identifies all meaningful words in the selection and returns
+    synonym groups for each.
+    """
+    app_logger.info(f"body type: {type(body)}!")
+    app_logger.info(f"body: {body}!")
+    t0 = datetime.now()
+    try:
+        body_validated = RequestQueryThesaurusInflatedBody.model_validate_json(body)
+        end = body_validated.end
+        start = body_validated.start
+        text = body_validated.text
+        word = body_validated.word
+    except ValidationError:
+        assert isinstance(body, RequestQueryThesaurusInflatedBody), f"body MUST be of type RequestQueryThesaurusInflatedBody, not '{type(body)}'!"
+        end = body.end
+        start = body.start
+        text = body.text
+        word = body.word
+    app_logger.info(f"end: {end}!")
+    app_logger.info(f"start: {start}!")
+    app_logger.info(f"text: {text}!")
+    app_logger.info(f"word: {word}!")
+
+    try:
+        # The new function in text_parsers2 does all the heavy lifting;
+        # use the locals extracted above so both str and model bodies work
+        results = find_synonyms_for_phrase(
+            text=text,
+            start_idx=start,
+            end_idx=end
+        )
+        t1 = datetime.now()
+        duration = (t1 - t0).total_seconds()
+        app_logger.info(f"got find_synonyms_for_phrase() result in {duration:.3f}s...")
+        app_logger.info(results)
+
+        message = f"Got {len(results)} synonym groups." if results else "No words with synonyms found in the selected phrase."
+
+        t2 = datetime.now()
+        duration = (t2 - t1).total_seconds()
+        app_logger.info(f"prepared the MultiWordSynonymResponse message in {duration:.3f}s...")
+        # Construct the final response using our Pydantic model
+        return MultiWordSynonymResponse(
+            success=True,
+            original_phrase=word,
+            original_indices={"start": start, "end": end},
+            results=results,
+            message=message
+        )
+
+    except HTTPException:
+        # Re-raise known HTTP exceptions to be handled by FastAPI's handler
+        raise
+    except Exception as e:
+        app_logger.error(f"Unexpected error in get_synonyms_for_phrase: '{e}'", exc_info=True)
+        raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
+
+
 @app.exception_handler(HTTPException)
 def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
     origin = request.headers.get("origin")
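For reference, a minimal client sketch for the new /thesaurus-inflated-phrase endpoint above. The base URL and the sample text are assumptions for local testing, not part of this commit; the field names mirror RequestQueryThesaurusInflatedBody as used in the handler:

import requests  # assumption: any HTTP client works; requests is used here for brevity

# "quick brown fox" spans characters 4..19 of the sample text
payload = {
    "text": "The quick brown fox jumps over the lazy dog",
    "word": "quick brown fox",
    "start": 4,
    "end": 19,
}
resp = requests.post("http://localhost:8000/thesaurus-inflated-phrase", json=payload)
# Expected shape (MultiWordSynonymResponse): success, original_phrase,
# original_indices, results (one WordSynonymResult per eligible word), message
print(resp.json())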
my_ghost_writer/constants.py
CHANGED
@@ -32,7 +32,8 @@ ME_CONFIG_MONGODB_TIMEOUT_LOCAL = int(os.getenv("ME_CONFIG_MONGODB_TIMEOUT_LOCAL
 ME_CONFIG_MONGODB_TIMEOUT_REMOTE = int(os.getenv("ME_CONFIG_MONGODB_TIMEOUT_REMOTE", 3000))
 ME_CONFIG_MONGODB_TIMEOUT = int(os.getenv("ME_CONFIG_MONGODB_TIMEOUT", ME_CONFIG_MONGODB_TIMEOUT_LOCAL if ME_CONFIG_MONGODB_URL == ME_CONFIG_MONGODB_URL_LOCAL else ME_CONFIG_MONGODB_TIMEOUT_REMOTE))
 ME_CONFIG_MONGODB_HEALTHCHECK_SLEEP = int(os.getenv("ME_CONFIG_MONGODB_HEALTHCHECK_SLEEP", 900))
-DEFAULT_DBNAME_THESAURUS = "thesaurus"
 DEFAULT_COLLECTION_THESAURUS = os.getenv("DEFAULT_COLLECTION_THESAURUS", "wordsapi")
+DEFAULT_DBNAME_THESAURUS = "thesaurus"
+ELIGIBLE_POS = {'NOUN', 'PROPN', 'VERB', 'ADJ', 'ADV'}
 session_logger.setup_logging(json_logs=LOG_JSON_FORMAT, log_level=LOG_LEVEL)
 app_logger = structlog.stdlib.get_logger(__name__)
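The new ELIGIBLE_POS set drives the token filter used by find_synonyms_for_phrase() in text_parsers2.py below. A small illustrative sketch (the sentence, the model name, and the printed output are assumptions, not taken from this commit):

import spacy
from my_ghost_writer.constants import ELIGIBLE_POS

nlp = spacy.load("en_core_web_sm")  # assumption: any small English model shows the idea
doc = nlp("Harry quickly grabbed the old broom")
# Keep only content words: nouns, proper nouns, verbs, adjectives, adverbs
eligible = [t.text for t in doc
            if t.pos_ in ELIGIBLE_POS and not t.is_stop and not t.is_punct]
print(eligible)  # e.g. ['Harry', 'quickly', 'grabbed', 'old', 'broom']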
my_ghost_writer/text_parsers2.py
CHANGED
@@ -8,7 +8,8 @@ import pyinflect
 from typing import List, Dict, Any, Optional
 from fastapi import HTTPException
 
-from my_ghost_writer.constants import SPACY_MODEL_NAME, app_logger
+from my_ghost_writer.constants import SPACY_MODEL_NAME, app_logger, ELIGIBLE_POS
+from my_ghost_writer.type_hints import WordSynonymResult, ContextInfo, SynonymGroup
 
 
 # Load spaCy model
@@ -23,8 +24,8 @@ except OSError:
 
 # Ensure NLTK data is downloaded
 try:
-    nltk.download('wordnet', quiet=True)
-    nltk.download('omw-1.4', quiet=True)
+    nltk.download('wordnet', quiet=False)
+    nltk.download('omw-1.4', quiet=False)
 except Exception as e:
     app_logger.error(f"Failed to download NLTK data: {e}")
 
@@ -34,6 +35,79 @@ def is_nlp_available() -> bool:
     return nlp is not None
 
 
+# --- NEW: Main function for handling multi-word selections ---
+def find_synonyms_for_phrase(text: str, start_idx: int, end_idx: int) -> List[WordSynonymResult]:
+    """
+    Finds synonyms for all eligible words within a selected text span.
+    It analyzes the span, filters for meaningful words (nouns, verbs, etc.),
+    and returns a list of synonym results for each.
+    """
+    if nlp is None:
+        raise HTTPException(status_code=503, detail="NLP service is unavailable")
+
+    doc = nlp(text)
+    # Use 'expand' to ensure the span covers full tokens even with a partial selection
+    span = doc.char_span(start_idx, end_idx, alignment_mode="expand")
+
+    if span is None:
+        app_logger.warning(f"Could not create a valid token span from indices {start_idx}-{end_idx}.")
+        # Return an empty list if no valid span can be formed; the client can handle this
+        return []
+
+    results: List[WordSynonymResult] = []
+
+    for token in span:
+        # Process only tokens whose POS is eligible for synonym lookup, skipping stop words and punctuation
+        if token.pos_ in ELIGIBLE_POS and not token.is_stop and not token.is_punct:
+            try:
+                # 1. Get context for this specific token
+                context_info_dict = extract_contextual_info_by_indices(
+                    text, token.idx, token.idx + len(token.text), token.text
+                )
+
+                # 2. Get synonym groups using the token's lemma for a better search
+                synonym_groups_list = process_synonym_groups(context_info_dict['lemma'], context_info_dict)
+
+                # 3. If we find synonyms, build the result object for this word
+                if synonym_groups_list:
+                    # Restructure dicts into Pydantic models for type safety
+                    context_info_model = ContextInfo(
+                        pos=context_info_dict['pos'],
+                        sentence=context_info_dict['context_sentence'],
+                        grammatical_form=context_info_dict['tag'],
+                        context_words=context_info_dict['context_words'],
+                        dependency=context_info_dict['dependency']
+                    )
+                    local_start_idx = token.idx - start_idx
+                    local_end_idx = local_start_idx + len(token.text)
+                    sliced_sentence = text[start_idx:end_idx]
+                    sliced_word = sliced_sentence[local_start_idx:local_end_idx]
+                    assert sliced_word == token.text, (
+                        f"Mismatch! sliced_word ('{sliced_word}') != token.text ('{token.text}'), but these substrings should be equal.\n"
+                        f"start_idx: {start_idx}, end_idx: {end_idx}, local_start_idx: {local_start_idx}, local_end_idx: {local_end_idx}."
+                    )
+                    word_result = WordSynonymResult(
+                        original_word=token.text,
+                        original_indices={"start": local_start_idx, "end": local_end_idx},
+                        context_info=context_info_model,
+                        synonym_groups=[SynonymGroup(**sg) for sg in synonym_groups_list],
+                        debug_info={
+                            "spacy_token_indices": {
+                                "start": context_info_dict['char_start'],
+                                "end": context_info_dict['char_end']
+                            },
+                            "lemma": context_info_dict['lemma']
+                        }
+                    )
+                    results.append(word_result)
+
+            except HTTPException as http_ex:
+                app_logger.warning(f"Could not process token '{token.text}': '{http_ex.detail}'")
+            except Exception as ex:
+                app_logger.error(f"Unexpected error processing token '{token.text}': '{ex}'", exc_info=True)
+
+    return results
+
+
 def extract_contextual_info_by_indices(text: str, start_idx: int, end_idx: int, target_word: str) -> Dict[str, Any]:
     """Extract grammatical and contextual information using character indices"""
     if nlp is None:
@@ -43,15 +117,7 @@ def extract_contextual_info_by_indices(text: str, start_idx: int, end_idx: int,
     if start_idx < 0 or end_idx > len(text) or start_idx >= end_idx:
         raise HTTPException(status_code=400, detail="Invalid start/end indices")
 
-    extracted_word = text[start_idx:end_idx].strip()
-    if extracted_word.lower() != target_word.lower():
-        raise HTTPException(
-            status_code=400,
-            detail=f"Word mismatch: expected '{target_word}', got '{extracted_word}'"
-        )
-
     try:
-        # Process the entire text with spaCy
         doc = nlp(text)
 
         # Find the token that corresponds to our character indices
@@ -104,7 +170,7 @@ def extract_contextual_info_by_indices(text: str, start_idx: int, end_idx: int,
         }
 
     except Exception as ex:
-        app_logger.error(f"Error in contextual analysis: {ex}")
+        app_logger.error(f"Error in contextual analysis: {ex}", exc_info=True)
         raise HTTPException(status_code=500, detail=f"Error analyzing context: {str(ex)}")
 
@@ -136,12 +202,15 @@ def get_wordnet_synonyms(word: str, pos_tag: Optional[str] = None) -> List[Dict[
             'pos': synset.pos()
         }
 
+        # Use a set to avoid duplicate synonyms from different lemmas in the same synset
+        unique_synonyms = set()
         for lemma in synset.lemmas():
             synonym = lemma.name().replace('_', ' ')
             if synonym.lower() != word.lower():
-                sense_data['synonyms'].append(synonym)
+                unique_synonyms.add(synonym)
 
-        if sense_data['synonyms']:
+        if unique_synonyms:
+            sense_data['synonyms'] = sorted(list(unique_synonyms))
             synonyms_by_sense.append(sense_data)
 
     return synonyms_by_sense
@@ -161,7 +230,7 @@ def inflect_synonym(synonym: str, original_token_info: Dict[str, Any]) -> str:
 
     # Handle capitalization first using .get() for safety
     if original_token_info.get('is_title'):
-        synonym = synonym.capitalize()
+        synonym = synonym.title()  # .title() is better for multi-word phrases
     elif original_token_info.get('is_upper'):
         synonym = synonym.upper()
     elif original_token_info.get('is_lower', True):  # Default to lower
@@ -181,7 +250,10 @@ def inflect_synonym(synonym: str, original_token_info: Dict[str, Any]) -> str:
             doc = nlp(synonym)
             if doc and len(doc) > 0:
                 inflected = doc[0]._.inflect(tag)
-                return inflected if inflected else synonym
+                if inflected:
+                    # Re-join with the rest of the phrase if it was multi-word
+                    return inflected + synonym[len(doc[0].text):]
+                return synonym  # Return original if inflection fails
 
     except Exception as ex:
         app_logger.warning(f"Inflection error for '{synonym}': '{ex}'")
@@ -195,7 +267,8 @@ def process_synonym_groups(word: str, context_info: Dict[str, Any]) -> List[Dict
     """Process synonym groups with inflection matching"""
     # Get synonyms from wn
     t0 = datetime.now()
-    synonyms_by_sense = get_wordnet_synonyms(word, context_info['pos'])
+    # Get synonyms from wn using the lemma
+    synonyms_by_sense = get_wordnet_synonyms(context_info['lemma'], context_info['pos'])
     t1 = datetime.now()
     duration = (t1 - t0).total_seconds()
     app_logger.info(f"# 1/Got get_wordnet_synonyms result with '{word}' word in {duration:.3f}s.")
@@ -221,7 +294,7 @@ def process_synonym_groups(word: str, context_info: Dict[str, Any]) -> List[Dict
             processed_sense["synonyms"].append({
                 "base_form": base_form,
                 "inflected_form": inflected_form,
-                "matches_context": inflected_form != base_form
+                "matches_context": inflected_form.lower() != base_form.lower()
             })
 
         processed_synonyms.append(processed_sense)
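One detail worth calling out in find_synonyms_for_phrase() is alignment_mode="expand": when the user's selection cuts through a token, spaCy snaps the span outward to whole-token boundaries instead of returning None. A standalone sketch (the sentence is illustrative, and the model name stands in for SPACY_MODEL_NAME, whose value this diff does not show):

import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("The wizard walked slowly")
# Characters 5..12 cover only "izard w", cutting through two tokens
span = doc.char_span(5, 12, alignment_mode="expand")
print(span.text)  # "wizard walked": both partially covered tokens are included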
my_ghost_writer/type_hints.py
CHANGED
@@ -44,7 +44,7 @@ class ContextInfo(BaseModel):
     context_words: list[str]
     dependency: str
 
-class SynonymResponse(BaseModel):
+class SingleWordSynonymResponse(BaseModel):
     success: bool
     original_word: str
     original_indices: dict[str, int]
@@ -53,6 +53,21 @@ class SynonymResponse(BaseModel):
     message: Optional[str] = None
     debug_info: Optional[dict[str, Any]] = None
 
+class WordSynonymResult(BaseModel):
+    original_word: str
+    original_indices: dict[str, int]
+    context_info: ContextInfo
+    synonym_groups: list[SynonymGroup]
+    debug_info: Optional[dict[str, Any]] = None
+
+class MultiWordSynonymResponse(BaseModel):
+    success: bool
+    original_phrase: str
+    original_indices: dict[str, int]
+    results: list[WordSynonymResult]
+    message: Optional[str] = None
+
+
 class HealthCheckResponse(BaseModel):
     success: bool
     status: str
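To make the new models concrete, here is a sketch of how a MultiWordSynonymResponse fits together. Field values are illustrative; it assumes ContextInfo has only the five fields visible in this diff, and synonym_groups is left empty because its entries come from process_synonym_groups() at runtime:

from my_ghost_writer.type_hints import ContextInfo, MultiWordSynonymResponse, WordSynonymResult

word_result = WordSynonymResult(
    original_word="wizard",
    original_indices={"start": 0, "end": 6},  # local to the selected phrase
    context_info=ContextInfo(
        pos="NOUN", sentence="The wizard walked slowly", grammatical_form="NN",
        context_words=["The", "wizard", "walked"], dependency="nsubj",
    ),
    synonym_groups=[],  # populated from process_synonym_groups() in practice
)
response = MultiWordSynonymResponse(
    success=True,
    original_phrase="wizard walked",
    original_indices={"start": 4, "end": 17},  # character offsets in the full text
    results=[word_result],
    message="Got 1 synonym groups.",
)
print(response.model_dump_json(indent=2))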