alessandro trinca tornidor committed on commit 5ffb0e3
Parent(s): a901fdc

feat: multi-words substitution support, backend part

Files changed:
- .vscode/launch.json +71 -1
- my_ghost_writer/app.py +66 -4
- my_ghost_writer/constants.py +2 -1
- my_ghost_writer/text_parsers2.py +91 -18
- my_ghost_writer/type_hints.py +16 -1
.vscode/launch.json CHANGED
@@ -12,7 +12,77 @@
             "env": {
                 "IS_TESTING": "TRUE",
                 "LOG_LEVEL": "DEBUG"
-            }
+            }
+        },
+        {
+            "type": "msedge",
+            "request": "launch",
+            "name": "Launch app port 8000",
+            "url": "http://localhost:8000/#",
+            "runtimeExecutable": "/snap/bin/chromium",
+            "file": "${workspaceFolder}/lite.koboldai.net/index.html",
+            "webRoot": "${workspaceFolder}/lite.koboldai.net",
+            "trace": true,
+            "port": 8000,
+            "runtimeArgs": [
+                "--remote-debugging-port=9222"
+            ]
+        },
+        {
+            "type": "msedge",
+            "name": "Launch Edge (Visible)",
+            "request": "launch",
+            "runtimeArgs": [
+                "--remote-debugging-port=9222"
+            ],
+            "url": "http://localhost:8000", // or your dev server address
+            "presentation": {
+                "hidden": false
+            }
+        },
+        {
+            "type": "chrome",
+            "request": "attach",
+            "name": "Attach to browser",
+            "port": 9222,
+            "address": "localhost"
+        },
+        {
+            "type": "pwa-msedge",
+            "name": "Launch Microsoft Edge",
+            "request": "launch",
+            "runtimeArgs": [
+                "--remote-debugging-port=8000"
+            ],
+            "url": "http://localhost:8000/#",
+            "file": "${workspaceFolder}/lite.koboldai.net/index.html",
+            "webRoot": "${workspaceFolder}/lite.koboldai.net",
+            "trace": true
+        },
+        {
+            "type": "vscode-edge-devtools.debug",
+            "name": "Open Edge DevTools",
+            "request": "attach",
+            "url": "c:\\Users\\trincuz\\.vscode\\extensions\\ms-edgedevtools.vscode-edge-devtools-2.1.9\\out\\startpage\\index.html",
+            "presentation": {
+                "hidden": true
+            }
         }
+    ],
+    "compounds": [
+        {
+            "name": "Launch Edge and attach DevTools",
+            "configurations": [
+                "Launch Microsoft Edge",
+                "Open Edge DevTools"
+            ]
+        },
+        {
+            "name": "Launch Edge (Visible) and attach DevTools",
+            "configurations": [
+                "Launch Edge (Visible)",
+                "Open Edge DevTools"
+            ]
+        },
     ]
 }

my_ghost_writer/app.py CHANGED
@@ -24,10 +24,11 @@ from my_ghost_writer.constants import (ALLOWED_ORIGIN_LIST, API_MODE, DOMAIN, IS
     ME_CONFIG_MONGODB_HEALTHCHECK_SLEEP, ME_CONFIG_MONGODB_USE_OK, PORT, RAPIDAPI_HOST, STATIC_FOLDER,
     STATIC_FOLDER_LITEKOBOLDAINET, WORDSAPI_KEY, WORDSAPI_URL, app_logger)
 from my_ghost_writer.pymongo_utils import mongodb_health_check
-from my_ghost_writer.text_parsers2 import extract_contextual_info_by_indices, process_synonym_groups
+from my_ghost_writer.text_parsers2 import extract_contextual_info_by_indices, process_synonym_groups, find_synonyms_for_phrase
 from my_ghost_writer.thesaurus import get_current_info_wordnet, get_synsets_by_word_and_language
-from my_ghost_writer.type_hints import RequestQueryThesaurusInflatedBody,
-
+from my_ghost_writer.type_hints import (RequestQueryThesaurusInflatedBody, RequestQueryThesaurusWordsapiBody,
+                                        RequestSplitText, RequestTextFrequencyBody, MultiWordSynonymResponse,
+                                        SingleWordSynonymResponse)


 async def mongo_health_check_background_task():
@@ -261,7 +262,7 @@ def get_thesaurus_wordsapi(body: RequestQueryThesaurusWordsapiBody | str) -> JSO
         raise HTTPException(status_code=response.status_code, detail=msg)


-@app.post("/thesaurus-inflated", response_model=SynonymResponse)
+@app.post("/thesaurus-inflated", response_model=SingleWordSynonymResponse)
 async def get_synonyms(body: RequestQueryThesaurusInflatedBody):
     """
     Get contextually appropriate synonyms for a word at specific indices in text.
@@ -360,6 +361,67 @@ async def get_synonyms(body: RequestQueryThesaurusInflatedBody):
         raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")


+@app.post("/thesaurus-inflated-phrase", response_model=MultiWordSynonymResponse)
+async def get_synonyms_for_phrase(body: RequestQueryThesaurusInflatedBody):
+    """
+    Get contextual synonyms for a selected phrase (one or more words).
+    It identifies all meaningful words in the selection and returns
+    synonym groups for each.
+    """
+    app_logger.info(f"body type: {type(body)}!")
+    app_logger.info(f"body: {body}!")
+    t0 = datetime.now()
+    try:
+        body_validated = RequestQueryThesaurusInflatedBody.model_validate_json(body)
+        end = body_validated.end
+        start = body_validated.start
+        text = body_validated.text
+        word = body_validated.word
+    except ValidationError:
+        assert isinstance(body, RequestQueryThesaurusInflatedBody), f"body MUST be of type RequestQueryThesaurusInflatedBody, not '{type(body)}'!"
+        end = body.end
+        start = body.start
+        text = body.text
+        word = body.word
+    app_logger.info(f"end:{end}!")
+    app_logger.info(f"start:{start}!")
+    app_logger.info(f"text:{text}!")
+    app_logger.info(f"word:{word}!")
+
+    try:
+        # The new function in text_parsers2 does all the heavy lifting
+        results = find_synonyms_for_phrase(
+            text=text,
+            start_idx=start,
+            end_idx=end
+        )
+        t1 = datetime.now()
+        duration = (t1 - t0).total_seconds()
+        app_logger.info(f"got find_synonyms_for_phrase() result in: {duration:.3f}s. ...")
+        app_logger.info(results)
+
+        message = f"Got {len(results)} synonym groups." if results else "No words with synonyms found in the selected phrase."
+
+        t2 = datetime.now()
+        duration = (t2 - t1).total_seconds()
+        app_logger.info(f"got MultiWordSynonymResponse() result in: {duration:.3f}s. ...")
+        # Construct the final response using our Pydantic model
+        return MultiWordSynonymResponse(
+            success=True,
+            original_phrase=word,
+            original_indices={"start": start, "end": end},
+            results=results,
+            message=message
+        )
+
+    except HTTPException:
+        # Re-raise known HTTP exceptions to be handled by FastAPI's handler
+        raise
+    except Exception as e:
+        app_logger.error(f"Unexpected error in get_synonyms_for_phrase: '{e}'", exc_info=True)
+        raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
+
+
 @app.exception_handler(HTTPException)
 def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
     origin = request.headers.get("origin")
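
For a quick sanity check of the new endpoint, a minimal client sketch (not part of the commit): it assumes the service runs locally on port 8000 and that the `requests` package is installed; the sample sentence and indices are made up, while the payload fields (`text`, `word`, `start`, `end`) mirror how the handler reads `RequestQueryThesaurusInflatedBody`.

import requests

# Hypothetical selection: the phrase "quick brown fox" inside the sentence
payload = {
    "text": "The quick brown fox jumps over the lazy dog",
    "word": "quick brown fox",
    "start": 4,
    "end": 19,
}

# Host/port are assumptions for a local run; the route comes from the diff above
resp = requests.post("http://localhost:8000/thesaurus-inflated-phrase", json=payload)
resp.raise_for_status()

data = resp.json()
print(data["original_phrase"], "->", data["message"])
for result in data.get("results", []):
    # Each entry mirrors WordSynonymResult: the word plus indices local to the selection
    print(result["original_word"], result["original_indices"])
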
my_ghost_writer/constants.py CHANGED
@@ -32,7 +32,8 @@ ME_CONFIG_MONGODB_TIMEOUT_LOCAL = int(os.getenv("ME_CONFIG_MONGODB_TIMEOUT_LOCAL
 ME_CONFIG_MONGODB_TIMEOUT_REMOTE = int(os.getenv("ME_CONFIG_MONGODB_TIMEOUT_REMOTE", 3000))
 ME_CONFIG_MONGODB_TIMEOUT = int(os.getenv( "ME_CONFIG_MONGODB_TIMEOUT", ME_CONFIG_MONGODB_TIMEOUT_LOCAL if ME_CONFIG_MONGODB_URL == ME_CONFIG_MONGODB_URL_LOCAL else ME_CONFIG_MONGODB_TIMEOUT_REMOTE))
 ME_CONFIG_MONGODB_HEALTHCHECK_SLEEP = int(os.getenv("ME_CONFIG_MONGODB_HEALTHCHECK_SLEEP", 900))
-DEFAULT_DBNAME_THESAURUS = "thesaurus"
 DEFAULT_COLLECTION_THESAURUS =os.getenv("DEFAULT_COLLECTION_THESAURUS", "wordsapi")
+DEFAULT_DBNAME_THESAURUS = "thesaurus"
+ELIGIBLE_POS = {'NOUN', 'PROPN', 'VERB', 'ADJ', 'ADV'}
 session_logger.setup_logging(json_logs=LOG_JSON_FORMAT, log_level=LOG_LEVEL)
 app_logger = structlog.stdlib.get_logger(__name__)
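
The new ELIGIBLE_POS constant is the part-of-speech filter that find_synonyms_for_phrase (below) applies to each token in the selection. As an illustration only, a standalone sketch of the same filtering; it assumes `en_core_web_sm` is installed, whereas the repo loads SPACY_MODEL_NAME, which may name a different model.

import spacy

ELIGIBLE_POS = {'NOUN', 'PROPN', 'VERB', 'ADJ', 'ADV'}

nlp = spacy.load("en_core_web_sm")
doc = nlp("The quick brown fox jumps over the lazy dog")

# Keep only content words: eligible POS, not stop words, not punctuation
eligible = [t.text for t in doc if t.pos_ in ELIGIBLE_POS and not t.is_stop and not t.is_punct]
print(eligible)  # expected roughly: ['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog']
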
my_ghost_writer/text_parsers2.py CHANGED
@@ -8,7 +8,8 @@ import pyinflect
 from typing import List, Dict, Any, Optional
 from fastapi import HTTPException

-from my_ghost_writer.constants import SPACY_MODEL_NAME, app_logger
+from my_ghost_writer.constants import SPACY_MODEL_NAME, app_logger, ELIGIBLE_POS
+from my_ghost_writer.type_hints import WordSynonymResult, ContextInfo, SynonymGroup


 # Load spaCy model
@@ -23,8 +24,8 @@ except OSError:

 # Ensure NLTK data is downloaded
 try:
-    nltk.download('wordnet', quiet=True)
-    nltk.download('omw-1.4', quiet=True)
+    nltk.download('wordnet', quiet=False)
+    nltk.download('omw-1.4', quiet=False)
 except Exception as e:
     app_logger.error(f"Failed to download NLTK data: {e}")

@@ -34,6 +35,79 @@ def is_nlp_available() -> bool:
     return nlp is not None


+# --- NEW: Main function for handling multi-word selections ---
+def find_synonyms_for_phrase(text: str, start_idx: int, end_idx: int) -> List[WordSynonymResult]:
+    """
+    Finds synonyms for all eligible words within a selected text span.
+    It analyzes the span, filters for meaningful words (nouns, verbs, etc.),
+    and returns a list of synonym results for each.
+    """
+    if nlp is None:
+        raise HTTPException(status_code=503, detail="NLP service is unavailable")
+
+    doc = nlp(text)
+    # Use 'expand' to ensure the span covers full tokens even with partial selection
+    span = doc.char_span(start_idx, end_idx, alignment_mode="expand")
+
+    if span is None:
+        app_logger.warning(f"Could not create a valid token span from indices {start_idx}-{end_idx}.")
+        # Return an empty list if no valid span can be formed; the client can handle this
+        return []
+
+    # Collect a result for each eligible token; ELIGIBLE_POS comes from constants
+    results: List[WordSynonymResult] = []
+
+    for token in span:
+        # Process only if the token is an eligible part of speech and not a stop word or punctuation
+        if token.pos_ in ELIGIBLE_POS and not token.is_stop and not token.is_punct:
+            try:
+                # 1. Get context for this specific token
+                context_info_dict = extract_contextual_info_by_indices(
+                    text, token.idx, token.idx + len(token.text), token.text
+                )
+
+                # 2. Get synonym groups using the token's lemma for a better search
+                synonym_groups_list = process_synonym_groups(context_info_dict['lemma'], context_info_dict)
+
+                # 3. If we find synonyms, build the result object for this word
+                if synonym_groups_list:
+                    # Restructure dicts into Pydantic models for type safety
+                    context_info_model = ContextInfo(
+                        pos=context_info_dict['pos'],
+                        sentence=context_info_dict['context_sentence'],
+                        grammatical_form=context_info_dict['tag'],
+                        context_words=context_info_dict['context_words'],
+                        dependency=context_info_dict['dependency']
+                    )
+                    local_start_idx = token.idx - start_idx
+                    local_end_idx = local_start_idx + len(token.text)
+                    sliced_sentence = text[start_idx:end_idx]
+                    sliced_word = sliced_sentence[local_start_idx:local_end_idx]
+                    assert sliced_word == token.text, (f"Mismatch! sliced_word ({sliced_word}) != token.text ({token.text}), but these substrings should be equal.\n"
+                                                       f" start_idx:{start_idx}, end_idx:{end_idx}, local_start_idx:{local_start_idx}, local_end_idx:{local_end_idx}.")
+                    word_result = WordSynonymResult(
+                        original_word=token.text,
+                        original_indices={"start": local_start_idx, "end": local_end_idx},
+                        context_info=context_info_model,
+                        synonym_groups=[SynonymGroup(**sg) for sg in synonym_groups_list],
+                        debug_info={
+                            "spacy_token_indices": {
+                                "start": context_info_dict['char_start'],
+                                "end": context_info_dict['char_end']
+                            },
+                            "lemma": context_info_dict['lemma']
+                        }
+                    )
+                    results.append(word_result)
+
+            except HTTPException as http_ex:
+                app_logger.warning(f"Could not process token '{token.text}': '{http_ex.detail}'")
+            except Exception as ex:
+                app_logger.error(f"Unexpected error processing token '{token.text}': '{ex}'", exc_info=True)
+
+    return results
+
+
 def extract_contextual_info_by_indices(text: str, start_idx: int, end_idx: int, target_word: str) -> Dict[str, Any]:
     """Extract grammatical and contextual information using character indices"""
     if nlp is None:
@@ -43,15 +117,7 @@ def extract_contextual_info_by_indices(text: str, start_idx: int, end_idx: int,
     if start_idx < 0 or end_idx > len(text) or start_idx >= end_idx:
         raise HTTPException(status_code=400, detail="Invalid start/end indices")

-    extracted_word = text[start_idx:end_idx].strip()
-    if extracted_word.lower() != target_word.lower():
-        raise HTTPException(
-            status_code=400,
-            detail=f"Word mismatch: expected '{target_word}', got '{extracted_word}'"
-        )
-
     try:
-        # Process the entire text with spaCy
         doc = nlp(text)

         # Find the token that corresponds to our character indices
@@ -104,7 +170,7 @@ def extract_contextual_info_by_indices(text: str, start_idx: int, end_idx: int,
         }

     except Exception as ex:
-        app_logger.error(f"Error in contextual analysis: {ex}")
+        app_logger.error(f"Error in contextual analysis: {ex}", exc_info=True)
         raise HTTPException(status_code=500, detail=f"Error analyzing context: {str(ex)}")


@@ -136,12 +202,15 @@ def get_wordnet_synonyms(word: str, pos_tag: Optional[str] = None) -> List[Dict[
                 'pos': synset.pos()
             }

+            # Use a set to avoid duplicate synonyms from different lemmas in the same synset
+            unique_synonyms = set()
             for lemma in synset.lemmas():
                 synonym = lemma.name().replace('_', ' ')
                 if synonym.lower() != word.lower():
-                    sense_data['synonyms'].append(synonym)
+                    unique_synonyms.add(synonym)

-            if sense_data['synonyms']:
+            if unique_synonyms:
+                sense_data['synonyms'] = sorted(list(unique_synonyms))
                 synonyms_by_sense.append(sense_data)

     return synonyms_by_sense
@@ -161,7 +230,7 @@ def inflect_synonym(synonym: str, original_token_info: Dict[str, Any]) -> str:

     # Handle capitalization first using .get() for safety
     if original_token_info.get('is_title'):
-        synonym = synonym.capitalize()
+        synonym = synonym.title()  # .title() is better for multi-word phrases
     elif original_token_info.get('is_upper'):
         synonym = synonym.upper()
     elif original_token_info.get('is_lower', True):  # Default to lower
@@ -181,7 +250,10 @@ def inflect_synonym(synonym: str, original_token_info: Dict[str, Any]) -> str:
         doc = nlp(synonym)
         if doc and len(doc) > 0:
             inflected = doc[0]._.inflect(tag)
-            return inflected if inflected else synonym
+            if inflected:
+                # Re-join with the rest of the phrase if it was multi-word
+                return inflected + synonym[len(doc[0].text):]
+            return synonym  # Return original if inflection fails

     except Exception as ex:
         app_logger.warning(f"Inflection error for '{synonym}': '{ex}'")
@@ -195,7 +267,8 @@ def process_synonym_groups(word: str, context_info: Dict[str, Any]) -> List[Dict
     """Process synonym groups with inflection matching"""
     # Get synonyms from wn
     t0 = datetime.now()
-    synonyms_by_sense = get_wordnet_synonyms(word, context_info['pos'])
+    # Get synonyms from wn using the lemma
+    synonyms_by_sense = get_wordnet_synonyms(context_info['lemma'], context_info['pos'])
     t1 = datetime.now()
     duration = (t1 - t0).total_seconds()
     app_logger.info(f"# 1/Got get_wordnet_synonyms result with '{word}' word in {duration:.3f}s.")
@@ -221,7 +294,7 @@ def process_synonym_groups(word: str, context_info: Dict[str, Any]) -> List[Dict
                 processed_sense["synonyms"].append({
                     "base_form": base_form,
                     "inflected_form": inflected_form,
-                    "matches_context": inflected_form != base_form
+                    "matches_context": inflected_form.lower() != base_form.lower()
                 })

             processed_synonyms.append(processed_sense)
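
The pivotal call in find_synonyms_for_phrase is doc.char_span(start_idx, end_idx, alignment_mode="expand"), which snaps a partial character selection outward to whole tokens instead of returning None. A minimal sketch of that behavior, again assuming `en_core_web_sm` stands in for SPACY_MODEL_NAME:

import spacy

nlp = spacy.load("en_core_web_sm")
text = "The quick brown fox jumps"
doc = nlp(text)

# A sloppy selection cutting into two tokens: characters 5-13 are "uick bro"
start_idx, end_idx = 5, 13

# The default strict mode refuses misaligned token boundaries and returns None
print(doc.char_span(start_idx, end_idx))  # None

# "expand" widens the span to full tokens, so partial selections still work
span = doc.char_span(start_idx, end_idx, alignment_mode="expand")
print(span.text)  # quick brown
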
my_ghost_writer/type_hints.py CHANGED
@@ -44,7 +44,7 @@ class ContextInfo(BaseModel):
     context_words: list[str]
     dependency: str

-class SynonymResponse(BaseModel):
+class SingleWordSynonymResponse(BaseModel):
     success: bool
     original_word: str
     original_indices: dict[str, int]
@@ -53,6 +53,21 @@ class SynonymResponse(BaseModel):
     message: Optional[str] = None
     debug_info: Optional[dict[str, Any]] = None

+class WordSynonymResult(BaseModel):
+    original_word: str
+    original_indices: dict[str, int]
+    context_info: ContextInfo
+    synonym_groups: list[SynonymGroup]
+    debug_info: Optional[dict[str, Any]] = None
+
+class MultiWordSynonymResponse(BaseModel):
+    success: bool
+    original_phrase: str
+    original_indices: dict[str, int]
+    results: list[WordSynonymResult]
+    message: Optional[str] = None
+
+
 class HealthCheckResponse(BaseModel):
     success: bool
     status: str