alessandro trinca tornidor committed on commit 5ffb0e3
Parent(s): a901fdc

feat: multi-words substitution support, backend part

Files changed:
- .vscode/launch.json +71 -1
- my_ghost_writer/app.py +66 -4
- my_ghost_writer/constants.py +2 -1
- my_ghost_writer/text_parsers2.py +91 -18
- my_ghost_writer/type_hints.py +16 -1
.vscode/launch.json CHANGED
@@ -12,7 +12,77 @@
             "env": {
                 "IS_TESTING": "TRUE",
                 "LOG_LEVEL": "DEBUG"
-            }
+            }
+        },
+        {
+            "type": "msedge",
+            "request": "launch",
+            "name": "Launch app port 8000",
+            "url": "http://localhost:8000/#",
+            "runtimeExecutable": "/snap/bin/chromium",
+            "file": "${workspaceFolder}/lite.koboldai.net/index.html",
+            "webRoot": "${workspaceFolder}/lite.koboldai.net",
+            "trace": true,
+            "port": 8000,
+            "runtimeArgs": [
+                "--remote-debugging-port=9222"
+            ]
+        },
+        {
+            "type": "msedge",
+            "name": "Launch Edge (Visible)",
+            "request": "launch",
+            "runtimeArgs": [
+                "--remote-debugging-port=9222"
+            ],
+            "url": "http://localhost:8000", // or your dev server address
+            "presentation": {
+                "hidden": false
+            }
+        },
+        {
+            "type": "chrome",
+            "request": "attach",
+            "name": "Attach to browser",
+            "port": 9222,
+            "address": "localhost"
+        },
+        {
+            "type": "pwa-msedge",
+            "name": "Launch Microsoft Edge",
+            "request": "launch",
+            "runtimeArgs": [
+                "--remote-debugging-port=8000"
+            ],
+            "url": "http://localhost:8000/#",
+            "file": "${workspaceFolder}/lite.koboldai.net/index.html",
+            "webRoot": "${workspaceFolder}/lite.koboldai.net",
+            "trace": true
+        },
+        {
+            "type": "vscode-edge-devtools.debug",
+            "name": "Open Edge DevTools",
+            "request": "attach",
+            "url": "c:\\Users\\trincuz\\.vscode\\extensions\\ms-edgedevtools.vscode-edge-devtools-2.1.9\\out\\startpage\\index.html",
+            "presentation": {
+                "hidden": true
+            }
         }
+    ],
+    "compounds": [
+        {
+            "name": "Launch Edge and attach DevTools",
+            "configurations": [
+                "Launch Microsoft Edge",
+                "Open Edge DevTools"
+            ]
+        },
+        {
+            "name": "Launch Edge (Visible) and attach DevTools",
+            "configurations": [
+                "Launch Edge (Visible)",
+                "Open Edge DevTools"
+            ]
+        },
     ]
 }

my_ghost_writer/app.py CHANGED
@@ -24,10 +24,11 @@ from my_ghost_writer.constants import (ALLOWED_ORIGIN_LIST, API_MODE, DOMAIN, IS
     ME_CONFIG_MONGODB_HEALTHCHECK_SLEEP, ME_CONFIG_MONGODB_USE_OK, PORT, RAPIDAPI_HOST, STATIC_FOLDER,
     STATIC_FOLDER_LITEKOBOLDAINET, WORDSAPI_KEY, WORDSAPI_URL, app_logger)
 from my_ghost_writer.pymongo_utils import mongodb_health_check
-from my_ghost_writer.text_parsers2 import extract_contextual_info_by_indices, process_synonym_groups
+from my_ghost_writer.text_parsers2 import extract_contextual_info_by_indices, process_synonym_groups, find_synonyms_for_phrase
 from my_ghost_writer.thesaurus import get_current_info_wordnet, get_synsets_by_word_and_language
-from my_ghost_writer.type_hints import RequestQueryThesaurusInflatedBody,
-
+from my_ghost_writer.type_hints import (RequestQueryThesaurusInflatedBody, RequestQueryThesaurusWordsapiBody,
+                                        RequestSplitText, RequestTextFrequencyBody, MultiWordSynonymResponse,
+                                        SingleWordSynonymResponse)


 async def mongo_health_check_background_task():
@@ -261,7 +262,7 @@ def get_thesaurus_wordsapi(body: RequestQueryThesaurusWordsapiBody | str) -> JSO
         raise HTTPException(status_code=response.status_code, detail=msg)


-@app.post("/thesaurus-inflated", response_model=SynonymResponse)
+@app.post("/thesaurus-inflated", response_model=SingleWordSynonymResponse)
 async def get_synonyms(body: RequestQueryThesaurusInflatedBody):
     """
     Get contextually appropriate synonyms for a word at specific indices in text.
@@ -360,6 +361,67 @@ async def get_synonyms(body: RequestQueryThesaurusInflatedBody):
         raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")


+@app.post("/thesaurus-inflated-phrase", response_model=MultiWordSynonymResponse)
+async def get_synonyms_for_phrase(body: RequestQueryThesaurusInflatedBody):
+    """
+    Get contextual synonyms for a selected phrase (one or more words).
+    It identifies all meaningful words in the selection and returns
+    synonym groups for each.
+    """
+    app_logger.info(f"body type: {type(body)}!")
+    app_logger.info(f"body: {body}!")
+    t0 = datetime.now()
+    try:
+        body_validated = RequestQueryThesaurusInflatedBody.model_validate_json(body)
+        end = body_validated.end
+        start = body_validated.start
+        text = body_validated.text
+        word = body_validated.word
+    except ValidationError:
+        assert isinstance(body, RequestQueryThesaurusInflatedBody), f"body MUST be of type RequestQueryThesaurusInflatedBody, not '{type(body)}'!"
+        end = body.end
+        start = body.start
+        text = body.text
+        word = body.word
+    app_logger.info(f"end:{end}!")
+    app_logger.info(f"start:{start}!")
+    app_logger.info(f"text:{text}!")
+    app_logger.info(f"word:{word}!")
+
+    try:
+        # The new function in text_parsers2 does all the heavy lifting
+        results = find_synonyms_for_phrase(
+            text=text,
+            start_idx=start,
+            end_idx=end
+        )
+        t1 = datetime.now()
+        duration = (t1 - t0).total_seconds()
+        app_logger.info(f"got find_synonyms_for_phrase() result in: {duration:.3f}s. ...")
+        app_logger.info(results)
+
+        message = f"Got {len(results)} synonym groups." if results else "No words with synonyms found in the selected phrase."
+
+        t2 = datetime.now()
+        duration = (t2 - t1).total_seconds()
+        app_logger.info(f"got MultiWordSynonymResponse() result in: {duration:.3f}s. ...")
+        # Construct the final response using our Pydantic model
+        return MultiWordSynonymResponse(
+            success=True,
+            original_phrase=word,
+            original_indices={"start": start, "end": end},
+            results=results,
+            message=message
+        )
+
+    except HTTPException:
+        # Re-raise known HTTP exceptions to be handled by FastAPI's handler
+        raise
+    except Exception as e:
+        app_logger.error(f"Unexpected error in get_synonyms_for_phrase: '{e}'", exc_info=True)
+        raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
+
+
 @app.exception_handler(HTTPException)
 def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
     origin = request.headers.get("origin")
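
For a quick sanity check of the new endpoint, a minimal client sketch (not part of the commit): it assumes the service runs locally on port 8000 and that the `requests` package is installed; the sample sentence and indices are made up, while the payload fields (`text`, `word`, `start`, `end`) mirror how the handler reads `RequestQueryThesaurusInflatedBody`.

import requests

# Hypothetical selection: the phrase "quick brown fox" inside the sentence
payload = {
    "text": "The quick brown fox jumps over the lazy dog",
    "word": "quick brown fox",
    "start": 4,
    "end": 19,
}

# Host/port are assumptions for a local run; the route comes from the diff above
resp = requests.post("http://localhost:8000/thesaurus-inflated-phrase", json=payload)
resp.raise_for_status()

data = resp.json()
print(data["original_phrase"], "->", data["message"])
for result in data.get("results", []):
    # Each entry mirrors WordSynonymResult: the word plus indices local to the selection
    print(result["original_word"], result["original_indices"])
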
my_ghost_writer/constants.py CHANGED
@@ -32,7 +32,8 @@ ME_CONFIG_MONGODB_TIMEOUT_LOCAL = int(os.getenv("ME_CONFIG_MONGODB_TIMEOUT_LOCAL
 ME_CONFIG_MONGODB_TIMEOUT_REMOTE = int(os.getenv("ME_CONFIG_MONGODB_TIMEOUT_REMOTE", 3000))
 ME_CONFIG_MONGODB_TIMEOUT = int(os.getenv( "ME_CONFIG_MONGODB_TIMEOUT", ME_CONFIG_MONGODB_TIMEOUT_LOCAL if ME_CONFIG_MONGODB_URL == ME_CONFIG_MONGODB_URL_LOCAL else ME_CONFIG_MONGODB_TIMEOUT_REMOTE))
 ME_CONFIG_MONGODB_HEALTHCHECK_SLEEP = int(os.getenv("ME_CONFIG_MONGODB_HEALTHCHECK_SLEEP", 900))
-DEFAULT_DBNAME_THESAURUS = "thesaurus"
 DEFAULT_COLLECTION_THESAURUS =os.getenv("DEFAULT_COLLECTION_THESAURUS", "wordsapi")
+DEFAULT_DBNAME_THESAURUS = "thesaurus"
+ELIGIBLE_POS = {'NOUN', 'PROPN', 'VERB', 'ADJ', 'ADV'}
 session_logger.setup_logging(json_logs=LOG_JSON_FORMAT, log_level=LOG_LEVEL)
 app_logger = structlog.stdlib.get_logger(__name__)
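
The new ELIGIBLE_POS constant is the part-of-speech filter that find_synonyms_for_phrase (below) applies to each token in the selection. As an illustration only, a standalone sketch of the same filtering; it assumes `en_core_web_sm` is installed, whereas the repo loads SPACY_MODEL_NAME, which may name a different model.

import spacy

ELIGIBLE_POS = {'NOUN', 'PROPN', 'VERB', 'ADJ', 'ADV'}

nlp = spacy.load("en_core_web_sm")
doc = nlp("The quick brown fox jumps over the lazy dog")

# Keep only content words: eligible POS, not stop words, not punctuation
eligible = [t.text for t in doc if t.pos_ in ELIGIBLE_POS and not t.is_stop and not t.is_punct]
print(eligible)  # expected roughly: ['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog']
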
my_ghost_writer/text_parsers2.py CHANGED
@@ -8,7 +8,8 @@ import pyinflect
 from typing import List, Dict, Any, Optional
 from fastapi import HTTPException

-from my_ghost_writer.constants import SPACY_MODEL_NAME, app_logger
+from my_ghost_writer.constants import SPACY_MODEL_NAME, app_logger, ELIGIBLE_POS
+from my_ghost_writer.type_hints import WordSynonymResult, ContextInfo, SynonymGroup


 # Load spaCy model
@@ -23,8 +24,8 @@ except OSError:

 # Ensure NLTK data is downloaded
 try:
-    nltk.download('wordnet', quiet=True)
-    nltk.download('omw-1.4', quiet=True)
+    nltk.download('wordnet', quiet=False)
+    nltk.download('omw-1.4', quiet=False)
 except Exception as e:
     app_logger.error(f"Failed to download NLTK data: {e}")

@@ -34,6 +35,79 @@ def is_nlp_available() -> bool:
     return nlp is not None


+# --- NEW: Main function for handling multi-word selections ---
+def find_synonyms_for_phrase(text: str, start_idx: int, end_idx: int) -> List[WordSynonymResult]:
+    """
+    Finds synonyms for all eligible words within a selected text span.
+    It analyzes the span, filters for meaningful words (nouns, verbs, etc.),
+    and returns a list of synonym results for each.
+    """
+    if nlp is None:
+        raise HTTPException(status_code=503, detail="NLP service is unavailable")
+
+    doc = nlp(text)
+    # Use 'expand' to ensure the span covers full tokens even with partial selection
+    span = doc.char_span(start_idx, end_idx, alignment_mode="expand")
+
+    if span is None:
+        app_logger.warning(f"Could not create a valid token span from indices {start_idx}-{end_idx}.")
+        # Return an empty list if no valid span can be formed; the client can handle this
+        return []
+
+    # Collect a result for each eligible token; ELIGIBLE_POS comes from constants
+    results: List[WordSynonymResult] = []
+
+    for token in span:
+        # Process only if the token is an eligible part of speech and not a stop word or punctuation
+        if token.pos_ in ELIGIBLE_POS and not token.is_stop and not token.is_punct:
+            try:
+                # 1. Get context for this specific token
+                context_info_dict = extract_contextual_info_by_indices(
+                    text, token.idx, token.idx + len(token.text), token.text
+                )
+
+                # 2. Get synonym groups using the token's lemma for a better search
+                synonym_groups_list = process_synonym_groups(context_info_dict['lemma'], context_info_dict)
+
+                # 3. If we find synonyms, build the result object for this word
+                if synonym_groups_list:
+                    # Restructure dicts into Pydantic models for type safety
+                    context_info_model = ContextInfo(
+                        pos=context_info_dict['pos'],
+                        sentence=context_info_dict['context_sentence'],
+                        grammatical_form=context_info_dict['tag'],
+                        context_words=context_info_dict['context_words'],
+                        dependency=context_info_dict['dependency']
+                    )
+                    local_start_idx = token.idx - start_idx
+                    local_end_idx = local_start_idx + len(token.text)
+                    sliced_sentence = text[start_idx:end_idx]
+                    sliced_word = sliced_sentence[local_start_idx:local_end_idx]
+                    assert sliced_word == token.text, (f"Mismatch! sliced_word ({sliced_word}) != token.text ({token.text}), but these substrings should be equal.\n"
+                                                       f" start_idx:{start_idx}, end_idx:{end_idx}, local_start_idx:{local_start_idx}, local_end_idx:{local_end_idx}.")
+                    word_result = WordSynonymResult(
+                        original_word=token.text,
+                        original_indices={"start": local_start_idx, "end": local_end_idx},
+                        context_info=context_info_model,
+                        synonym_groups=[SynonymGroup(**sg) for sg in synonym_groups_list],
+                        debug_info={
+                            "spacy_token_indices": {
+                                "start": context_info_dict['char_start'],
+                                "end": context_info_dict['char_end']
+                            },
+                            "lemma": context_info_dict['lemma']
+                        }
+                    )
+                    results.append(word_result)
+
+            except HTTPException as http_ex:
+                app_logger.warning(f"Could not process token '{token.text}': '{http_ex.detail}'")
+            except Exception as ex:
+                app_logger.error(f"Unexpected error processing token '{token.text}': '{ex}'", exc_info=True)
+
+    return results
+
+
 def extract_contextual_info_by_indices(text: str, start_idx: int, end_idx: int, target_word: str) -> Dict[str, Any]:
     """Extract grammatical and contextual information using character indices"""
     if nlp is None:
@@ -43,15 +117,7 @@ def extract_contextual_info_by_indices(text: str, start_idx: int, end_idx: int,
     if start_idx < 0 or end_idx > len(text) or start_idx >= end_idx:
         raise HTTPException(status_code=400, detail="Invalid start/end indices")

-    extracted_word = text[start_idx:end_idx].strip()
-    if extracted_word.lower() != target_word.lower():
-        raise HTTPException(
-            status_code=400,
-            detail=f"Word mismatch: expected '{target_word}', got '{extracted_word}'"
-        )
-
     try:
-        # Process the entire text with spaCy
         doc = nlp(text)

         # Find the token that corresponds to our character indices
@@ -104,7 +170,7 @@ def extract_contextual_info_by_indices(text: str, start_idx: int, end_idx: int,
         }

     except Exception as ex:
-        app_logger.error(f"Error in contextual analysis: {ex}")
+        app_logger.error(f"Error in contextual analysis: {ex}", exc_info=True)
         raise HTTPException(status_code=500, detail=f"Error analyzing context: {str(ex)}")


@@ -136,12 +202,15 @@ def get_wordnet_synonyms(word: str, pos_tag: Optional[str] = None) -> List[Dict[
                 'pos': synset.pos()
             }

+            # Use a set to avoid duplicate synonyms from different lemmas in the same synset
+            unique_synonyms = set()
             for lemma in synset.lemmas():
                 synonym = lemma.name().replace('_', ' ')
                 if synonym.lower() != word.lower():
-                    sense_data['synonyms'].append(synonym)
+                    unique_synonyms.add(synonym)

-            if sense_data['synonyms']:
+            if unique_synonyms:
+                sense_data['synonyms'] = sorted(list(unique_synonyms))
                 synonyms_by_sense.append(sense_data)

     return synonyms_by_sense
@@ -161,7 +230,7 @@ def inflect_synonym(synonym: str, original_token_info: Dict[str, Any]) -> str:

     # Handle capitalization first using .get() for safety
     if original_token_info.get('is_title'):
-        synonym = synonym.capitalize()
+        synonym = synonym.title()  # .title() is better for multi-word phrases
     elif original_token_info.get('is_upper'):
         synonym = synonym.upper()
     elif original_token_info.get('is_lower', True):  # Default to lower
@@ -181,7 +250,10 @@ def inflect_synonym(synonym: str, original_token_info: Dict[str, Any]) -> str:
         doc = nlp(synonym)
         if doc and len(doc) > 0:
             inflected = doc[0]._.inflect(tag)
-            return inflected if inflected else synonym
+            if inflected:
+                # Re-join with the rest of the phrase if it was multi-word
+                return inflected + synonym[len(doc[0].text):]
+            return synonym  # Return original if inflection fails

     except Exception as ex:
         app_logger.warning(f"Inflection error for '{synonym}': '{ex}'")
@@ -195,7 +267,8 @@ def process_synonym_groups(word: str, context_info: Dict[str, Any]) -> List[Dict
     """Process synonym groups with inflection matching"""
     # Get synonyms from wn
     t0 = datetime.now()
-    synonyms_by_sense = get_wordnet_synonyms(word, context_info['pos'])
+    # Get synonyms from wn using the lemma
+    synonyms_by_sense = get_wordnet_synonyms(context_info['lemma'], context_info['pos'])
     t1 = datetime.now()
     duration = (t1 - t0).total_seconds()
     app_logger.info(f"# 1/Got get_wordnet_synonyms result with '{word}' word in {duration:.3f}s.")
@@ -221,7 +294,7 @@ def process_synonym_groups(word: str, context_info: Dict[str, Any]) -> List[Dict
                 processed_sense["synonyms"].append({
                     "base_form": base_form,
                     "inflected_form": inflected_form,
-                    "matches_context": inflected_form != base_form
+                    "matches_context": inflected_form.lower() != base_form.lower()
                 })

             processed_synonyms.append(processed_sense)
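
The pivotal call in find_synonyms_for_phrase is doc.char_span(start_idx, end_idx, alignment_mode="expand"), which snaps a partial character selection outward to whole tokens instead of returning None. A minimal sketch of that behavior, again assuming `en_core_web_sm` stands in for SPACY_MODEL_NAME:

import spacy

nlp = spacy.load("en_core_web_sm")
text = "The quick brown fox jumps"
doc = nlp(text)

# A sloppy selection cutting into two tokens: characters 5-13 are "uick bro"
start_idx, end_idx = 5, 13

# The default strict mode refuses misaligned token boundaries and returns None
print(doc.char_span(start_idx, end_idx))  # None

# "expand" widens the span to full tokens, so partial selections still work
span = doc.char_span(start_idx, end_idx, alignment_mode="expand")
print(span.text)  # quick brown
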
my_ghost_writer/type_hints.py CHANGED
@@ -44,7 +44,7 @@ class ContextInfo(BaseModel):
     context_words: list[str]
     dependency: str

-class SynonymResponse(BaseModel):
+class SingleWordSynonymResponse(BaseModel):
     success: bool
     original_word: str
     original_indices: dict[str, int]
@@ -53,6 +53,21 @@ class SynonymResponse(BaseModel):
     message: Optional[str] = None
     debug_info: Optional[dict[str, Any]] = None

+class WordSynonymResult(BaseModel):
+    original_word: str
+    original_indices: dict[str, int]
+    context_info: ContextInfo
+    synonym_groups: list[SynonymGroup]
+    debug_info: Optional[dict[str, Any]] = None
+
+class MultiWordSynonymResponse(BaseModel):
+    success: bool
+    original_phrase: str
+    original_indices: dict[str, int]
+    results: list[WordSynonymResult]
+    message: Optional[str] = None
+
+
 class HealthCheckResponse(BaseModel):
     success: bool
     status: str