from datetime import datetime
from typing import Any, Optional

import nltk
# pyinflect must be imported so spaCy tokens gain the ._.inflect extension used by inflect_synonym()
import pyinflect  # noqa: F401
import spacy
from fastapi import HTTPException
from nltk.corpus.reader import Synset

from my_ghost_writer.constants import ELIGIBLE_POS, NLTK_DATA, SPACY_MODEL_NAME, app_logger
from my_ghost_writer.custom_synonym_handler import CustomSynonymHandler
from my_ghost_writer.thesaurus import wn
from my_ghost_writer.type_hints import ContextInfo, RelatedWordGroup, RelatedWordOption, RelatedWordWordResult, \
    TermRelationships

custom_synonym_handler = CustomSynonymHandler()

# Load the spaCy model once at import time; the functions below degrade gracefully if it is missing
nlp = None
try:
    nlp = spacy.load(SPACY_MODEL_NAME)
    app_logger.info(f"spaCy model '{SPACY_MODEL_NAME}' loaded; type: '{type(nlp)}'")
except (OSError, IOError) as io_ex:
    app_logger.error(io_ex)
    app_logger.error(
        f"spaCy model '{SPACY_MODEL_NAME}' not found. Please install it with: 'python -m spacy download {SPACY_MODEL_NAME}'"
    )

# Ensure NLTK data is downloaded
try:
    app_logger.info(f"Downloading NLTK data to the folder: '{NLTK_DATA}'")
    nltk.download('punkt_tab', quiet=False, download_dir=NLTK_DATA)
    nltk.download('wordnet', quiet=False, download_dir=NLTK_DATA)
    nltk.download('wordnet31', quiet=False, download_dir=NLTK_DATA)
except Exception as e:
    app_logger.error(f"Failed to download NLTK data: {e}")


def is_nlp_available() -> bool:
    """Check if the spaCy model is available"""
    return nlp is not None


def find_synonyms_for_phrase(text: str, start_idx: int, end_idx: int) -> list[RelatedWordWordResult]:
    """
    Finds related words for all eligible words within a selected text span.
    It analyzes the span, filters for meaningful words (nouns, verbs, etc.),
    and returns a list of related word results for each.

    Raises:
        HTTPException: If the spaCy model is unavailable.

    Args:
        text: The input text (str).
        start_idx: The start index of the phrase within the text (int).
        end_idx: The end index of the phrase within the text (int).

    Returns:
        A list of RelatedWordWordResult objects, representing the related words for each eligible word
        (list[RelatedWordWordResult]).
    """
    if nlp is None:
        app_logger.error(
            f"spaCy model '{SPACY_MODEL_NAME}' not found. Please install it with: 'python -m spacy download {SPACY_MODEL_NAME}'"
        )
        raise HTTPException(status_code=503, detail="NLP service is unavailable")
    doc = nlp(text)

    # Use 'expand' to ensure the span covers full tokens even with a partial selection
    span = doc.char_span(start_idx, end_idx, alignment_mode="expand")
    if span is None:
        app_logger.warning(f"Could not create a valid token span from indices {start_idx}-{end_idx}.")
        # Return an empty list if no valid span can be formed; the client can handle this
        return []

    results: list[RelatedWordWordResult] = []
    for token in span:
        # Process only tokens with an eligible part of speech that are neither stop words nor punctuation
        if token.pos_ in ELIGIBLE_POS and not token.is_stop and not token.is_punct:
            try:
                # 1. Get context for this specific token
                context_info_dict = extract_contextual_info_by_indices(
                    text, token.idx, token.idx + len(token.text), token.text
                )
                # 2. Get related word groups using the token's lemma for a better search
                related_word_groups_list = process_synonym_groups(context_info_dict["lemma"], context_info_dict)

                # 3. If we found related words, build the result object for this word
                if related_word_groups_list:
                    # Restructure dicts into Pydantic models for type safety
                    context_info_model = ContextInfo(
                        pos=context_info_dict["pos"],
                        sentence=context_info_dict["context_sentence"],
                        grammatical_form=context_info_dict["tag"],
                        context_words=context_info_dict["context_words"],
                        dependency=context_info_dict["dependency"],
                    )
                    local_start_idx = token.idx - start_idx
                    local_end_idx = local_start_idx + len(token.text)
                    sliced_sentence = text[start_idx:end_idx]
                    sliced_word = sliced_sentence[local_start_idx:local_end_idx]
                    assert sliced_word == token.text, (
                        f"Mismatch! sliced_word ({sliced_word}) != token.text ({token.text}), but these substrings should be equal.\n"
                        f" start_idx: {start_idx}, end_idx: {end_idx}, local_start_idx: {local_start_idx}, local_end_idx: {local_end_idx}."
                    )
                    word_result = RelatedWordWordResult(
                        original_word=token.text,
                        original_indices={"start": local_start_idx, "end": local_end_idx},
                        context_info=context_info_model,
                        related_word_groups=related_word_groups_list,
                        debug_info={
                            "spacy_token_indices": {
                                "start": context_info_dict["char_start"],
                                "end": context_info_dict["char_end"],
                            },
                            "lemma": context_info_dict["lemma"]
                        }
                    )
                    results.append(word_result)
            except HTTPException as http_ex:
                app_logger.warning(f"Could not process token '{token.text}': '{http_ex.detail}'")
            except Exception as synonym_ex:
                app_logger.error(f"Unexpected error processing token '{token.text}': '{synonym_ex}'", exc_info=True)
    return results
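
# Minimal usage sketch for find_synonyms_for_phrase, kept as an uncalled helper so
# importing this module has no side effects; the sample text and indices are
# illustrative assumptions, not part of the API
def _example_find_synonyms_for_phrase() -> None:
    text = "The quick brown fox jumps over the lazy dog"
    # Select the single word "quick" (characters 4-9)
    for word_result in find_synonyms_for_phrase(text, start_idx=4, end_idx=9):
        print(word_result.original_word, len(word_result.related_word_groups))
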
def extract_contextual_info_by_indices(text: str, start_idx: int, end_idx: int, target_word: str) -> dict[str, Any]:
    """
    Extract grammatical and contextual information using character indices.

    Raises:
        HTTPException: If the spaCy model is unavailable or if the indices are invalid.

    Args:
        text: The input text (str).
        start_idx: The start index of the word within the text (int).
        end_idx: The end index of the word within the text (int).
        target_word: The target word (str).

    Returns:
        A dictionary containing contextual information about the word (dict[str, Any]).
    """
    if nlp is None:
        raise HTTPException(status_code=500, detail="spaCy model not available")

    # Verify the indices are plausible before running the pipeline
    if start_idx < 0 or end_idx > len(text) or start_idx >= end_idx:
        raise HTTPException(status_code=400, detail="Invalid start/end indices")

    try:
        doc = nlp(text)

        # Find the token that overlaps with our target character indices
        target_token = None
        for token in doc:
            if (token.idx <= start_idx < token.idx + len(token.text) or
                    start_idx <= token.idx < end_idx):
                target_token = token
                break

        # If no token was found, or the found token doesn't match the expected word,
        # report the mismatch to the caller
        if target_token is None or target_token.text != target_word:
            raise HTTPException(
                status_code=400,
                detail=f"Could not find token for word '{target_word}' at indices {start_idx}-{end_idx}"
            )

        # Extract the surrounding context (±5 words)
        sentence_tokens = [t for t in target_token.sent if not t.is_space]
        target_position_in_sentence = None
        for i, token in enumerate(sentence_tokens):
            if token == target_token:
                target_position_in_sentence = i
                break

        # Get the context window; compare against None explicitly so that position 0
        # (the first token of the sentence) still gets a bounded window
        if target_position_in_sentence is not None:
            context_start = max(0, target_position_in_sentence - 5)
            context_end = min(len(sentence_tokens), target_position_in_sentence + 6)
        else:
            context_start = 0
            context_end = len(sentence_tokens)
        context_words = [t.text for t in sentence_tokens[context_start:context_end]]

        return {
            "word": target_token.text,
            "lemma": target_token.lemma_,
            "pos": target_token.pos_,
            "tag": target_token.tag_,
            "is_title": target_token.is_title,
            "is_upper": target_token.is_upper,
            "is_lower": target_token.is_lower,
            "dependency": target_token.dep_,
            "context_sentence": target_token.sent.text,
            "context_words": context_words,
            "sentence_position": target_position_in_sentence,
            "char_start": target_token.idx,
            "char_end": target_token.idx + len(target_token.text),
            "original_indices": {"start": start_idx, "end": end_idx},
        }
    except HTTPException:
        # Preserve the intentional 400 responses above instead of converting them to 500
        raise
    except Exception as indices_ex:
        app_logger.error(f"Error in contextual analysis: {indices_ex}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"Error analyzing context: {str(indices_ex)}")
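
# Minimal usage sketch for extract_contextual_info_by_indices (uncalled; the sample
# sentence, indices, and expected output are illustrative assumptions)
def _example_extract_contextual_info() -> None:
    text = "She wrote a short story"
    # "wrote" spans characters 4-9
    info = extract_contextual_info_by_indices(text, 4, 9, "wrote")
    print(info["lemma"], info["pos"], info["tag"])  # expected: write VERB VBD
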
def get_wordnet_synonyms(word: str, pos_tag: Optional[str] = None) -> list[dict[str, Any]]:
    """
    Gets related words from WordNet and the custom synonym handler, returning a list
    of dictionaries containing the raw data, grouped by relation type.

    Args:
        word: The word to get related words for (str).
        pos_tag: An optional part-of-speech tag to filter WordNet results (Optional[str]).

    Returns:
        A list of dictionaries, where each dictionary represents a group of related words (list[dict[str, Any]]).
    """
    related_word_groups_raw: list[dict[str, Any]] = []
    word_lower = word.lower()

    # 1. Custom related word lookup (all relationship types)
    _extract_related_word_groups_custom(related_word_groups_raw, word_lower)

    # 2. WordNet lookup
    try:
        # Map spaCy POS tags to WordNet POS constants
        pos_map = {
            "NOUN": wn.NOUN,
            "VERB": wn.VERB,
            "ADJ": wn.ADJ,
            "ADV": wn.ADV,
        }

        # Get all synsets for the word
        synsets = wn.synsets(word)

        # Filter by POS if provided
        if pos_tag and pos_tag in pos_map:
            synsets = [s for s in synsets if s.pos() == pos_map[pos_tag]]

        # Process each synset and its relations
        for synset in synsets:
            result = _get_related_words(synset, TermRelationships.SYNONYM, word_lower)
            related_word_groups_raw.append(result)
            for lemma in synset.lemmas():
                result = _get_related_words(lemma, TermRelationships.ANTONYM, word_lower)
                related_word_groups_raw.append(result)
            for rel_type in [
                TermRelationships.HYPERNYM,
                TermRelationships.HYPONYM,
                TermRelationships.MERONYM,
                TermRelationships.HOLONYM,
                TermRelationships.ALSO_SEE,
                TermRelationships.CAUSE,
                # todo: DERIVATIONALLY_RELATED_FORM, ENTAILMENT and PERTAINYM are still
                #  disabled; see the note inside _get_related_words below
                # TermRelationships.DERIVATIONALLY_RELATED_FORM,
                # TermRelationships.ENTAILMENT,
                # TermRelationships.PERTAINYM,
                TermRelationships.SIMILAR_TO
            ]:
                result = _get_related_words(synset, rel_type, word_lower)
                related_word_groups_raw.append(result)
    except Exception as ex1:
        app_logger.error(
            f"Error getting wn synonyms: '{ex1}' with: word: {type(word)}, '{word}', pos_tag: {type(pos_tag)}, '{pos_tag}'")
        raise HTTPException(status_code=500, detail=f"Error retrieving related words: '{str(ex1)}'")

    return [related_words for related_words in related_word_groups_raw if related_words is not None]
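
# Minimal usage sketch for get_wordnet_synonyms (uncalled); the word and POS tag are
# illustrative assumptions. Each returned group is a dict shaped like the ones built
# in _extract_related_word_groups_custom() and _get_related_words() below, e.g.
# {"relation_type": ..., "source": "wordnet", "definition": ..., "examples": [...],
#  "wordnet_pos": ..., "related_words": [{"base_form": ...}, ...]}
def _example_get_wordnet_synonyms() -> None:
    for group in get_wordnet_synonyms("happy", pos_tag="ADJ"):
        print(group["relation_type"], group["wordnet_pos"],
              [rw["base_form"] for rw in group["related_words"]])
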
def _extract_related_word_groups_custom(related_word_groups_raw: list[dict[str, Any]], word_lower: str) -> None:
    for rel_type in TermRelationships:
        custom_groups = custom_synonym_handler.get_related(word_lower, rel_type)
        if custom_groups:
            for related in custom_groups:
                words = related["words"]
                definition = related.get("definition", "")
                related_word_options = []
                for word_from_related_words in words:
                    related_word_options.append({
                        "base_form": word_from_related_words,
                        "is_custom": True,
                        "definition": definition,
                    })
                related_word_groups_raw.append({
                    "relation_type": rel_type,
                    "source": "custom",
                    "definition": definition,
                    "examples": [],
                    "wordnet_pos": None,
                    "related_words": related_word_options,
                })


def _get_base_form_by_synset_type(local_lemma: str, inner_word_lower: str, related_words: list[dict]) -> list[dict]:
    # local_lemma is a lemma *name* string (e.g. "ice_cream"); skip the word itself
    lemma_name = local_lemma.replace("_", " ")
    if lemma_name.lower() != inner_word_lower:
        related_words.append({
            "base_form": lemma_name
        })
    return related_words


def _get_related_words(related_object, relation_type: TermRelationships, inner_word_lower: str) -> dict | None:
    related_words = []
    if relation_type == TermRelationships.SYNONYM:
        # related_object is a Synset
        for local_lemma in related_object.lemmas():
            _get_base_form_by_synset_type(local_lemma.name(), inner_word_lower, related_words)
    elif relation_type == TermRelationships.ANTONYM:
        # related_object is a Lemma
        for ant in related_object.antonyms():
            _get_base_form_by_synset_type(ant.name(), inner_word_lower, related_words)
    else:
        # related_object is a Synset: get related synsets from the appropriate method
        relation_methods = {
            TermRelationships.HYPERNYM: related_object.hypernyms,
            TermRelationships.HYPONYM: related_object.hyponyms,
            TermRelationships.MERONYM: lambda: related_object.member_meronyms() + related_object.substance_meronyms() + related_object.part_meronyms(),
            TermRelationships.HOLONYM: lambda: related_object.member_holonyms() + related_object.substance_holonyms() + related_object.part_holonyms(),
            TermRelationships.ALSO_SEE: related_object.also_sees,
            TermRelationships.CAUSE: related_object.causes,
            # The three relations below stay disabled: in NLTK, derivationally_related_forms()
            # and pertainyms() are Lemma methods rather than Synset methods, and the
            # entailment accessor on a Synset is named entailments(); a possible
            # workaround is sketched after this function
            # TermRelationships.DERIVATIONALLY_RELATED_FORM: related_object.derivationally_related_forms,
            # TermRelationships.ENTAILMENT: related_object.entails,
            # TermRelationships.PERTAINYM: related_object.pertainyms,
            TermRelationships.SIMILAR_TO: related_object.similar_tos,
        }
        get_words_fn = relation_methods.get(relation_type)
        if get_words_fn:
            for related_synset in get_words_fn():
                _extract_lemmas_or_names_from_synset(inner_word_lower, related_synset, related_words)

    if related_words:
        return {
            "relation_type": relation_type,
            "source": "wordnet",
            "definition": _get_related_object_definition(related_object),
            "examples": _get_related_object_examples(related_object),
            "wordnet_pos": _get_related_wordnet_pos(related_object),
            "related_words": related_words,
        }
    return None
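
# A hedged sketch of how the disabled relations above could be wired up: use the
# Synset method entailments() for ENTAILMENT, and iterate over the synset's lemmas
# for the Lemma-level accessors. This is an untested assumption, intentionally kept
# out of the main relation_methods table
def _get_lemma_level_related_words(synset, relation_type: TermRelationships, inner_word_lower: str) -> list[dict]:
    related_words: list[dict] = []
    if relation_type == TermRelationships.ENTAILMENT:
        # entailments() is the Synset-level accessor in NLTK
        for entailed_synset in synset.entailments():
            _extract_lemmas_or_names_from_synset(inner_word_lower, entailed_synset, related_words)
        return related_words
    lemma_accessors = {
        TermRelationships.DERIVATIONALLY_RELATED_FORM: "derivationally_related_forms",
        TermRelationships.PERTAINYM: "pertainyms",
    }
    accessor_name = lemma_accessors.get(relation_type)
    if accessor_name:
        # These accessors live on Lemma objects, so walk the synset's lemmas
        for lemma in synset.lemmas():
            for related_lemma in getattr(lemma, accessor_name)():
                _get_base_form_by_synset_type(related_lemma.name(), inner_word_lower, related_words)
    return related_words
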
def _extract_lemmas_or_names_from_synset(inner_word_lower: str, related_synset, related_words: list[dict]) -> None:
    # Some relation methods return Synset objects, others return Lemma objects; handle both cases
    if hasattr(related_synset, "lemmas"):
        for local_lemma in related_synset.lemmas():
            _get_base_form_by_synset_type(local_lemma.name(), inner_word_lower, related_words)
    elif hasattr(related_synset, "name"):
        _get_base_form_by_synset_type(related_synset.name(), inner_word_lower, related_words)


def _get_related_wordnet_pos(related_object: Synset) -> Optional[str]:
    return related_object.pos() if hasattr(related_object, "pos") else None


def _get_related_object_examples(related_object: Synset, n: int = 2) -> list[str]:
    return related_object.examples()[:n] if hasattr(related_object, "examples") else []


def _get_related_object_definition(related_object: Synset) -> str:
    return related_object.definition() if hasattr(related_object, "definition") else ""


def inflect_synonym(synonym: str, original_token_info: dict[str, Any]) -> str:
    """Adapt the input synonym to match the original word's grammatical form"""
    if nlp is None:
        return synonym

    pos = original_token_info.get("pos")
    tag = original_token_info.get("tag")

    # Handle capitalization first, using .get() for safety
    if original_token_info.get("is_title"):
        synonym = synonym.title()  # .title() is better for multi-word phrases
    elif original_token_info.get("is_upper"):
        synonym = synonym.upper()
    elif original_token_info.get("is_lower", True):  # Default to lower
        synonym = synonym.lower()

    # Handle grammatical inflection
    try:
        # Define all tags that require inflection in one place
        inflection_tags = {
            "NOUN": ["NNS", "NNPS"],
            "VERB": ["VBD", "VBN", "VBZ", "VBG"],
            "ADJ": ["JJR", "JJS"],
        }

        # Single check for all inflection cases
        if pos in inflection_tags and tag in inflection_tags.get(pos, []):
            doc = nlp(synonym)
            if doc and len(doc) > 0:
                inflected = doc[0]._.inflect(tag)  # the ._.inflect extension comes from pyinflect
                if inflected:
                    # Re-join with the rest of the phrase if it was multi-word
                    return inflected + synonym[len(doc[0].text):]
        return synonym  # Return the original if no inflection applies
    except Exception as ex2:
        app_logger.warning(f"Inflection error for '{synonym}': '{ex2}'")

    # Return the original synonym if inflection fails
    return synonym
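
# Minimal usage sketch for inflect_synonym (uncalled); the token-info dict mirrors the
# fields produced by extract_contextual_info_by_indices(), and the values and expected
# output are illustrative assumptions
def _example_inflect_synonym() -> None:
    token_info = {"pos": "VERB", "tag": "VBD", "is_title": False, "is_upper": False, "is_lower": True}
    print(inflect_synonym("sprint", token_info))  # expected: "sprinted"
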
def process_synonym_groups(word: str, context_info: dict[str, Any]) -> list[RelatedWordGroup]:
    """Process the related word groups for a word, matching each option's inflection to the original context

    Args:
        word (str): the word
        context_info (dict[str, Any]): contextual data about the word's original form

    Returns:
        list[RelatedWordGroup]: the list of processed related word groups
    """
    # Get related words from wn using the lemma, timing the lookup
    t0 = datetime.now()
    related_words_raw = get_wordnet_synonyms(context_info["lemma"], context_info["pos"])
    t1 = datetime.now()
    duration = (t1 - t0).total_seconds()
    app_logger.info(f"# 1/Got get_wordnet_synonyms result for word '{word}' in {duration:.3f}s.")
    if not related_words_raw:
        return []

    # Process each related word group
    processed_groups: list[RelatedWordGroup] = []
    for related_group in related_words_raw:
        app_logger.info(f"related_group: '{related_group}'")
        relation_type = related_group["relation_type"]
        definition = related_group.get("definition", "")
        examples = related_group.get("examples", [])
        wordnet_pos = related_group.get("wordnet_pos")
        related_words = related_group["related_words"]

        processed_options: list[RelatedWordOption] = []
        for related_word in related_words:
            base_form = related_word["base_form"]
            inflected_form = inflect_synonym(base_form, context_info)
            related_word_option = RelatedWordOption(
                base_form=base_form,
                inflected_form=inflected_form,
                # True when the inflected form differs from the base form
                matches_context=inflected_form.lower() != base_form.lower()
            )
            if "is_custom" in related_word:
                related_word_option.is_custom = related_word["is_custom"]
            processed_options.append(related_word_option)

        app_logger.info(f"wordnet_pos: {type(wordnet_pos)}, '{wordnet_pos}'")
        processed_groups.append(
            RelatedWordGroup(
                relation_type=relation_type,
                definition=definition,
                examples=examples,
                related_words=processed_options,
                wordnet_pos=wordnet_pos
            )
        )
    return processed_groups
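
# Minimal end-to-end sketch, runnable as a script once the spaCy model and NLTK data
# are installed; the sample sentence and indices are illustrative assumptions
if __name__ == "__main__":
    sample_text = "The quick brown fox jumps over the lazy dog"
    # Select "jumps" (characters 20-25) and print each suggestion group
    for result in find_synonyms_for_phrase(sample_text, 20, 25):
        for group in result.related_word_groups:
            print(result.original_word, group.relation_type,
                  [opt.inflected_form for opt in group.related_words])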