from datetime import datetime
from typing import Any, Optional

import nltk
# importing pyinflect registers the token._.inflect extension used by inflect_synonym()
import pyinflect
import spacy
from fastapi import HTTPException
from nltk.corpus.reader import Synset

from my_ghost_writer.constants import ELIGIBLE_POS, NLTK_DATA, SPACY_MODEL_NAME, app_logger
from my_ghost_writer.custom_synonym_handler import CustomSynonymHandler
from my_ghost_writer.thesaurus import wn
from my_ghost_writer.type_hints import ContextInfo, RelatedWordGroup, RelatedWordOption, RelatedWordWordResult, \
    TermRelationships


custom_synonym_handler = CustomSynonymHandler()
# Load spaCy model
nlp = None
try:
    nlp = spacy.load(SPACY_MODEL_NAME)
    app_logger.info(f"spacy model {SPACY_MODEL_NAME} has type:'{type(nlp)}'")
except (OSError, IOError) as io_ex:
    app_logger.error(io_ex)
    app_logger.error(
        f"spaCy model '{SPACY_MODEL_NAME}' not found. Please install it with: 'python -m spacy download {SPACY_MODEL_NAME}'"
    )

# Ensure NLTK data is downloaded
try:
    app_logger.info(f"Downloading NLTK data to the folder:'{NLTK_DATA}'")
    nltk.download('punkt_tab', quiet=False, download_dir=NLTK_DATA)
    nltk.download('wordnet', quiet=False, download_dir=NLTK_DATA)
    nltk.download('wordnet31', quiet=False, download_dir=NLTK_DATA)
except Exception as e:
    app_logger.error(f"Failed to download NLTK data: {e}")


def is_nlp_available() -> bool:
    """Check if spaCy model is available"""
    return nlp is not None


def find_synonyms_for_phrase(text: str, start_idx: int, end_idx: int) -> list[RelatedWordWordResult]:
    """
    Finds related words for all eligible words within a selected text span.
    It analyzes the span, filters for meaningful words (nouns, verbs, etc.),
    and returns a list of related word results for each.
    Raises: HTTPException: If the spaCy model is unavailable.

    Args:
        text: The input text (str).
        start_idx: The start index of the phrase within the text (int).
        end_idx: The end index of the phrase within the text (int).

    Returns:
        A list of RelatedWordWordResult objects, representing the related words for each eligible word (list[RelatedWordWordResult]).
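
    Example (illustrative; requires the spaCy model and WordNet data to be installed):
        results = find_synonyms_for_phrase("The quick brown fox", 4, 9)
        # results[0].original_word -> "quick", with its related word groups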
    """
    if nlp is None:
        app_logger.error(
            f"spaCy model '{SPACY_MODEL_NAME}' not found. Please install it with: 'python -m spacy download {SPACY_MODEL_NAME}'"
        )
        raise HTTPException(status_code=503, detail="NLP service is unavailable")

    doc = nlp(text)
    # Use 'expand' to ensure the span covers full tokens even with partial selection
    span = doc.char_span(start_idx, end_idx, alignment_mode="expand")

    if span is None:
        app_logger.warning(f"Could not create a valid token span from indices {start_idx}-{end_idx}.")
        # Return an empty list if no valid span can be formed; the client can handle this
        return []

    # Only tokens whose POS is in ELIGIBLE_POS are looked up (checked in the loop below)
    results: list[RelatedWordWordResult] = []

    for token in span:
        # Process only if the token is an eligible part of speech and not a stop word or punctuation
        if token.pos_ in ELIGIBLE_POS and not token.is_stop and not token.is_punct:
            try:
                # 1. Get context for this specific token
                context_info_dict = extract_contextual_info_by_indices(
                    text, token.idx, token.idx + len(token.text), token.text
                )

                # 2. Get related word groups using the token's lemma for a better search
                related_word_groups_list = process_synonym_groups(context_info_dict["lemma"], context_info_dict)

                # 3. If we find related words, build the result object for this word
                if related_word_groups_list:
                    # Restructure dicts into Pydantic models for type safety
                    context_info_model = ContextInfo(
                        pos=context_info_dict["pos"],
                        sentence=context_info_dict["context_sentence"],
                        grammatical_form=context_info_dict["tag"],
                        context_words=context_info_dict["context_words"],
                        dependency=context_info_dict["dependency"],
                    )
                    local_start_idx = token.idx - start_idx
                    local_end_idx = local_start_idx + len(token.text)
                    sliced_sentence = text[start_idx:end_idx]
                    sliced_word = sliced_sentence[local_start_idx:local_end_idx]
                    assert sliced_word == token.text, (f"Mismatch! sliced_word ({sliced_word}) != token.text ({token.text}), but these substrings should be equal.\n"
                                                       f" start_idx:{start_idx}, end_idx:{end_idx}, local_start_idx:{local_start_idx}, local_end_idx:{local_end_idx}.")
                    word_result = RelatedWordWordResult(
                        original_word=token.text,
                        original_indices={"start": local_start_idx, "end": local_end_idx},
                        context_info=context_info_model,
                        related_word_groups=related_word_groups_list,
                        debug_info={
                            "spacy_token_indices": {
                                "start": context_info_dict["char_start"],
                                "end": context_info_dict["char_end"],
                            },
                            "lemma": context_info_dict["lemma"]
                        }
                    )
                    results.append(word_result)

            except HTTPException as http_ex:
                app_logger.warning(f"Could not process token '{token.text}': '{http_ex.detail}'")
            except Exception as synonym_ex:
                app_logger.error(f"Unexpected error processing token '{token.text}': '{synonym_ex}'", exc_info=True)

    return results


def extract_contextual_info_by_indices(text: str, start_idx: int, end_idx: int, target_word: str) -> dict[str, Any]:
    """
    Extract grammatical and contextual information using character indices.
    Raises: HTTPException: If the spaCy model is unavailable or if the indices are invalid.

    Args:
        text: The input text (str).
        start_idx: The start index of the word within the text (int).
        end_idx: The end index of the word within the text (int).
        target_word: The target word (str).

    Returns:
        A dictionary containing contextual information about the word (dict[str, Any]).
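
    Example (illustrative; exact values depend on the spaCy model):
        extract_contextual_info_by_indices("She runs fast", 4, 8, "runs")
        # -> {"word": "runs", "lemma": "run", "pos": "VERB", "tag": "VBZ", ...}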
    """
    if nlp is None:
        raise HTTPException(status_code=500, detail="spaCy model not available")

    # Validate the character index bounds
    if start_idx < 0 or end_idx > len(text) or start_idx >= end_idx:
        raise HTTPException(status_code=400, detail="Invalid start/end indices")

    try:
        doc = nlp(text)

        # Find the token that corresponds to our character indices
        target_token = None
        for token in doc:
            # Check if this token overlaps with our target indices
            if (token.idx <= start_idx < token.idx + len(token.text) or
                    start_idx <= token.idx < end_idx):
                target_token = token
                break

        # If no overlapping token was found, or the matched token's text does not
        # equal the expected word, the indices do not map cleanly onto a token
        if target_token is None or target_token.text != target_word:
            raise HTTPException(
                status_code=400,
                detail=f"Could not find token for word '{target_word}' at indices {start_idx}-{end_idx}"
            )

        # Extract surrounding context (±5 words)
        sentence_tokens = [t for t in target_token.sent if not t.is_space]
        target_position_in_sentence = None
        for i, token in enumerate(sentence_tokens):
            if token == target_token:
                target_position_in_sentence = i
                break

        # Get the context window
        # "is not None" matters here: position 0 (first token in the sentence) is falsy
        context_start = max(0, target_position_in_sentence - 5) if target_position_in_sentence is not None else 0
        context_end = min(len(sentence_tokens),
                          target_position_in_sentence + 6) if target_position_in_sentence is not None else len(sentence_tokens)
        context_words = [t.text for t in sentence_tokens[context_start:context_end]]

        return {
            "word": target_token.text,
            "lemma": target_token.lemma_,
            "pos": target_token.pos_,
            "tag": target_token.tag_,
            "is_title": target_token.is_title,
            "is_upper": target_token.is_upper,
            "is_lower": target_token.is_lower,
            "dependency": target_token.dep_,
            "context_sentence": target_token.sent.text,
            "context_words": context_words,
            "sentence_position": target_position_in_sentence,
            "char_start": target_token.idx,
            "char_end": target_token.idx + len(target_token.text),
            "original_indices": {"start": start_idx, "end": end_idx},
        }

    except HTTPException:
        # Re-raise HTTP errors unchanged so the 400 above is not masked as a 500
        raise
    except Exception as indices_ex:
        app_logger.error(f"Error in contextual analysis: {indices_ex}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"Error analyzing context: {str(indices_ex)}")


def get_wordnet_synonyms(word: str, pos_tag: Optional[str] = None) -> list[dict[str, Any]]:
    """
    Gets related words from WordNet and custom synonym handler,
    returning a list of dictionaries containing the raw data, grouped by relation type.

    Args:
        word: The word to get related words for (str).
        pos_tag: An optional part-of-speech tag to filter WordNet results (Optional[str]).

    Returns:
        A list of dictionaries, where each dictionary represents a group of related words (list[dict[str, Any]]).
    """
    related_word_groups_raw: list[dict[str, Any]] = []
    word_lower = word.lower()

    # 1. Custom Related Word Lookup (all relationships)
    _extract_related_word_groups_custom(related_word_groups_raw, word_lower)
    # 2. WordNet Lookup
    try:
        # Map spaCy POS to wn POS
        pos_map = {
            "NOUN": wn.NOUN,
            "VERB": wn.VERB,
            "ADJ": wn.ADJ,
            "ADV": wn.ADV,
        }

        # Get all synsets for the word
        synsets = wn.synsets(word)

        # Filter by POS if provided
        if pos_tag and pos_tag in pos_map:
            synsets = [s for s in synsets if s.pos() == pos_map[pos_tag]]

        # Process each synset and its relations
        for synset in synsets:
            result = _get_related_words(synset, TermRelationships.SYNONYM, word_lower)
            related_word_groups_raw.append(result)
            for lemma in synset.lemmas():
                result = _get_related_words(lemma, TermRelationships.ANTONYM, word_lower)
                related_word_groups_raw.append(result)
            for rel_type in [
                TermRelationships.HYPERNYM, TermRelationships.HYPONYM, TermRelationships.MERONYM,
                TermRelationships.HOLONYM, TermRelationships.ALSO_SEE, TermRelationships.CAUSE,
                # TODO: support these relation types; the corresponding WordNet methods are missing or behave differently
                # TermRelationships.DERIVATIONALLY_RELATED_FORM,
                # TermRelationships.ENTAILMENT,
                # TermRelationships.PERTAINYM,
                TermRelationships.SIMILAR_TO
            ]:
                result = _get_related_words(synset, rel_type, word_lower)
                related_word_groups_raw.append(result)

    except Exception as ex1:
        app_logger.error(f"Error getting wn synonyms: '{ex1}' with: word:{type(word)}, '{word}', pos_tag: {type(pos_tag)}, '{pos_tag}'")
        raise HTTPException(status_code=500, detail=f"Error retrieving related words: '{str(ex1)}'")

    return [related_words for related_words in related_word_groups_raw if related_words is not None]
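
# Shape of one returned group (illustrative values; keys mirror _get_related_words()):
#   {"relation_type": TermRelationships.SYNONYM, "source": "wordnet",
#    "definition": "...", "examples": ["..."], "wordnet_pos": "n",
#    "related_words": [{"base_form": "..."}]}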


def _extract_related_word_groups_custom(related_word_groups_raw: list[dict[str, Any]], word_lower: str) -> None:
    for rel_type in TermRelationships:
        custom_groups = custom_synonym_handler.get_related(word_lower, rel_type)
        if custom_groups:
            for related in custom_groups:
                words = related["words"]
                definition = related.get("definition", "")
                related_word_options = []
                for word_from_related_words in words:
                    related_word_options.append({
                        "base_form": word_from_related_words,
                        "is_custom": True,
                        "definition": definition,
                    })
                related_word_groups_raw.append({
                    "relation_type": rel_type,
                    "source": "custom",
                    "definition": definition,
                    "examples": [],
                    "wordnet_pos": None,
                    "related_words": related_word_options,
                })


def _get_base_form_by_synset_type(local_lemma: str, inner_word_lower: str, related_words: list[dict]) -> list[dict]:
    lemma_name = local_lemma.replace("_", " ")
    if lemma_name.lower() != inner_word_lower:
        related_words.append({
            "base_form": lemma_name
        })
    return related_words


def _get_related_words(related_object, relation_type: TermRelationships, inner_word_lower: str) -> dict | None:
    related_words = []

    if relation_type == TermRelationships.SYNONYM:
        # related_object is a Synset
        for local_lemma in related_object.lemmas():
            _get_base_form_by_synset_type(local_lemma.name(), inner_word_lower, related_words)
    elif relation_type == TermRelationships.ANTONYM:
        # related_object is a Lemma
        for ant in related_object.antonyms():
            _get_base_form_by_synset_type(ant.name(), inner_word_lower, related_words)
    else:
        # related_object is a Synset
        # Get related synsets from the appropriate method
        relation_methods = {
            TermRelationships.HYPERNYM: related_object.hypernyms,
            TermRelationships.HYPONYM: related_object.hyponyms,
            TermRelationships.MERONYM: lambda: related_object.member_meronyms() + related_object.substance_meronyms() + related_object.part_meronyms(),
            TermRelationships.HOLONYM: lambda: related_object.member_holonyms() + related_object.substance_holonyms() + related_object.part_holonyms(),
            TermRelationships.ALSO_SEE: related_object.also_sees,
            TermRelationships.CAUSE: related_object.causes,
            # TermRelationships.DERIVATIONALLY_RELATED_FORM: related_object.derivationally_related_forms,
            # TermRelationships.ENTAILMENT: related_object.entails,
            # TermRelationships.PERTAINYM: related_object.pertainyms,
            TermRelationships.SIMILAR_TO: related_object.similar_tos,
        }
        get_words_fn = relation_methods.get(relation_type)
        if get_words_fn:
            for related_synset in get_words_fn():
                _extract_lemmas_or_names_from_synset(inner_word_lower, related_synset, related_words)
    if related_words:
        return {
            "relation_type": relation_type,
            "source": "wordnet",
            "definition": _get_related_object_definition(related_object),
            "examples": _get_related_object_examples(related_object),
            "wordnet_pos": _get_related_wordnet_pos(related_object),
            "related_words": related_words,
        }
    return None


def _extract_lemmas_or_names_from_synset(inner_word_lower, related_synset, related_words):
    # Some methods return Lemma objects, handle both cases
    if hasattr(related_synset, "lemmas"):
        for local_lemma in related_synset.lemmas():
            _get_base_form_by_synset_type(local_lemma.name(), inner_word_lower, related_words)
    elif hasattr(related_synset, "name"):
        _get_base_form_by_synset_type(related_synset.name(), inner_word_lower, related_words)


def _get_related_wordnet_pos(related_object: Synset):
    return related_object.pos() if hasattr(related_object, "pos") else None


def _get_related_object_examples(related_object: Synset, n: int = 2) -> list[str]:
    return related_object.examples()[:n] if hasattr(related_object, "examples") else []


def _get_related_object_definition(related_object: Synset) -> str:
    return related_object.definition() if hasattr(related_object, "definition") else ""


def inflect_synonym(synonym: str, original_token_info: dict[str, Any]) -> str:
    """Adapt the input synonym arg to match the original word's grammatical form"""

    if nlp is None:
        return synonym

    pos = original_token_info.get("pos")
    tag = original_token_info.get("tag")

    # Handle capitalization first using .get() for safety
    if original_token_info.get("is_title"):
        synonym = synonym.title()  # .title() capitalizes each word, which suits multi-word phrases
    elif original_token_info.get("is_upper"):
        synonym = synonym.upper()
    elif original_token_info.get("is_lower", True):  # Default to lower
        synonym = synonym.lower()

    # Handle grammatical inflection
    try:
        # Define all tags that require inflection in one place
        inflection_tags = {
            "NOUN": ["NNS", "NNPS"],
            "VERB": ["VBD", "VBN", "VBZ", "VBG"],
            "ADJ": ["JJR", "JJS"],
        }

        # Single check for all inflection cases
        if pos in inflection_tags and tag in inflection_tags[pos]:
            doc = nlp(synonym)
            if doc and len(doc) > 0:
                inflected = doc[0]._.inflect(tag)
                if inflected:
                    # Re-join with the rest of the phrase if it was multi-word
                    return inflected + synonym[len(doc[0].text):]
                return synonym  # Return the original if inflection fails

    except Exception as ex2:
        app_logger.warning(f"Inflection error for '{synonym}': '{ex2}'")
        # Return the original synonym if inflection fails

    return synonym
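
# Illustrative usage (assumes the spaCy model and pyinflect are available):
#   inflect_synonym("cat", {"pos": "NOUN", "tag": "NNS", "is_lower": True})
# is expected to return "cats" via pyinflect's token._.inflect("NNS").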


def process_synonym_groups(word: str, context_info: dict[str, Any]) -> list[RelatedWordGroup]:
    """Process given related word groups with inflection matching

    Args:
        word (str): the word
        context_info (dict[str, Any]): the original form of data

    Returns:
        list[RelatedWordGroup]: List of the processed related words
    """
    # Get related words from WordNet using the lemma
    t0 = datetime.now()
    related_words_raw = get_wordnet_synonyms(context_info["lemma"], context_info["pos"])
    t1 = datetime.now()
    duration = (t1 - t0).total_seconds()
    app_logger.info(f"# 1/Got get_wordnet_synonyms result with '{word}' word in {duration:.3f}s.")

    if not related_words_raw:
        return []

    # Process each related word group
    processed_groups: list[RelatedWordGroup] = []
    for related_group in related_words_raw:
        app_logger.info(f"related_group:'{related_group}'")
        relation_type = related_group["relation_type"]
        definition = related_group.get("definition", "")
        examples = related_group.get("examples", [])
        wordnet_pos = related_group.get("wordnet_pos")
        related_words = related_group["related_words"]
        processed_options: list[RelatedWordOption] = []

        for related_word in related_words:
            base_form = related_word["base_form"]
            inflected_form = inflect_synonym(base_form, context_info)

            related_word_option = RelatedWordOption(
                base_form=base_form,
                inflected_form=inflected_form,
                matches_context=inflected_form.lower() != base_form.lower()
            )
            if "is_custom" in related_word:
                related_word_option.is_custom = related_word["is_custom"]
            processed_options.append(related_word_option)
        app_logger.info(f"wordnet_pos:{type(wordnet_pos)}, '{wordnet_pos}'")
        processed_groups.append(
            RelatedWordGroup(
                relation_type=relation_type,
                definition=definition,
                examples=examples,
                related_words=processed_options,
                wordnet_pos=wordnet_pos
            )
        )
    return processed_groups
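

if __name__ == "__main__":
    # Minimal demo sketch (not used by the service): requires the spaCy model
    # and the WordNet data to be installed; the exact output depends on both
    sample_text = "The quick brown fox jumps over the lazy dog."
    for word_result in find_synonyms_for_phrase(sample_text, 4, 9):
        print(word_result.original_word)
        for group in word_result.related_word_groups:
            print(f"  {group.relation_type}: {[opt.base_form for opt in group.related_words]}")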