alessandro trinca tornidor committed on
Commit
57447c1
·
1 Parent(s): a032dbd

refactor: update logs, remove wrong import, handle KeyError when the 'idxRowChild' and 'idxRowParent' keys are missing

Browse files
my_ghost_writer/app.py CHANGED
@@ -48,8 +48,9 @@ def get_words_frequency(body: RequestTextFrequencyBody | str) -> JSONResponse:
48
  app_logger.debug(f"body: {body}.")
49
  body = json.loads(body)
50
  text = body["text"]
51
- app_logger.info(f"LOG_LEVEL: '{LOG_LEVEL}', length of text: {len(text)}.")
52
- app_logger.debug(f"text from request: {text} ...")
 
53
  n_total_rows, words_stems_dict = text_stemming(text)
54
  dumped = json.dumps(words_stems_dict)
55
  app_logger.debug(f"dumped: {dumped} ...")
 
48
  app_logger.debug(f"body: {body}.")
49
  body = json.loads(body)
50
  text = body["text"]
51
+ app_logger.info(f"LOG_LEVEL: '{LOG_LEVEL}', length of text: {len(text)}, type of 'text':'{type(text)}'.")
52
+ if len(text) < 100:
53
+ app_logger.debug(f"text from request: {text} ...")
54
  n_total_rows, words_stems_dict = text_stemming(text)
55
  dumped = json.dumps(words_stems_dict)
56
  app_logger.debug(f"dumped: {dumped} ...")
my_ghost_writer/text_parsers.py CHANGED
@@ -19,7 +19,6 @@ def text_stemming(text: str | RequestTextRowsParentList) -> ResponseTextRowsDict
19
  import json
20
  from nltk import PorterStemmer
21
  from nltk.tokenize import wordpunct_tokenize, WordPunctTokenizer
22
- from my_ghost_writer.text_parsers import get_words_tokens_and_indexes
23
 
24
  ps = PorterStemmer()
25
  try:
@@ -33,7 +32,7 @@ def text_stemming(text: str | RequestTextRowsParentList) -> ResponseTextRowsDict
33
  valid_textrows_with_num = [{"idxRow": i, "text": row} for i, row in enumerate(text.split("\n"))]
34
  app_logger.info("valid_textrows_with_num::str:")
35
  else:
36
- raise TypeError(f"Invalid input type. Expected json str or list of dictionaries, not '{type(text)}'.")
37
  app_logger.debug(valid_textrows_with_num)
38
  app_logger.debug("=============================")
39
  row_words_tokens = []
@@ -44,8 +43,12 @@ def text_stemming(text: str | RequestTextRowsParentList) -> ResponseTextRowsDict
44
  for textrow in valid_textrows_with_num:
45
  row = textrow["text"]
46
  idx_rows.append(textrow["idxRow"])
47
- idx_rows_child.append(textrow["idxRowChild"])
48
- idx_rows_parent.append(textrow["idxRowParent"])
 
 
 
 
49
  row_words_tokens.append(wordpunct_tokenize(row))
50
  row_offsets_tokens.append(WordPunctTokenizer().span_tokenize(row))
51
  words_stems_dict = get_words_tokens_and_indexes(row_words_tokens, row_offsets_tokens, ps, idx_rows, idx_rows_child, idx_rows_parent)
@@ -64,6 +67,8 @@ def get_words_tokens_and_indexes(
64
  offsets_tokens_list (list): List of offsets for each token.
65
  ps (PorterStemmer): The stemmer to use.
66
  idx_rows_list (list[int]): List of row indices corresponding to the tokens.
 
 
67
 
68
  Returns:
69
  dict: Dictionary with stemmed words as keys and a list of dictionaries
@@ -89,6 +94,8 @@ def update_stems_list(current_stem_tuple: dict, word: str, offsets: list, n_row:
89
  offsets (list): List of offsets for the word.
90
  word (str): The word to stem.
91
  n_row (int): The row number in the original text.
 
 
92
 
93
  Returns:
94
  dict[str|list|int]: A dictionary with the stem string, its offsets and count.
 
19
  import json
20
  from nltk import PorterStemmer
21
  from nltk.tokenize import wordpunct_tokenize, WordPunctTokenizer
 
22
 
23
  ps = PorterStemmer()
24
  try:
 
32
  valid_textrows_with_num = [{"idxRow": i, "text": row} for i, row in enumerate(text.split("\n"))]
33
  app_logger.info("valid_textrows_with_num::str:")
34
  else:
35
+ raise TypeError(f"Invalid input type. Expected plain text str, json str or list of dictionaries, not '{type(text)}'.")
36
  app_logger.debug(valid_textrows_with_num)
37
  app_logger.debug("=============================")
38
  row_words_tokens = []
 
43
  for textrow in valid_textrows_with_num:
44
  row = textrow["text"]
45
  idx_rows.append(textrow["idxRow"])
46
+ try:
47
+ idx_rows_child.append(textrow["idxRowChild"])
48
+ idx_rows_parent.append(textrow["idxRowParent"])
49
+ except KeyError:
50
+ idx_rows_child.append(None)
51
+ idx_rows_parent.append(None)
52
  row_words_tokens.append(wordpunct_tokenize(row))
53
  row_offsets_tokens.append(WordPunctTokenizer().span_tokenize(row))
54
  words_stems_dict = get_words_tokens_and_indexes(row_words_tokens, row_offsets_tokens, ps, idx_rows, idx_rows_child, idx_rows_parent)
 
67
  offsets_tokens_list (list): List of offsets for each token.
68
  ps (PorterStemmer): The stemmer to use.
69
  idx_rows_list (list[int]): List of row indices corresponding to the tokens.
70
+ idx_rows_child (list[int]): List of child row indices corresponding to the tokens.
71
+ idx_rows_parent (list[int]): List of parent row indices corresponding to the tokens.
72
 
73
  Returns:
74
  dict: Dictionary with stemmed words as keys and a list of dictionaries
 
94
  offsets (list): List of offsets for the word.
95
  word (str): The word to stem.
96
  n_row (int): The row number in the original text.
97
+ n_row_child (int): The child row number in the original text.
98
+ n_row_parent (int): The parent row number in the original text.
99
 
100
  Returns:
101
  dict[str|list|int]: A dictionary with the stem string, its offsets and count.