alessandro trinca tornidor committed
Commit 57447c1 · Parent(s): a032dbd

refactor: update logs, remove wrong import, handle exception if missing idxRowChild and idxRowParent keys

Files changed:
- my_ghost_writer/app.py (+3 -2)
- my_ghost_writer/text_parsers.py (+11 -4)
my_ghost_writer/app.py
@@ -48,8 +48,9 @@ def get_words_frequency(body: RequestTextFrequencyBody | str) -> JSONResponse:
     app_logger.debug(f"body: {body}.")
     body = json.loads(body)
     text = body["text"]
-    app_logger.info(f"LOG_LEVEL: '{LOG_LEVEL}', length of text: {len(text)}.")
-
+    app_logger.info(f"LOG_LEVEL: '{LOG_LEVEL}', length of text: {len(text)}, type of 'text':'{type(text)}'.")
+    if len(text) < 100:
+        app_logger.debug(f"text from request: {text} ...")
     n_total_rows, words_stems_dict = text_stemming(text)
     dumped = json.dumps(words_stems_dict)
     app_logger.debug(f"dumped: {dumped} ...")
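The new if len(text) < 100: guard keeps full request payloads out of the log unless they are short, while the INFO line always records the cheap metadata (length and type). A minimal standalone sketch of the same pattern, assuming a stock logging.Logger in place of the app's own app_logger; log_request_text and max_chars are hypothetical names, not part of this repository:

import logging

app_logger = logging.getLogger("my_ghost_writer")  # stand-in for the app's own logger

def log_request_text(text: str, max_chars: int = 100) -> None:
    # Always log cheap metadata (length and type) at INFO level
    app_logger.info("length of text: %s, type of 'text': '%s'.", len(text), type(text))
    # Only log the payload itself when it is short enough to be harmless
    if len(text) < max_chars:
        app_logger.debug("text from request: %s ...", text)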
my_ghost_writer/text_parsers.py
@@ -19,7 +19,6 @@ def text_stemming(text: str | RequestTextRowsParentList) -> ResponseTextRowsDict
     import json
     from nltk import PorterStemmer
     from nltk.tokenize import wordpunct_tokenize, WordPunctTokenizer
-    from my_ghost_writer.text_parsers import get_words_tokens_and_indexes
 
     ps = PorterStemmer()
     try:
@@ -33,7 +32,7 @@ def text_stemming(text: str | RequestTextRowsParentList) -> ResponseTextRowsDict
             valid_textrows_with_num = [{"idxRow": i, "text": row} for i, row in enumerate(text.split("\n"))]
             app_logger.info("valid_textrows_with_num::str:")
         else:
-            raise TypeError(f"Invalid input type. Expected json str or list of dictionaries, not '{type(text)}'.")
+            raise TypeError(f"Invalid input type. Expected plain text str, json str or list of dictionaries, not '{type(text)}'.")
         app_logger.debug(valid_textrows_with_num)
         app_logger.debug("=============================")
         row_words_tokens = []
@@ -44,8 +43,12 @@ def text_stemming(text: str | RequestTextRowsParentList) -> ResponseTextRowsDict
         for textrow in valid_textrows_with_num:
             row = textrow["text"]
             idx_rows.append(textrow["idxRow"])
-            idx_rows_child.append(textrow["idxRowChild"])
-            idx_rows_parent.append(textrow["idxRowParent"])
+            try:
+                idx_rows_child.append(textrow["idxRowChild"])
+                idx_rows_parent.append(textrow["idxRowParent"])
+            except KeyError:
+                idx_rows_child.append(None)
+                idx_rows_parent.append(None)
             row_words_tokens.append(wordpunct_tokenize(row))
             row_offsets_tokens.append(WordPunctTokenizer().span_tokenize(row))
         words_stems_dict = get_words_tokens_and_indexes(row_words_tokens, row_offsets_tokens, ps, idx_rows, idx_rows_child, idx_rows_parent)
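Two things happen in the hunks above: the module no longer imports get_words_tokens_and_indexes from itself (the function is defined later in this same file, so the import was redundant and a circular-import trap), and the per-row key lookups are now guarded so rows without child/parent indices fall back to None. A minimal sketch of that guard with hypothetical sample rows; note that dict.get would give the same None default in one step and, unlike the try/except, cannot leave the two lists unbalanced when only one of the two keys is present:

rows = [
    {"idxRow": 0, "text": "a plain row"},  # no child/parent indices
    {"idxRow": 1, "text": "a nested row", "idxRowChild": 0, "idxRowParent": 1},
]

idx_rows_child, idx_rows_parent = [], []
for textrow in rows:
    try:
        idx_rows_child.append(textrow["idxRowChild"])
        idx_rows_parent.append(textrow["idxRowParent"])
    except KeyError:  # missing keys fall back to None placeholders
        idx_rows_child.append(None)
        idx_rows_parent.append(None)

# Equivalent via dict.get, which defaults to None per key:
idx_rows_child = [row.get("idxRowChild") for row in rows]
idx_rows_parent = [row.get("idxRowParent") for row in rows]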
@@ -64,6 +67,8 @@ def get_words_tokens_and_indexes(
         offsets_tokens_list (list): List of offsets for each token.
         ps (PorterStemmer): The stemmer to use.
         idx_rows_list (list[int]): List of row indices corresponding to the tokens.
+        idx_rows_child (list[int]): List of child row indices corresponding to the tokens.
+        idx_rows_parent (list[int]): List of parent row indices corresponding to the tokens.
 
     Returns:
         dict: Dictionary with stemmed words as keys and a list of dictionaries
@@ -89,6 +94,8 @@ def update_stems_list(current_stem_tuple: dict, word: str, offsets: list, n_row:
         offsets (list): List of offsets for the word.
         word (str): The word to stem.
         n_row (int): The row number in the original text.
+        n_row_child (int): The child row number in the original text.
+        n_row_parent (int): The parent row number in the original text.
 
     Returns:
         dict[str|list|int]: A dictionary with the stem string, its offsets and count.
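Taken together with the updated TypeError message, text_stemming accepts a plain string (split on "\n"), a JSON string, or a pre-split list of row dictionaries in which idxRowChild/idxRowParent are optional. A hypothetical call under those assumptions, using the (n_total_rows, words_stems_dict) unpacking visible in the app.py hunk above; the exact accepted shape of the list input is only partially visible in this diff:

from my_ghost_writer.text_parsers import text_stemming

# Plain text: rows are derived by splitting on "\n"
n_total_rows, words_stems_dict = text_stemming("first row\nsecond row")

# Pre-split rows: the child/parent indices may be omitted and default to None
rows = [
    {"idxRow": 0, "text": "first row"},
    {"idxRow": 1, "text": "second row", "idxRowChild": 0, "idxRowParent": 1},
]
n_total_rows, words_stems_dict = text_stemming(rows)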