File size: 12,001 Bytes
8d77218 000cfc3 8d77218 81e6837 832fa14 8d77218 570dbf9 832fa14 570dbf9 832fa14 81e6837 000cfc3 81e6837 000cfc3 570dbf9 81e6837 570dbf9 81e6837 570dbf9 81e6837 570dbf9 000cfc3 81e6837 000cfc3 570dbf9 81e6837 570dbf9 81e6837 000cfc3 81e6837 000cfc3 81e6837 000cfc3 81e6837 570dbf9 81e6837 570dbf9 81e6837 570dbf9 81e6837 570dbf9 81e6837 832fa14 8d77218 000cfc3 b8cdad9 000cfc3 8d77218 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 |
import json
import unittest
import nltk
from nltk.tokenize import wordpunct_tokenize, WordPunctTokenizer
from tests import EVENTS_FOLDER
def get_inputs_for(valid_textrows_with_num):
    """Build the parallel input lists expected by get_words_tokens_and_indexes_ngrams().

    Args:
        valid_textrows_with_num: list of dicts, each with the keys "text" and
            "idxRow"; "idxRowChild" and "idxRowParent" are optional.

    Returns:
        Tuple of (row_words_tokens, row_offsets_tokens, idx_rows,
        idx_rows_child, idx_rows_parent, rows_dict), where all lists are
        index-aligned with the input rows and rows_dict maps idxRow -> text.
        row_offsets_tokens holds the (lazy) span generators from
        WordPunctTokenizer.span_tokenize().
    """
    row_words_tokens = []
    row_offsets_tokens = []
    idx_rows = []
    idx_rows_child = []
    idx_rows_parent = []
    rows_dict = {}
    # Hoisted: one tokenizer instance for all rows instead of one per iteration.
    tokenizer = WordPunctTokenizer()
    for textrow in valid_textrows_with_num:
        row = textrow["text"]
        idx_row = textrow["idxRow"]
        rows_dict[idx_row] = row
        idx_rows.append(idx_row)
        # Bug fix: the previous try/except appended idxRowChild, then raised on
        # a missing idxRowParent and appended None to BOTH lists, so a row with
        # only idxRowChild produced two child entries and misaligned the lists.
        # dict.get() appends exactly one value (or None) per row per list.
        idx_rows_child.append(textrow.get("idxRowChild"))
        idx_rows_parent.append(textrow.get("idxRowParent"))
        row_words_tokens.append(wordpunct_tokenize(row))
        row_offsets_tokens.append(tokenizer.span_tokenize(row))
    return row_words_tokens, row_offsets_tokens, idx_rows, idx_rows_child, idx_rows_parent, rows_dict
class TestTextParsers(unittest.TestCase):
    """Tests for my_ghost_writer.text_parsers against on-disk golden fixtures."""

    @classmethod
    def setUpClass(cls):
        # Download the NLTK resources once for the whole class; previously this
        # ran in setUp(), i.e. before every single test, even though the
        # downloads are idempotent and only need to happen once per process.
        nltk.download("punkt")
        nltk.download('punkt_tab')
        nltk.download("wordnet")
        nltk.download("omw-1.4")
        nltk.download('averaged_perceptron_tagger_eng')

    def setUp(self):
        """Load the JSON/text fixtures and pre-tokenize the plain-text story."""
        with open(EVENTS_FOLDER / "get_words_tokens_and_indexes_inputs.json", "r") as src:
            self.get_words_tokens_and_indexes_inputs = json.load(src)
        with open(EVENTS_FOLDER / "request_text_stemming_no_parents.json", "r") as src:
            self.text_json_list_no_parents = json.load(src)
        with open(EVENTS_FOLDER / "request_text_stemming_with_parents.json", "r") as src:
            self.text_json_list_with_parents = json.load(src)
        with open(EVENTS_FOLDER / "llm_generated_story_3.txt", "r") as src:
            self.original_text = src.read()
        self.text_split_newline = self.original_text.split("\n")
        self.row_words_tokens = []
        self.row_offsets_tokens = []
        # Hoisted: one tokenizer instance for every row, not one per iteration.
        tokenizer = WordPunctTokenizer()
        for row in self.text_split_newline:
            self.row_words_tokens.append(wordpunct_tokenize(row))
            self.row_offsets_tokens.append(tokenizer.span_tokenize(row))
        self.ps = nltk.PorterStemmer()
        self.wnl = nltk.WordNetLemmatizer()

    def _assert_stems_match_golden(self, words_stems_dict, golden_filename):
        """Assert words_stems_dict equals the 'words_stems_dict' entry of a golden JSON file.

        NOTE: to regenerate a golden file after an intentional behavior change,
        dump {"n_total_rows": ..., "words_stems_dict": ...} to the same path
        with json.dump(..., indent=2).
        """
        with open(EVENTS_FOLDER / golden_filename, "r") as dst_json:
            golden = json.load(dst_json)
        self.assertDictEqual(words_stems_dict, golden["words_stems_dict"])

    def test_text_stemming_text(self):
        # Plain-text input: one logical row per newline-separated line.
        from my_ghost_writer.text_parsers import text_stemming
        self.maxDiff = None
        n_total_rows, words_stems_dict = text_stemming(self.original_text)
        self.assertEqual(n_total_rows, len(self.text_split_newline))
        self._assert_stems_match_golden(words_stems_dict, "response_text_stemming_from_llm_generated_story_3.json")

    def test_text_stemming_input_str_json(self):
        # A JSON *string* (not a parsed list) must also be accepted.
        from my_ghost_writer.text_parsers import text_stemming
        self.maxDiff = None
        json_str = json.dumps(self.text_json_list_no_parents)
        n_total_rows, words_stems_dict = text_stemming(json_str)
        self.assertEqual(n_total_rows, len(self.text_json_list_no_parents))
        self._assert_stems_match_golden(words_stems_dict, "response_text_stemming_empty_rows.json")

    def test_text_stemming_list_no_parents(self):
        # List-of-dicts input where rows carry no idxRowChild/idxRowParent keys.
        from my_ghost_writer.text_parsers import text_stemming
        self.maxDiff = None
        n_total_rows, words_stems_dict = text_stemming(self.text_json_list_no_parents)
        self.assertEqual(n_total_rows, len(self.text_json_list_no_parents))
        self._assert_stems_match_golden(words_stems_dict, "response_text_stemming_no_parents.json")

    def test_text_stemming_list_with_parents(self):
        # List-of-dicts input with parent/child row links, ngram size n=3.
        from my_ghost_writer.text_parsers import text_stemming
        self.maxDiff = None
        n_total_rows, words_stems_dict = text_stemming(self.text_json_list_with_parents, n=3)
        self.assertEqual(n_total_rows, len(self.text_json_list_with_parents))
        self._assert_stems_match_golden(words_stems_dict, "response_text_stemming_with_parents.json")

    def test_text_stemming_wrong_input(self):
        # A bare dict is not a supported input type; check both the exception
        # type and its message. Using `assertRaises(...) as context` replaces
        # the previous try/except/re-raise inside the context manager, which
        # would have skipped the message assertion entirely had no TypeError
        # been raised.
        from my_ghost_writer.text_parsers import text_stemming
        with self.assertRaises(TypeError) as context:
            text_stemming({"text": "This is a test."})
        self.assertEqual(
            str(context.exception),
            "Invalid input type. Expected plain text str, json str or list of dictionaries, not '<class 'dict'>'."
        )

    def test_update_stems_list(self):
        # Table-driven check: paired input/output cases from a fixture file.
        from my_ghost_writer.text_parsers import update_stems_list
        with open(EVENTS_FOLDER / "update_stem_list_inputs.json", "r") as src:
            test_args = json.load(src)
        for arg, expected in zip(test_args["input"], test_args["output"]):
            count, word_offsets = update_stems_list(
                arg['current_stem_tuple'],
                arg['word'],
                arg['offsets'],
                n_row=arg['n_row'],
                n_row_child=arg["n_row_child"],
                n_row_parent=arg["n_row_parent"],
            )
            self.assertEqual(count, expected['count'])
            self.assertEqual(word_offsets, expected['offsets_array'])

    def test_get_words_tokens_and_indexes_ngrams_no_parents(self):
        # Rows synthesized from a plain-text story (no parent/child links), n=5.
        from my_ghost_writer.text_parsers import get_words_tokens_and_indexes_ngrams
        with open(EVENTS_FOLDER / "llm_generated_story_4.txt", "r") as src:
            text = src.read()
        valid_textrows_with_num = [{"idxRow": i, "text": row} for i, row in enumerate(text.split("\n"))]
        row_words_tokens, row_offsets_tokens, idx_rows, idx_rows_child, idx_rows_parent, rows_dict = get_inputs_for(
            valid_textrows_with_num
        )
        words_stems_dict = get_words_tokens_and_indexes_ngrams(
            row_words_tokens,
            row_offsets_tokens,
            idx_rows,
            idx_rows_child,
            idx_rows_parent,
            rows_dict=rows_dict,
            n=5
        )
        self._assert_stems_match_golden(words_stems_dict, "response_get_words_tokens_and_indexes_ngrams_text4_n5.json")

    def test_get_words_tokens_and_indexes_ngrams_with_parents(self):
        # Same fixture as test_text_stemming_list_with_parents, exercised
        # through the lower-level ngrams entry point; must match the same golden.
        from my_ghost_writer.text_parsers import get_words_tokens_and_indexes_ngrams
        self.maxDiff = None
        row_words_tokens, row_offsets_tokens, idx_rows, idx_rows_child, idx_rows_parent, rows_dict = get_inputs_for(
            self.text_json_list_with_parents
        )
        words_stems_dict = get_words_tokens_and_indexes_ngrams(
            row_words_tokens,
            row_offsets_tokens,
            idx_rows,
            idx_rows_child,
            idx_rows_parent,
            rows_dict=rows_dict,
            n=3
        )
        self._assert_stems_match_golden(words_stems_dict, "response_text_stemming_with_parents.json")

    def test_get_sentence_with_word(self):
        # get_sentence_by_word must return the containing sentence plus the
        # word's offsets *within that sentence* (start/end), given the word's
        # offsets within the whole text (start_position/end_position).
        from my_ghost_writer.text_parsers import get_sentence_by_word
        text = """Mr. Dursley always sat with his back to the window in his office on the ninth floor (the window was on the back of the factory). If he hadn't, he might have found it harder to concentrate on drills that morning: small drills and a normal drill or in the end the biggest drill he ever seen! He didn't see the owls swooping past in broad daylight, though people down in the street did; they pointed and gazed open-mouthed as owl after owl sped overhead. Most of them had never seen an owl even at nighttime. Mr. Dursley, however, had a perfectly normal, owl-free morning with plenty of thinking on his prefererred drill. He yelled at five different people. He made several important telephone"""
        expected_sentence1 = 'Mr. Dursley always sat with his back to the window in his office on the ninth floor (the window was on the back of the factory).'
        expected_sentence2 = "If he hadn't, he might have found it harder to concentrate on drills that morning: small drills and a normal drill or in the end the biggest drill he ever seen!"
        expected_list_responses = [
            {"word": "window", "expected_sentence": expected_sentence1, "start": 44, "end": 50, "start_position": 44, "end_position": 50},
            {"word": "window", "expected_sentence": expected_sentence1, "start": 89, "end": 95, "start_position": 89, "end_position": 95},
            {"word": "drill", "expected_sentence": expected_sentence2, "start": 109, "end": 114, "start_position": 238, "end_position": 243},
            {"word": "drill", "expected_sentence": expected_sentence2, "start": 141, "end": 146, "start_position": 270, "end_position": 275}
        ]
        for item in expected_list_responses:
            sentence, start_in_sentence, end_in_sentence = get_sentence_by_word(
                text, item["word"], item["start_position"], item["end_position"]
            )
            self.assertEqual(sentence, item["expected_sentence"])
            self.assertEqual(start_in_sentence, item["start"])
            self.assertEqual(end_in_sentence, item["end"])
# Allow running this test module directly (e.g. `python tests/test_text_parsers.py`)
# in addition to discovery via a test runner.
if __name__ == "__main__":
    unittest.main()
|