File size: 12,001 Bytes
8d77218
 
 
 
 
 
 
 
 
000cfc3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8d77218
 
81e6837
 
 
 
 
 
 
 
832fa14
8d77218
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
570dbf9
832fa14
570dbf9
832fa14
 
81e6837
000cfc3
 
81e6837
000cfc3
 
570dbf9
 
81e6837
570dbf9
 
81e6837
570dbf9
81e6837
570dbf9
000cfc3
81e6837
000cfc3
 
 
570dbf9
 
81e6837
570dbf9
 
81e6837
 
000cfc3
81e6837
 
 
 
 
 
 
 
 
 
000cfc3
81e6837
000cfc3
81e6837
 
 
 
 
570dbf9
 
 
 
 
 
 
 
81e6837
570dbf9
 
 
 
81e6837
 
 
 
 
570dbf9
 
 
81e6837
 
 
 
 
 
570dbf9
81e6837
832fa14
8d77218
000cfc3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b8cdad9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
000cfc3
8d77218
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
import json
import unittest

import nltk
from nltk.tokenize import wordpunct_tokenize, WordPunctTokenizer

from tests import EVENTS_FOLDER


def get_inputs_for(valid_textrows_with_num):
    """Build the parallel input lists expected by get_words_tokens_and_indexes_ngrams.

    Args:
        valid_textrows_with_num: list of dicts, each with mandatory "text" and
            "idxRow" keys and optional "idxRowChild"/"idxRowParent" keys.

    Returns:
        Tuple of (row_words_tokens, row_offsets_tokens, idx_rows,
        idx_rows_child, idx_rows_parent, rows_dict), where every list has one
        entry per input row and rows_dict maps idxRow -> text.
    """
    tokenizer = WordPunctTokenizer()  # hoisted: one instance reused for every row
    row_words_tokens = []
    row_offsets_tokens = []
    idx_rows = []
    idx_rows_child = []
    idx_rows_parent = []
    rows_dict = {}
    for textrow in valid_textrows_with_num:
        row = textrow["text"]
        idx_row = textrow["idxRow"]
        rows_dict[idx_row] = row
        idx_rows.append(idx_row)
        # Bug fix: the original try/except appended to idx_rows_child twice when
        # "idxRowChild" was present but "idxRowParent" was missing (the first
        # append succeeded before the KeyError fired, then the handler appended
        # None to both lists), misaligning the parallel lists. dict.get() keeps
        # them in lockstep with exactly one entry per row.
        idx_rows_child.append(textrow.get("idxRowChild"))
        idx_rows_parent.append(textrow.get("idxRowParent"))
        row_words_tokens.append(wordpunct_tokenize(row))
        row_offsets_tokens.append(tokenizer.span_tokenize(row))
    return row_words_tokens, row_offsets_tokens, idx_rows, idx_rows_child, idx_rows_parent, rows_dict


class TestTextParsers(unittest.TestCase):
    """Tests for the stemming/tokenization helpers in my_ghost_writer.text_parsers."""

    def setUp(self):
        """Load JSON fixtures and the sample story, then ensure NLTK data is present."""
        with open(EVENTS_FOLDER / "get_words_tokens_and_indexes_inputs.json", "r") as src:
            self.get_words_tokens_and_indexes_inputs = json.load(src)
        with open(EVENTS_FOLDER / "request_text_stemming_no_parents.json", "r") as src:
            self.text_json_list_no_parents = json.load(src)
        with open(EVENTS_FOLDER / "request_text_stemming_with_parents.json", "r") as src:
            self.text_json_list_with_parents = json.load(src)
        with open(EVENTS_FOLDER / "llm_generated_story_3.txt", "r") as src:
            self.original_text = src.read()
        # The statements below were needlessly nested inside the last `with`
        # block; the file is fully read first, so dedenting is behavior-neutral.
        self.text_split_newline = self.original_text.split("\n")

        tokenizer = WordPunctTokenizer()  # hoisted: one instance for every row
        self.row_words_tokens = []
        self.row_offsets_tokens = []
        for row in self.text_split_newline:
            self.row_words_tokens.append(wordpunct_tokenize(row))
            self.row_offsets_tokens.append(tokenizer.span_tokenize(row))

        # nltk.download() is a cheap no-op once the corpora are installed.
        # NOTE(review): consider moving these to setUpClass to avoid the
        # repeated per-test check — confirm no test mutates the NLTK data dir.
        nltk.download("punkt")
        nltk.download("punkt_tab")
        nltk.download("wordnet")
        nltk.download("omw-1.4")
        nltk.download("averaged_perceptron_tagger_eng")

        self.ps = nltk.PorterStemmer()
        self.wnl = nltk.WordNetLemmatizer()

    def test_text_stemming_text(self):
        """Plain-text input: row count and stems dict match the stored fixture."""
        from my_ghost_writer.text_parsers import text_stemming
        self.maxDiff = None
        n_total_rows, words_stems_dict = text_stemming(self.original_text)
        self.assertEqual(n_total_rows, len(self.text_split_newline))
        # Uncomment to regenerate the expected fixture after intentional changes:
        # with open(EVENTS_FOLDER / "response_text_stemming_from_llm_generated_story_3.json", "w") as dst_json:
        #     json.dump({"n_total_rows": n_total_rows, "words_stems_dict": words_stems_dict}, dst_json, indent=2)
        with open(EVENTS_FOLDER / "response_text_stemming_from_llm_generated_story_3.json", "r") as dst_json:
            response_text_stemming_from_llm_generated_story_3 = json.load(dst_json)
            expected_words_stems_dict = response_text_stemming_from_llm_generated_story_3["words_stems_dict"]
        self.assertDictEqual(words_stems_dict, expected_words_stems_dict)

    def test_text_stemming_input_str_json(self):
        """JSON-string input: behaves like the equivalent list-of-dicts input."""
        from my_ghost_writer.text_parsers import text_stemming
        self.maxDiff = None
        json_str = json.dumps(self.text_json_list_no_parents)
        n_total_rows, words_stems_dict = text_stemming(json_str)
        self.assertEqual(n_total_rows, len(self.text_json_list_no_parents))
        # Uncomment to regenerate the expected fixture after intentional changes:
        # with open(EVENTS_FOLDER / "response_text_stemming_empty_rows.json", "w") as dst_json:
        #     json.dump({"n_total_rows": n_total_rows, "words_stems_dict": words_stems_dict}, dst_json, indent=2)
        with open(EVENTS_FOLDER / "response_text_stemming_empty_rows.json", "r") as dst_json:
            response_text_stemming_empty_rows = json.load(dst_json)
            expected_words_stems_dict = response_text_stemming_empty_rows["words_stems_dict"]
        self.assertDictEqual(words_stems_dict, expected_words_stems_dict)

    def test_text_stemming_list_no_parents(self):
        """List-of-dicts input without idxRowChild/idxRowParent keys."""
        from my_ghost_writer.text_parsers import text_stemming
        self.maxDiff = None
        n_total_rows, words_stems_dict = text_stemming(self.text_json_list_no_parents)
        self.assertEqual(n_total_rows, len(self.text_json_list_no_parents))
        # Uncomment to regenerate the expected fixture after intentional changes:
        # with open(EVENTS_FOLDER / "response_text_stemming_no_parents.json", "w") as dst_json:
        #     json.dump({"n_total_rows": n_total_rows, "words_stems_dict": words_stems_dict}, dst_json, indent=2)
        with open(EVENTS_FOLDER / "response_text_stemming_no_parents.json", "r") as dst_json:
            response_text_stemming_no_parents = json.load(dst_json)
            expected_words_stems_dict = response_text_stemming_no_parents["words_stems_dict"]
        self.assertDictEqual(words_stems_dict, expected_words_stems_dict)

    def test_text_stemming_list_with_parents(self):
        """List-of-dicts input with parent/child row indexes, n-gram size 3."""
        from my_ghost_writer.text_parsers import text_stemming
        self.maxDiff = None
        n_total_rows, words_stems_dict = text_stemming(self.text_json_list_with_parents, n=3)
        self.assertEqual(n_total_rows, len(self.text_json_list_with_parents))
        # Uncomment to regenerate the expected fixture after intentional changes:
        # with open(EVENTS_FOLDER / "response_text_stemming_with_parents.json", "w") as dst_json:
        #     json.dump({"n_total_rows": n_total_rows, "words_stems_dict": words_stems_dict}, dst_json, indent=2)
        with open(EVENTS_FOLDER / "response_text_stemming_with_parents.json", "r") as dst_json:
            response_text_stemming_with_parents = json.load(dst_json)
            expected_words_stems_dict = response_text_stemming_with_parents["words_stems_dict"]
        self.assertDictEqual(words_stems_dict, expected_words_stems_dict)

    def test_text_stemming_wrong_input(self):
        """A dict input is rejected with a descriptive TypeError."""
        from my_ghost_writer.text_parsers import text_stemming
        # Idiomatic form of the original try/re-raise dance: capture the
        # exception via the assertRaises context manager, then check its text.
        with self.assertRaises(TypeError) as context:
            text_stemming({"text": "This is a test."})
        self.assertEqual(
            str(context.exception),
            "Invalid input type. Expected plain text str, json str or list of dictionaries, not '<class 'dict'>'."
        )

    def test_update_stems_list(self):
        """Table-driven check of update_stems_list against recorded input/output pairs."""
        from my_ghost_writer.text_parsers import update_stems_list
        with open(EVENTS_FOLDER / "update_stem_list_inputs.json", "r") as src:
            test_args = json.load(src)
            test_args_inputs = test_args["input"]
            test_args_outputs = test_args["output"]
        for arg, expected in zip(test_args_inputs, test_args_outputs):
            n_row = arg["n_row"]
            offsets = arg["offsets"]
            word = arg["word"]
            current_stem_tuple = arg["current_stem_tuple"]
            n_row_child = arg["n_row_child"]
            n_row_parent = arg["n_row_parent"]
            expected_offsets_array = expected["offsets_array"]
            expected_count = expected["count"]
            count, word_offsets = update_stems_list(current_stem_tuple, word, offsets, n_row=n_row, n_row_child=n_row_child, n_row_parent=n_row_parent)
            self.assertEqual(count, expected_count)
            self.assertEqual(word_offsets, expected_offsets_array)

    def test_get_words_tokens_and_indexes_ngrams_no_parents(self):
        """n-gram indexing over a story without parent rows matches the fixture."""
        from my_ghost_writer.text_parsers import get_words_tokens_and_indexes_ngrams

        with open(EVENTS_FOLDER / "llm_generated_story_4.txt", "r") as src:
            text = src.read()
            valid_textrows_with_num = [{"idxRow": i, "text": row} for i, row in enumerate(text.split("\n"))]

        row_words_tokens, row_offsets_tokens, idx_rows, idx_rows_child, idx_rows_parent, rows_dict = get_inputs_for(
            valid_textrows_with_num
        )
        words_stems_dict = get_words_tokens_and_indexes_ngrams(
            row_words_tokens,
            row_offsets_tokens,
            idx_rows,
            idx_rows_child,
            idx_rows_parent,
            rows_dict=rows_dict,
            n=5
        )
        # Uncomment to regenerate the expected fixture after intentional changes:
        # with open(EVENTS_FOLDER / "response_get_words_tokens_and_indexes_ngrams_text4_n5.json", "w") as dst_json:
        #     json.dump({"words_stems_dict": words_stems_dict}, dst_json, indent=2)
        with open(EVENTS_FOLDER / "response_get_words_tokens_and_indexes_ngrams_text4_n5.json", "r") as dst_json:
            response_get_words_tokens_and_indexes_ngrams_text4_n5 = json.load(dst_json)
            expected_words_stems_dict = response_get_words_tokens_and_indexes_ngrams_text4_n5["words_stems_dict"]
        self.assertDictEqual(words_stems_dict, expected_words_stems_dict)

    def test_get_words_tokens_and_indexes_ngrams_with_parents(self):
        """Direct call with parent rows must agree with the text_stemming fixture."""
        from my_ghost_writer.text_parsers import get_words_tokens_and_indexes_ngrams
        self.maxDiff = None
        row_words_tokens, row_offsets_tokens, idx_rows, idx_rows_child, idx_rows_parent, rows_dict = get_inputs_for(
            self.text_json_list_with_parents
        )
        words_stems_dict = get_words_tokens_and_indexes_ngrams(
            row_words_tokens,
            row_offsets_tokens,
            idx_rows,
            idx_rows_child,
            idx_rows_parent,
            rows_dict=rows_dict,
            n=3
        )
        with open(EVENTS_FOLDER / "response_text_stemming_with_parents.json", "r") as dst_json:
            response_text_stemming_with_parents = json.load(dst_json)
            expected_words_stems_dict = response_text_stemming_with_parents["words_stems_dict"]
        self.assertDictEqual(words_stems_dict, expected_words_stems_dict)

    def test_get_sentence_with_word(self):
        """get_sentence_by_word maps absolute offsets to sentence-relative ones."""
        from my_ghost_writer.text_parsers import get_sentence_by_word
        text = """Mr. Dursley always sat with his back to the window in his office on the ninth floor (the window was on the back of the factory). If he hadn't, he might have found it harder to concentrate on drills that morning: small drills and a normal drill or in the end the biggest drill he ever seen! He didn't see the owls swooping past in broad daylight, though people down in the street did; they pointed and gazed open-mouthed as owl after owl sped overhead. Most of them had never seen an owl even at nighttime. Mr. Dursley, however, had a perfectly normal, owl-free morning with plenty of thinking on his prefererred drill. He yelled at five different people. He made several important telephone"""
        expected_sentence1 = 'Mr. Dursley always sat with his back to the window in his office on the ninth floor (the window was on the back of the factory).'
        expected_sentence2 = "If he hadn't, he might have found it harder to concentrate on drills that morning: small drills and a normal drill or in the end the biggest drill he ever seen!"
        # start/end are sentence-relative; start_position/end_position are
        # absolute offsets into `text`.
        expected_list_responses = [
            {"word": "window", "expected_sentence": expected_sentence1, "start": 44, "end": 50, "start_position": 44, "end_position": 50},
            {"word": "window", "expected_sentence": expected_sentence1, "start": 89, "end": 95, "start_position": 89, "end_position": 95},
            {"word": "drill", "expected_sentence": expected_sentence2, "start": 109, "end": 114, "start_position": 238, "end_position": 243},
            {"word": "drill", "expected_sentence": expected_sentence2, "start": 141, "end": 146, "start_position": 270, "end_position": 275}
        ]
        for item in expected_list_responses:
            expected_sentence = item["expected_sentence"]
            expected_start_in_sentence, expected_end_in_sentence = item["start"], item["end"]
            sentence, start_in_sentence, end_in_sentence = get_sentence_by_word(text, item["word"], item["start_position"], item["end_position"])
            self.assertEqual(sentence, expected_sentence)
            self.assertEqual(start_in_sentence, expected_start_in_sentence)
            self.assertEqual(end_in_sentence, expected_end_in_sentence)

# Allow running this test module directly (e.g. `python this_file.py`)
# instead of via a test runner.
if __name__ == "__main__":
    unittest.main()