alessandro trinca tornidor committed on
Commit
e0362b5
·
1 Parent(s): 5ff9411

feat: add my_ghost_writer package

Browse files
my_ghost_writer/__init__.py ADDED
File without changes
my_ghost_writer/text_parsers.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Iterator
2
+
3
+
4
def clean_string(s: str) -> str:
    """
    Clean a given string by removing punctuation using
    nltk.classify.TextCat()'s remove_punctuation() method,
    then stripping newline and carriage-return characters.

    NOTE: no lowercasing is performed here, despite what earlier
    documentation claimed — callers must lowercase themselves if needed.

    Args:
        s (str): The string to clean.

    Returns:
        str: The cleaned string (punctuation, "\n" and "\r" removed).
    """
    # Local import keeps nltk off the module import path until actually needed.
    from nltk.classify import TextCat
    tc = TextCat()
    cleaned_word = tc.remove_punctuation(text=s)
    # Single C-level pass that deletes every "\n" and "\r".
    return cleaned_word.translate(str.maketrans("", "", "\n\r"))
21
+
22
+
23
def get_words_tokens_and_indexes(
    words_tokens_list: list[str], offsets_tokens_list: list | Iterator, ps, min_len_words=3, sort_type=""
) -> dict:
    """
    Group word tokens by their stem, recording each occurrence's offsets and row.

    Tokens whose cleaned form (punctuation/newlines removed) is shorter than
    `min_len_words` are skipped.

    Args:
        words_tokens_list (list): List of rows, each an iterable of word tokens.
        offsets_tokens_list (list | Iterator): Offsets for each token, parallel to words_tokens_list.
        ps (PorterStemmer): The stemmer to use.
        min_len_words (int): Minimum length of the cleaned word to include.
        sort_type (str): The type of sorting to apply. Can be "" (no sorting - default), "count" (by words count), "word" (alphabetical).

    Returns:
        dict: Dictionary with stemmed words as keys and, as values, dicts of the
            form {"count": int, "word_prefix": stem, "offsets_array": list of
            {"word", "offsets", "n_row"} dicts}.
    """
    words_stems_dict = {}
    for n_row, (words_tokens, offsets_tokens) in enumerate(zip(words_tokens_list, offsets_tokens_list)):
        for word, offsets in zip(words_tokens, offsets_tokens):
            cleaned_word = clean_string(word)
            if len(cleaned_word) < min_len_words:
                continue
            # NOTE(review): the RAW word is stemmed, not the cleaned one — confirm intended.
            stem = ps.stem(word)
            if stem not in words_stems_dict:
                words_stems_dict[stem] = {"count": 0, "word_prefix": stem, "offsets_array": []}
            count, word_offsets = update_stems_list(words_stems_dict[stem], word, offsets, n_row=n_row)
            words_stems_dict[stem] = {"count": count, "word_prefix": stem, "offsets_array": word_offsets}
    # Bugfix: sort_type was documented but previously ignored. Default "" keeps
    # insertion order, so existing callers are unaffected.
    if sort_type == "count":
        # Most frequent stems first.
        return dict(sorted(words_stems_dict.items(), key=lambda item: item[1]["count"], reverse=True))
    if sort_type == "word":
        # Alphabetical by stem key.
        return dict(sorted(words_stems_dict.items()))
    return words_stems_dict
52
+
53
+
54
def update_stems_list(current_stem_tuple: dict, word: str, offsets: list, n_row: int) -> tuple:
    """
    Append a word occurrence to a stem entry's offsets array and bump its count.

    Args:
        current_stem_tuple (dict): Current stem entry; must have "count" (int)
            and "offsets_array" (list) keys.
        word (str): The original (unstemmed) word occurrence.
        offsets (list): Character offsets of the word; copied via list().
        n_row (int): The row number in the original text.

    Returns:
        tuple[int, list]: The incremented count and the offsets array. Note the
            array is the SAME list object held by `current_stem_tuple` — it is
            mutated in place by the append.
    """
    count = current_stem_tuple["count"] + 1
    word_offsets = current_stem_tuple["offsets_array"]
    word_offsets.append({"word": word, "offsets": list(offsets), "n_row": n_row})
    return count, word_offsets