alessandro trinca tornidor
committed on
Commit e0362b5 · 1 Parent(s): 5ff9411
feat: add my_ghost_writer package
my_ghost_writer/__init__.py
ADDED
File without changes
my_ghost_writer/text_parsers.py
ADDED
@@ -0,0 +1,70 @@
+from typing import Iterator
+
+
+def clean_string(s: str) -> str:
+    """
+    Clean a given string by
+    1. removing punctuation with nltk.classify.TextCat()'s remove_punctuation() method
+    2. stripping newline characters.
+
+    Args:
+        s (str): The string to clean.
+
+    Returns:
+        str: The cleaned string.
+    """
+    from nltk.classify import TextCat
+    tc = TextCat()
+    cleaned_word = tc.remove_punctuation(text=s)
+    return cleaned_word.translate(str.maketrans("", "", "\n\r"))
+
+
+def get_words_tokens_and_indexes(
+    words_tokens_list: list[list[str]], offsets_tokens_list: list | Iterator, ps, min_len_words=3, sort_type=""
+) -> dict:
+    """
+    Get the word tokens and their indexes in the text.
+
+    Args:
+        words_tokens_list (list): List of word-token lists, one per row of text.
+        offsets_tokens_list (list): List of offsets for each token.
+        ps (PorterStemmer): The stemmer to use.
+        min_len_words (int): Minimum length of words to include.
+        sort_type (str): The type of sorting to apply. Can be "" (no sorting - default), "count" (by word count), "word" (alphabetical). Currently unused.
+
+    Returns:
+        dict: Dictionary with stemmed words as keys and, as values, dictionaries
+            holding the occurrence count, the stem and the offsets of each
+            original word.
+    """
+    words_stems_dict = {}
+    for n_row, (words_tokens, offsets_tokens) in enumerate(zip(words_tokens_list, offsets_tokens_list)):
+        for word, offsets in zip(words_tokens, offsets_tokens):
+            cleaned_word = clean_string(word)
+            if len(cleaned_word) < min_len_words:
+                continue
+            stem = ps.stem(word)
+            if stem not in words_stems_dict:
+                words_stems_dict[stem] = {"count": 0, "word_prefix": stem, "offsets_array": []}
+            count, word_offsets = update_stems_list(words_stems_dict[stem], word, offsets, n_row=n_row)
+            words_stems_dict[stem] = {"count": count, "word_prefix": stem, "offsets_array": word_offsets}
+    return words_stems_dict
+
+
+def update_stems_list(current_stem_tuple: dict, word: str, offsets: list, n_row: int) -> tuple:
+    """
+    Update a stem's offsets list and occurrence count with a new word occurrence.
+
+    Args:
+        current_stem_tuple (dict): Dictionary with the stem's current count and offsets list.
+        word (str): The word to record.
+        offsets (list): List of offsets for the word.
+        n_row (int): The row number in the original text.
+
+    Returns:
+        tuple: The updated count and the updated offsets list.
+    """
+    n, word_offsets = current_stem_tuple["count"], current_stem_tuple["offsets_array"]
+    n += 1
+    word_offsets.append({"word": word, "offsets": list(offsets), "n_row": n_row})
+    return n, word_offsets
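
A minimal usage sketch (not part of the commit): it pairs NLTK's WhitespaceTokenizer, whose span_tokenize() yields per-token character offsets, with a PorterStemmer; the text_rows sample input is made up for illustration. Note that clean_string() instantiates nltk.classify.TextCat(), which may require extra NLTK data (e.g. the crubadan corpus) at runtime.

# Hypothetical example, assuming nltk is installed and its required data is available
from nltk.stem import PorterStemmer
from nltk.tokenize import WhitespaceTokenizer

from my_ghost_writer.text_parsers import get_words_tokens_and_indexes

text_rows = ["The wizard waved his wand.", "Wizards wave wands."]  # made-up sample input
tokenizer = WhitespaceTokenizer()
words_tokens_list = [tokenizer.tokenize(row) for row in text_rows]
offsets_tokens_list = [tokenizer.span_tokenize(row) for row in text_rows]
stems = get_words_tokens_and_indexes(words_tokens_list, offsets_tokens_list, ps=PorterStemmer())
# stems["wizard"] should report a count of 2 ("wizard" in row 0, "Wizards" in row 1),
# with each occurrence's [start, end] character offsets and its row number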