File size: 6,715 Bytes
3327b57
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
# preprocessing sub functions

import re
import os
import glob
import string
import pandas as pd
from datetime import datetime
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import contractions


def remove_deleted(df):
    r"""
    remove_deleted function.
    Drop rows whose 'timestamp' value is a purely numeric string.

    A row is removed only when its 'timestamp' is a string for which
    ``str.isnumeric()`` is true. Missing or non-string timestamps are kept
    instead of raising.

    Args:
        df: dataframe of crawled website data with a 'timestamp' column.

    Returns:
        df: new dataframe without the numeric-timestamp rows, re-indexed
            from 0. The input dataframe is not modified.
    """
    # Test each value directly rather than through `.str.isnumeric()`: the
    # accessor returns NaN for missing values, and applying `~` to such a
    # mask raises TypeError.
    is_deleted = df['timestamp'].map(
        lambda value: isinstance(value, str) and value.isnumeric()
    ).astype(bool)
    return df.loc[~is_deleted].reset_index(drop=True)


def remove_deleted_post(df):
    r"""
    remove_deleted_post function.
    Drop rows whose 'post' value is exactly the string 'del'.

    Args:
        df: dataframe of crawled website data with a 'post' column.

    Returns:
        df: dataframe without the 'del' rows, re-indexed from 0.
    """
    # Exact equality, not a substring match: only posts equal to 'del' go.
    is_live = df['post'].ne('del')
    surviving = df.loc[is_live]
    return surviving.reset_index(drop=True)


def update_lastEdit(df):
    r"""
    update_lastEdit function.
    Fill missing 'last_edit' values with the 'timestamp' value of the same row.

    Args:
        df: dataframe of crawled website data with 'last_edit' and
            'timestamp' columns.

    Returns:
        df: the same dataframe object, with its 'last_edit' column updated.
    """
    fallback = df['timestamp']
    filled = df['last_edit'].fillna(fallback)
    # Assign through .loc so the column of the passed-in frame is overwritten.
    df.loc[:, 'last_edit'] = filled
    return df


def preprocess_date(date_str):
    r"""
    preprocess_date function.
    Replace 'Today' in a date string with the current date.

    The substitution happens only when the string contains 'Today ' (with a
    trailing space); the current date is rendered as "%B %d, %Y"
    (e.g. "January 05, 2024").

    Args:
        date_str: str that contains date information. Non-string values
            (e.g. None or NaN from a dataframe column) are returned unchanged.

    Returns:
        str with 'Today' replaced by the current date, or the input unchanged
        when there is nothing to replace.
    """
    # Missing values are not strings; the membership test below would raise
    # TypeError on them, so pass them through untouched.
    if not isinstance(date_str, str):
        return date_str
    if "Today " not in date_str:
        return date_str
    current_date = datetime.now().strftime("%B %d, %Y")
    return date_str.replace("Today", current_date)


def convert_datetime_with_multiple_formats(date_str, formats):
    r"""
    convert_datetime_with_multiple_formats function.
    Parse a date string by trying each candidate format in order.

    Args:
        date_str: str that contains date information.
        formats: list of possible date formats, tried first to last.

    Returns:
        The result of ``pd.to_datetime`` for the first format that parses.

    Raises:
        ValueError: if none of the formats parses ``date_str``.
    """
    for candidate in formats:
        try:
            parsed = pd.to_datetime(date_str, format=candidate)
        except ValueError:
            # This format did not fit; move on to the next one.
            continue
        else:
            return parsed
    raise ValueError(f"Time data {date_str} doesn't match provided formats")


def convert_to_datetime(df_):
    r"""
    convert_to_datetime function.
    Convert the 'timestamp' and 'last_edit' columns to date values.

    Works on a copy of the input. Each value is first passed through
    ``preprocess_date``, then parsed with
    ``convert_datetime_with_multiple_formats``, and finally reduced to its
    date part with ``.dt.date``.

    Args:
        df_: dataframe of crawled website data.

    Returns:
        df: copy of the dataframe with 'timestamp' and 'last_edit' holding
            date values.
    """
    date_columns = ('timestamp', 'last_edit')
    # Candidate layouts for the date strings, tried in this order.
    datetime_formats = ["%B %d, %Y at %I:%M:%S %p", "%B %d, %Y, %I:%M:%S %p"]

    df = df_.copy()

    # Resolve 'Today' in both columns before any parsing starts.
    for column in date_columns:
        df[column] = df[column].apply(preprocess_date)

    for column in date_columns:
        parsed = df[column].apply(
            convert_datetime_with_multiple_formats, formats=datetime_formats)
        df[column] = parsed.dt.date

    return df


def remove_urls(text):
    r"""
    remove_urls function.
    Delete every run of non-whitespace characters that starts with 'http'.
    """
    url_pattern = re.compile(r'http\S+')
    return url_pattern.sub('', text)

#


def remove_extra_whitespace(text):
    r"""
    remove_extra_whitespace function.
    Collapse every run of whitespace to a single space and trim both ends.
    """
    # split() with no argument drops leading, trailing and repeated whitespace.
    words = text.split()
    return ' '.join(words)


def remove_special_characters(text):
    r"""
    remove_special_characters function.
    Delete every character that is neither a word character nor whitespace.
    Underscores count as word characters and are therefore kept.
    """
    non_word_or_space = re.compile(r'[^\w\s]')
    return non_word_or_space.sub('', text)


def to_lowercase(text):
    r"""
    to_lowercase function.
    Return the lowercase form of a text.
    """
    lowered = text.lower()
    return lowered


def remove_meta_info(text):
    r"""
    remove_meta_info function.
    Strip quote headers of the form
    "Quote from: <user> on <date/time> AM|PM" from a text.

    The input is converted with ``str()`` first, so non-string values are
    accepted. Every quote header in the text is removed; the text that
    follows each header is kept.
    """
    text = str(text)
    # The date/time part is matched lazily so the match ends at the FIRST
    # " AM"/" PM". A greedy match would run on to the last " AM"/" PM" in the
    # text and delete post content, and any later quote headers, along with it.
    return re.sub(r'Quote from: [a-zA-Z0-9_]+ on [a-zA-Z0-9, :]+? (AM|PM)', '', text)


def tokenize(text):
    r"""
    tokenize function.
    Split a text into tokens on single space characters.
    Consecutive spaces produce empty-string tokens.
    """
    separator = ' '
    return text.split(separator)


def remove_sentence_punctuation(text):
    r"""
    remove_sentence_punctuation function.
    Replace each ASCII punctuation character with a space, leaving the
    listed math symbols untouched.
    """
    preserved = set("+-×*÷/=()[]{},.<>%^")
    # Translation table: every other punctuation code point maps to a space.
    to_space = {
        ord(char): ' '
        for char in string.punctuation
        if char not in preserved
    }
    return text.translate(to_space)


def lemmatize_text(text):
    r"""
    lemmatize_text function.
    Lemmatize each whitespace-separated word of a text with NLTK's
    WordNetLemmatizer and join the results with single spaces.
    """
    wordnet = WordNetLemmatizer()
    base_forms = map(wordnet.lemmatize, text.split())
    return ' '.join(base_forms)


def replace_numbers(text, replace_with="<NUM>"):
    r"""
    replace_numbers function.
    Replace every standalone run of digits in a text with a placeholder.

    A run of digits is replaced only when it is delimited by word boundaries,
    so digits embedded in a word (e.g. "abc123") are left alone.

    Args:
        text: str to process.
        replace_with: literal string inserted for each number
            (default is "<NUM>").

    Returns:
        str with each standalone number replaced by ``replace_with``.
    """
    # A callable replacement is inserted verbatim. A plain string would be
    # parsed as a regex replacement template, so a backslash in `replace_with`
    # would be expanded or raise re.error.
    return re.sub(r'\b\d+\b', lambda _match: replace_with, text)


def remove_stopwords(tokens):
    r"""
    remove_stopwords function.
    Return the tokens that are not in NLTK's English stopword list,
    in their original order.
    """
    english_stopwords = set(stopwords.words('english'))
    kept = []
    for token in tokens:
        if token in english_stopwords:
            continue
        kept.append(token)
    return kept


def expand_contractions(tokens):
    r"""
    expand_contractions function.
    Apply ``contractions.fix`` to every token and return the results as a
    list (e.g., "isn't" to "is not").
    """
    expanded = map(contractions.fix, tokens)
    return list(expanded)


def remove_repeated_phrases(text):
    r"""
    remove_repeated_phrases function.
    Keep only the first occurrence of each whitespace-separated word,
    preserving order, and join the survivors with single spaces.
    eg. "hello hello world" -> "hello world"
    """
    # dict keys are unique and keep insertion order, so this drops later
    # duplicates while leaving first occurrences in place.
    first_occurrences = dict.fromkeys(text.split())
    return ' '.join(first_occurrences)