import os
import sqlite3
from docx import Document
import re
from hazm import Normalizer
import pypdf

from nltk.tokenize import sent_tokenize
from hazm import SentenceTokenizer  # For Persian sentence tokenization
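# Note: nltk.sent_tokenize relies on NLTK's "punkt" sentence tokenizer models. If a
# LookupError is raised at runtime, download them once with:
#     import nltk; nltk.download("punkt")   # or "punkt_tab" on newer NLTK releases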

def smart_chunking(text, max_tokens=1024, tokenizer=None):
    """
    Splits the text into meaningful chunks using sentence boundaries.
    Ensures that each chunk does not exceed the maximum token limit.
    Supports both Persian and English text.
    """
    # Step 1: Split text into sentences
    if any(lang_char in text for lang_char in "ابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهی"):  # Check for Persian characters
        # Use hazm for Persian sentence tokenization
        persian_sent_tokenizer = SentenceTokenizer()
        sentences = persian_sent_tokenizer.tokenize(text)
    else:
        # Use NLTK for English sentence tokenization
        sentences = sent_tokenize(text)
    
    # Step 2: Initialize variables
    chunks = []
    current_chunk = []
    current_length = 0
    
    # Step 3: Add sentences to chunks
    for sentence in sentences:
        # Tokenize the sentence to estimate its length
        sentence_tokens = tokenizer.encode(sentence) if tokenizer else sentence.split()
        sentence_length = len(sentence_tokens)
        
        # If adding this sentence would exceed the max length, close the current chunk.
        # The current_chunk guard avoids emitting an empty chunk when a single sentence
        # is longer than max_tokens; such a sentence simply becomes its own oversized chunk.
        if current_chunk and current_length + sentence_length > max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_length = 0
        
        # Add the sentence to the current chunk
        current_chunk.append(sentence)
        current_length += sentence_length
    
    # Add any remaining sentences as the last chunk
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    
    return chunks
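
# Illustrative usage of smart_chunking (a sketch; the sample text and limits are arbitrary,
# and the Hugging Face tokenizer shown is just one possible choice, not a project requirement):
#
#     sample = "این جمله اول است. این جمله دوم است. این جمله سوم است."
#     chunks = smart_chunking(sample, max_tokens=8)  # no tokenizer: falls back to whitespace splitting
#
#     from transformers import AutoTokenizer
#     tok = AutoTokenizer.from_pretrained("HooshvareLab/bert-fa-base-uncased")
#     chunks = smart_chunking(sample, max_tokens=512, tokenizer=tok)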


def is_meaningful(text):
    """
    Determines whether the given text is considered meaningful based on the presence of a specific control character.

    This function checks if the input text contains the ASCII control character '\\x19' (End of Medium). 
    If the character is found, the text is deemed not meaningful and the function returns 0. Otherwise, 
    the text is considered meaningful and the function returns 1.

    Parameters:
    ----------
    text : str
        The input text to be evaluated for meaningfulness.

    Returns:
    -------
    int
        - 0: If the text contains the '\\x19' control character, indicating it is not meaningful.
        - 1: If the text does not contain the '\\x19' control character, indicating it is meaningful.

    Example:
    --------
    >>> is_meaningful("This is a valid sentence.")
    1

    >>> is_meaningful("Invalid text \\x19 with control character.")
    0
    """
    if "\x19" in text:
        return 0
    return 1



# Step 1: Text Cleaning
def clean_text(text):
    """
    Cleans the input text by removing unwanted patterns and retaining only Persian characters and spaces.

    This function performs the following cleaning steps:
    1. Removes URLs, emails, and other web-related patterns (e.g., http, https, www).
    2. Replaces multiple consecutive spaces with a single space.
    3. Optionally retains only Persian characters (Unicode range \\u0600-\\u06FF) and spaces; this
       filter is commented out in the code below and can be re-enabled to strip punctuation and Latin text.
    4. Strips leading and trailing whitespace from the resulting text.

    Parameters:
    ----------
    text : str
        The input text to be cleaned.

    Returns:
    -------
    str
        The cleaned text containing only Persian characters and spaces, with unnecessary patterns removed.

    Example (illustrative; assumes the optional Persian-only filter is enabled):
    --------
    >>> clean_text("سلام! این یک متن آزمایشی است. http://example.com و ایمیل: [email protected]")
    'سلام این یک متن آزمایشی است'

    >>> clean_text("  متون   با فاصله های زیاد  ")
    'متون با فاصله های زیاد'
    """
    # Remove URLs, emails, and other patterns
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)
    text = re.sub(r"\s+", " ", text)  # Replace multiple spaces with a single space
    # text = re.sub(r"[^\u0600-\u06FF\s]", "", text)  # Keep only Persian characters and spaces
    return text.strip()


# Step 2: Normalization
def normalize_text(text):
    """
    Normalizes the input Persian text by standardizing characters and applying common normalization rules.

    This function uses the `Normalizer` class from the `hazm` library to perform the following tasks:
    1. Standardize Persian characters (e.g., converting Arabic characters to their Persian equivalents).
    2. Apply common normalization rules such as fixing spacing, removing diacritics, and handling special cases.
    
    Parameters:
    ----------
    text : str
        The input Persian text to be normalized.

    Returns:
    -------
    str
        The normalized Persian text with standardized characters and consistent formatting.

    Example:
    --------
    >>> normalize_text("سلامٔ دوست عزیز، حال شما چطور است؟")
    'سلام دوست عزیز، حال شما چطور است؟'

    >>> normalize_text("متن با اضافه‌ی فاصله‌های نامنظم.")
    'متن با اضافه‌ی فاصله‌های نامنظم.'
    """
    normalizer = Normalizer()
    text = normalizer.normalize(text)  # Standardize Persian characters
    return text


# Full Preprocessing Pipeline
def preprocess_persian_text(text):
    """
    Preprocesses Persian text by cleaning and normalizing it.

    This function performs the following steps:
    1. Cleans the input text using the `clean_text` function:
       - Removes URLs, emails, and other unwanted patterns.
       - Replaces multiple spaces with a single space.
       - Optionally retains only Persian characters and spaces (this filter is commented out in `clean_text`).
    2. Normalizes the cleaned text using the `normalize_text` function:
       - Standardizes Persian characters (e.g., converting Arabic characters to their Persian equivalents).
       - Applies common normalization rules such as fixing spacing and removing diacritics.

    Parameters:
    ----------
    text : str
        The input Persian text to be preprocessed.

    Returns:
    -------
    str
        The preprocessed Persian text, which is cleaned and normalized.

    Example (illustrative; assumes the optional Persian-only filter is enabled):
    --------
    >>> preprocess_persian_text("سلامٔ دوست عزیز! این یک متن آزمایشی است: http://example.com")
    'سلام دوست عزیز این یک متن آزمایشی است'

    >>> preprocess_persian_text("  متون   با فاصله‌های نامنظم و کلمات عربی مثل شیء ")
    'متون با فاصله‌های نامنظم و کلمات عربی مثل شیء'
    """
    text = clean_text(text)
    text = normalize_text(text)
    return text


def read_file(file_path):
    """
    Reads and preprocesses text from Word (.docx), Text (.txt), or PDF (.pdf) files.

    This function supports reading Persian text from the following file formats:
    1. `.docx`: Extracts text from paragraphs in a Word document.
    2. `.txt`: Reads plain text from a text file encoded in UTF-8.
    3. `.pdf`: Extracts text from a PDF file using `pypdf`.

    After extracting the raw text, the function preprocesses it using the `preprocess_persian_text` function,
    which cleans and normalizes the Persian text.

    Parameters:
    ----------
    file_path : str
        The path to the input file. Supported formats are `.docx`, `.txt`, and `.pdf`.

    Returns:
    -------
    str
        The preprocessed Persian text extracted from the file.

    Raises:
    ------
    ValueError
        - If the file format is unsupported (only `.docx`, `.txt`, and `.pdf` are allowed).
        - If the extracted text from a PDF file is deemed not meaningful (e.g., contains control characters).

    Example:
    --------
    >>> read_file("example.docx")
    'سلام دوست عزیز این یک متن آزمایشی است'

    >>> read_file("example.txt")
    'این یک فایل متنی ساده است.'

    >>> read_file("example.pdf")
    'این متن از یک فایل پی دی اف استخراج شده است.'
    """
    if file_path.endswith('.docx'):
        doc = Document(file_path)
        text = "\n".join(para.text for para in doc.paragraphs)
        return preprocess_persian_text(text)
    elif file_path.endswith('.txt'):
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()
        return preprocess_persian_text(text)
    elif file_path.endswith('.pdf'):
        reader = pypdf.PdfReader(file_path)
        raw_data = ""
        for page in reader.pages:
            # extract_text() may return an empty string for image-only pages
            raw_data += page.extract_text()
            if not is_meaningful(raw_data):
                raise ValueError("Extracted PDF text is not meaningful (it contains control characters).")
        return preprocess_persian_text(raw_data)
    else:
        raise ValueError("Unsupported file format. Only .docx, .txt, and .pdf are allowed.")
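

if __name__ == "__main__":
    # Minimal sketch of the pipeline end to end: read and preprocess a document, then
    # split it into chunks. "sample.docx" is a placeholder path, not a file shipped
    # with this module; point it at a real .docx, .txt, or .pdf document to try it.
    sample_path = "sample.docx"
    if os.path.exists(sample_path):
        text = read_file(sample_path)
        chunks = smart_chunking(text, max_tokens=512)
        print(f"Extracted {len(chunks)} chunk(s) from {sample_path}")
    else:
        print(f"{sample_path} not found; skipping demo.")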