"""Utilities for reading, cleaning, normalizing, and chunking Persian (and English)
text extracted from .docx, .txt, and .pdf files."""

import os
import re
import sqlite3

import pypdf
from docx import Document
from hazm import Normalizer, SentenceTokenizer
# Note: nltk's sent_tokenize requires the Punkt sentence model
# (download it once with nltk.download("punkt")).
from nltk.tokenize import sent_tokenize


def smart_chunking(text, max_tokens=1024, tokenizer=None):
    """
    Splits the text into meaningful chunks using sentence boundaries.
    Ensures that each chunk does not exceed the maximum token limit
    (a single sentence longer than the limit still forms its own chunk).
    Supports both Persian and English text.

    If a tokenizer is provided, its `encode` method is used to count tokens;
    otherwise tokens are approximated by whitespace-separated words.
    """
    # Use the Persian sentence tokenizer when the text contains Persian letters.
    if any(lang_char in text for lang_char in "ابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهی"):
        persian_sent_tokenizer = SentenceTokenizer()
        sentences = persian_sent_tokenizer.tokenize(text)
    else:
        sentences = sent_tokenize(text)

    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in sentences:
        sentence_tokens = tokenizer.encode(sentence) if tokenizer else sentence.split()
        sentence_length = len(sentence_tokens)

        # Close the current chunk before the token limit would be exceeded.
        # The check on current_chunk avoids emitting an empty chunk when the
        # very first sentence is already longer than max_tokens.
        if current_chunk and current_length + sentence_length > max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_length = 0

        current_chunk.append(sentence)
        current_length += sentence_length

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks


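# Usage sketch (comment only, not executed on import): `long_text` below is a
# hypothetical multi-sentence Persian or English string. With tokenizer=None,
# token counts fall back to whitespace-separated words, so max_tokens is only
# an approximation of model tokens.
#
#     chunks = smart_chunking(long_text, max_tokens=256)
#     print(len(chunks), "chunks;", len(chunks[0].split()), "words in the first")

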
def is_meaningful(text):
    """
    Determines whether the given text is considered meaningful based on the presence of a specific control character.

    This function checks if the input text contains the ASCII control character '\\x19' (End of Medium).
    If the character is found, the text is deemed not meaningful and the function returns 0. Otherwise,
    the text is considered meaningful and the function returns 1.

    Parameters:
    ----------
    text : str
        The input text to be evaluated for meaningfulness.

    Returns:
    -------
    int
        - 0: If the text contains the '\\x19' control character, indicating it is not meaningful.
        - 1: If the text does not contain the '\\x19' control character, indicating it is meaningful.

    Example:
    --------
    >>> is_meaningful("This is a valid sentence.")
    1

    >>> is_meaningful("Invalid text \\x19 with control character.")
    0
    """
    if "\x19" in text:
        return 0
    return 1


def clean_text(text):
    """
    Cleans the input text by removing URLs and normalizing whitespace.

    This function performs the following cleaning steps:
    1. Removes URLs and other web-related patterns (e.g., http, https, www).
    2. Replaces multiple consecutive whitespace characters with a single space.
    3. Strips leading and trailing whitespace from the resulting text.

    Parameters:
    ----------
    text : str
        The input text to be cleaned.

    Returns:
    -------
    str
        The cleaned text with URLs removed and whitespace collapsed.

    Example:
    --------
    >>> clean_text("سلام! این یک متن آزمایشی است. http://example.com")
    'سلام! این یک متن آزمایشی است.'

    >>> clean_text("  متون با فاصله های زیاد  ")
    'متون با فاصله های زیاد'
    """
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)
    text = re.sub(r"\s+", " ", text)
    return text.strip()


def normalize_text(text):
    """
    Normalizes the input Persian text by standardizing characters and applying common normalization rules.

    This function uses the `Normalizer` class from the `hazm` library to perform the following tasks:
    1. Standardize Persian characters (e.g., converting Arabic characters to their Persian equivalents).
    2. Apply common normalization rules such as fixing spacing, removing diacritics, and handling special cases.

    Parameters:
    ----------
    text : str
        The input Persian text to be normalized.

    Returns:
    -------
    str
        The normalized Persian text with standardized characters and consistent formatting.

    Example:
    --------
    >>> normalize_text("سلامٔ دوست عزیز، حال شما چطور است؟")
    'سلام دوست عزیز، حال شما چطور است؟'

    >>> normalize_text("متن با اضافهی فاصلههای نامنظم.")
    'متن با اضافهی فاصلههای نامنظم.'
    """
    normalizer = Normalizer()
    text = normalizer.normalize(text)
    return text


def preprocess_persian_text(text):
    """
    Preprocesses Persian text by cleaning and normalizing it.

    This function performs the following steps:
    1. Cleans the input text using the `clean_text` function:
        - Removes URLs and other web-related patterns.
        - Replaces multiple spaces with a single space and strips surrounding whitespace.
    2. Normalizes the cleaned text using the `normalize_text` function:
        - Standardizes Persian characters (e.g., converting Arabic characters to their Persian equivalents).
        - Applies common normalization rules such as fixing spacing and removing diacritics.

    Parameters:
    ----------
    text : str
        The input Persian text to be preprocessed.

    Returns:
    -------
    str
        The preprocessed Persian text, which is cleaned and normalized.

    Example:
    --------
    >>> preprocess_persian_text("سلامٔ دوست عزیز این یک متن آزمایشی است http://example.com")
    'سلام دوست عزیز این یک متن آزمایشی است'

    >>> preprocess_persian_text("  متون با فاصلههای نامنظم و کلمات عربی مثل شیء  ")
    'متون با فاصلههای نامنظم و کلمات عربی مثل شیء'
    """
    text = clean_text(text)
    text = normalize_text(text)
    return text


def read_file(file_path):
    """
    Reads and preprocesses text from Word (.docx), Text (.txt), or PDF (.pdf) files.

    This function supports reading Persian text from the following file formats:
    1. `.docx`: Extracts text from paragraphs in a Word document.
    2. `.txt`: Reads plain text from a text file encoded in UTF-8.
    3. `.pdf`: Extracts text from a PDF file using `pypdf`.

    After extracting the raw text, the function preprocesses it using the `preprocess_persian_text` function,
    which cleans and normalizes the Persian text.

    Parameters:
    ----------
    file_path : str
        The path to the input file. Supported formats are `.docx`, `.txt`, and `.pdf`.

    Returns:
    -------
    str
        The preprocessed Persian text extracted from the file.

    Raises:
    ------
    ValueError
        - If the file format is unsupported (only `.docx`, `.txt`, and `.pdf` are allowed).
        - If the text extracted from a PDF file is deemed not meaningful (e.g., contains control characters).

    Example:
    --------
    >>> read_file("example.docx")
    'سلام دوست عزیز این یک متن آزمایشی است'

    >>> read_file("example.txt")
    'این یک فایل متنی ساده است.'

    >>> read_file("example.pdf")
    'این متن از یک فایل پی دی اف استخراج شده است.'
    """
    if file_path.endswith('.docx'):
        doc = Document(file_path)
        text = "\n".join([para.text for para in doc.paragraphs])
        return preprocess_persian_text(text)
    elif file_path.endswith('.txt'):
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()
        return preprocess_persian_text(text)
    elif file_path.endswith('.pdf'):
        reader = pypdf.PdfReader(file_path)
        raw_data = ""
        for page in reader.pages:
            # extract_text() may return None for pages without extractable text.
            raw_data += page.extract_text() or ""
        if not is_meaningful(raw_data):
            raise ValueError("The text extracted from this PDF is not meaningful (it contains control characters).")
        return preprocess_persian_text(raw_data)
    else:
        raise ValueError("Unsupported file format. Only .docx, .txt, and .pdf are allowed.")
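

# A minimal end-to-end sketch of how these helpers fit together. The file name
# "sample.docx" is only a hypothetical placeholder; any existing .docx, .txt,
# or .pdf path would work.
if __name__ == "__main__":
    example_path = "sample.docx"  # hypothetical input file, adjust as needed
    if os.path.exists(example_path):
        cleaned = read_file(example_path)                 # extract + clean + normalize
        chunks = smart_chunking(cleaned, max_tokens=256)  # sentence-aware chunks
        print(f"Produced {len(chunks)} chunks from {example_path}")
    else:
        print(f"Place a file at {example_path} to run this demo.")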