import os
import sqlite3
from docx import Document
import re
from hazm import Normalizer
import pypdf
from nltk.tokenize import sent_tokenize
from hazm import SentenceTokenizer  # For Persian sentence tokenization
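
# Note: nltk.sent_tokenize depends on NLTK's "punkt" tokenizer data; if it is
# missing, run `nltk.download("punkt")` once before chunking English text.
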
def smart_chunking(text, max_tokens=1024, tokenizer=None):
    """
    Splits the text into meaningful chunks using sentence boundaries.
    Ensures that each chunk does not exceed the maximum token limit.
    Supports both Persian and English text.
    """
    # Step 1: Split text into sentences
    if any(lang_char in text for lang_char in "ابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهی"):  # Check for Persian characters
        # Use hazm for Persian sentence tokenization
        persian_sent_tokenizer = SentenceTokenizer()
        sentences = persian_sent_tokenizer.tokenize(text)
    else:
        # Use NLTK for English sentence tokenization
        sentences = sent_tokenize(text)

    # Step 2: Initialize variables
    chunks = []
    current_chunk = []
    current_length = 0
    # Step 3: Add sentences to chunks
    for sentence in sentences:
        # Tokenize the sentence to estimate its length
        sentence_tokens = tokenizer.encode(sentence) if tokenizer else sentence.split()
        sentence_length = len(sentence_tokens)

        # If adding the sentence would exceed the limit, close the current (non-empty) chunk
        if current_length + sentence_length > max_tokens and current_chunk:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_length = 0

        # Add the sentence to the current chunk
        current_chunk.append(sentence)
        current_length += sentence_length

    # Add any remaining sentences as the last chunk
    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks
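
# Usage sketch (illustrative, not part of the pipeline): `tokenizer` may be any
# object exposing an `.encode()` method, e.g. a Hugging Face tokenizer; with
# tokenizer=None the chunk size is estimated by whitespace word counts.
#   chunks = smart_chunking(long_persian_text, max_tokens=512)
#   chunks = smart_chunking(english_text, max_tokens=512, tokenizer=my_hf_tokenizer)
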
def is_meaningful(text):
    """
    Determines whether the given text is considered meaningful based on the presence of a specific control character.

    This function checks if the input text contains the ASCII control character '\\x19' (End of Medium).
    If the character is found, the text is deemed not meaningful and the function returns 0. Otherwise,
    the text is considered meaningful and the function returns 1.

    Parameters:
    ----------
    text : str
        The input text to be evaluated for meaningfulness.

    Returns:
    -------
    int
        - 0: If the text contains the '\\x19' control character, indicating it is not meaningful.
        - 1: If the text does not contain the '\\x19' control character, indicating it is meaningful.

    Example:
    --------
    >>> is_meaningful("This is a valid sentence.")
    1
    >>> is_meaningful("Invalid text \\x19 with control character.")
    0
    """
    if "\x19" in text:
        return 0
    return 1
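
# Heuristic note: in practice, control characters such as '\x19' tend to appear
# when pypdf cannot decode a PDF's text layer (e.g. scanned pages or broken font
# mappings), so such files are rejected rather than silently producing garbage.
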
# Step 1: Text Cleaning
def clean_text(text):
    """
    Cleans the input text by removing web-related patterns and normalizing whitespace.

    This function performs the following cleaning steps:
    1. Removes URLs (http, https, www).
    2. Replaces multiple consecutive whitespace characters with a single space.
    3. Optionally restricts the text to Persian characters (Unicode range \\u0600-\\u06FF)
       and spaces; this rule is currently commented out so that digits, Latin text, and
       punctuation are preserved.
    4. Strips leading and trailing whitespace from the resulting text.

    Parameters:
    ----------
    text : str
        The input text to be cleaned.

    Returns:
    -------
    str
        The cleaned text with URLs removed and whitespace collapsed.

    Example:
    --------
    >>> clean_text("سلام! این یک متن آزمایشی است. http://example.com")
    'سلام! این یک متن آزمایشی است.'
    >>> clean_text("  متون   با فاصله های زیاد   ")
    'متون با فاصله های زیاد'
    """
    # Remove URLs and other web-related patterns
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)
    text = re.sub(r"\s+", " ", text)  # Replace multiple whitespace characters with a single space
    # text = re.sub(r"[^\u0600-\u06FF\s]", "", text)  # Keep only Persian characters and spaces
    return text.strip()
# Step 2: Normalization
def normalize_text(text):
    """
    Normalizes the input Persian text by standardizing characters and applying common normalization rules.

    This function uses the `Normalizer` class from the `hazm` library to perform the following tasks:
    1. Standardize Persian characters (e.g., converting Arabic characters to their Persian equivalents).
    2. Apply common normalization rules such as fixing spacing, removing diacritics, and handling special cases.

    Parameters:
    ----------
    text : str
        The input Persian text to be normalized.

    Returns:
    -------
    str
        The normalized Persian text with standardized characters and consistent formatting.

    Example:
    --------
    >>> normalize_text("سلامٔ دوست عزیز، حال شما چطور است؟")
    'سلام دوست عزیز، حال شما چطور است؟'
    >>> normalize_text("متن با اضافهی فاصلههای نامنظم.")
    'متن با اضافهی فاصلههای نامنظم.'
    """
    normalizer = Normalizer()
    text = normalizer.normalize(text)  # Standardize Persian characters and spacing
    return text
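
# Illustrative sketch of what hazm's Normalizer typically does (approximate output):
#   normalize_text("كتاب هاي خوب")   # Arabic 'ك'/'ي' mapped to Persian 'ک'/'ی',
#                                     # suffix spacing corrected, e.g. 'کتاب‌های خوب'
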
# Full Preprocessing Pipeline
def preprocess_persian_text(text):
    """
    Preprocesses Persian text by cleaning and normalizing it.

    This function performs the following steps:
    1. Cleans the input text using the `clean_text` function:
       - Removes URLs and other web-related patterns.
       - Replaces multiple consecutive whitespace characters with a single space.
    2. Normalizes the cleaned text using the `normalize_text` function:
       - Standardizes Persian characters (e.g., converting Arabic characters to their Persian equivalents).
       - Applies common normalization rules such as fixing spacing and removing diacritics.

    Parameters:
    ----------
    text : str
        The input Persian text to be preprocessed.

    Returns:
    -------
    str
        The preprocessed Persian text, which is cleaned and normalized.

    Example:
    --------
    >>> preprocess_persian_text("سلامٔ دوست عزیز! این یک متن آزمایشی است: http://example.com")
    'سلام دوست عزیز! این یک متن آزمایشی است:'
    >>> preprocess_persian_text("  متون با فاصلههای نامنظم و کلمات عربی مثل شیء  ")
    'متون با فاصلههای نامنظم و کلمات عربی مثل شیء'
    """
    text = clean_text(text)
    text = normalize_text(text)
    return text
def read_file(file_path):
    """
    Reads and preprocesses text from Word (.docx), Text (.txt), or PDF (.pdf) files.

    This function supports reading Persian text from the following file formats:
    1. `.docx`: Extracts text from paragraphs in a Word document.
    2. `.txt`: Reads plain text from a text file encoded in UTF-8.
    3. `.pdf`: Extracts text from a PDF file using `pypdf`.

    After extracting the raw text, the function preprocesses it using the `preprocess_persian_text` function,
    which cleans and normalizes the Persian text.

    Parameters:
    ----------
    file_path : str
        The path to the input file. Supported formats are `.docx`, `.txt`, and `.pdf`.

    Returns:
    -------
    str
        The preprocessed Persian text extracted from the file.

    Raises:
    ------
    ValueError
        - If the file format is unsupported (only `.docx`, `.txt`, and `.pdf` are allowed).
        - If the text extracted from a PDF file is deemed not meaningful (e.g., contains control characters).

    Example:
    --------
    >>> read_file("example.docx")
    'سلام دوست عزیز این یک متن آزمایشی است'
    >>> read_file("example.txt")
    'این یک فایل متنی ساده است.'
    >>> read_file("example.pdf")
    'این متن از یک فایل پی دی اف استخراج شده است.'
    """
    if file_path.endswith('.docx'):
        doc = Document(file_path)
        text = "\n".join([para.text for para in doc.paragraphs])
        return preprocess_persian_text(text)
    elif file_path.endswith('.txt'):
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()
        return preprocess_persian_text(text)
    elif file_path.endswith('.pdf'):
        reader = pypdf.PdfReader(file_path)
        raw_data = ""
        for page in reader.pages:
            raw_data += page.extract_text()
        if not is_meaningful(raw_data):
            raise ValueError("The text extracted from this PDF is not meaningful (it contains control characters).")
        return preprocess_persian_text(raw_data)
    else:
        raise ValueError("Unsupported file format. Only .docx, .txt, and .pdf are allowed.")