Update preprocessing.py

preprocessing.py (+45 -0) CHANGED
@@ -5,6 +5,51 @@ import re
 from hazm import Normalizer
 import pypdf
 
+from nltk.tokenize import sent_tokenize
+from hazm import SentenceTokenizer  # For Persian sentence tokenization
+
+def smart_chunking(text, max_tokens=1024, tokenizer=None):
+    """
+    Splits the text into meaningful chunks using sentence boundaries.
+    Ensures that each chunk does not exceed the maximum token limit.
+    Supports both Persian and English text.
+    """
+    # Step 1: Split text into sentences
+    if any(lang_char in text for lang_char in "ابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهی"):  # Check for Persian characters
+        # Use hazm for Persian sentence tokenization
+        persian_sent_tokenizer = SentenceTokenizer()
+        sentences = persian_sent_tokenizer.tokenize(text)
+    else:
+        # Use NLTK for English sentence tokenization
+        sentences = sent_tokenize(text)
+
+    # Step 2: Initialize variables
+    chunks = []
+    current_chunk = []
+    current_length = 0
+
+    # Step 3: Add sentences to chunks
+    for sentence in sentences:
+        # Tokenize the sentence to estimate its length
+        sentence_tokens = tokenizer.encode(sentence) if tokenizer else sentence.split()
+        sentence_length = len(sentence_tokens)
+
+        # If adding the sentence would exceed the limit, close the current chunk (only if non-empty, so an over-long first sentence never produces an empty chunk)
+        if current_chunk and current_length + sentence_length > max_tokens:
+            chunks.append(" ".join(current_chunk))
+            current_chunk = []
+            current_length = 0
+
+        # Add the sentence to the current chunk
+        current_chunk.append(sentence)
+        current_length += sentence_length
+
+    # Add any remaining sentences as the last chunk
+    if current_chunk:
+        chunks.append(" ".join(current_chunk))
+
+    return chunks
+
 
 def is_meaningful(text):
     """
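For context, a minimal usage sketch of smart_chunking with a real subword tokenizer. This assumes the transformers package is installed and uses bert-base-multilingual-cased purely as an example checkpoint; any tokenizer exposing an encode method works, and with tokenizer=None the function falls back to whitespace token counts.

import nltk
from transformers import AutoTokenizer

from preprocessing import smart_chunking

nltk.download("punkt")  # sent_tokenize needs the punkt models on first use

tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
text = "First sentence of a long document. Second sentence. Third sentence."
chunks = smart_chunking(text, max_tokens=128, tokenizer=tokenizer)

# Each chunk should stay within the token budget for typical input
for i, chunk in enumerate(chunks):
    print(i, len(tokenizer.encode(chunk)), chunk[:60])

Note that because the function never splits inside a sentence, a single sentence longer than max_tokens still becomes its own oversized chunk; callers that need a hard cap should truncate such chunks afterwards.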