Danielrahmai1991 committed on
Commit
3c59de5
·
verified ·
1 Parent(s): fb74abb

Update preprocessing.py

Browse files
Files changed (1) hide show
  1. preprocessing.py +45 -0
preprocessing.py CHANGED
@@ -5,6 +5,51 @@ import re
5
  from hazm import Normalizer
6
  import pypdf
7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
  def is_meaningful(text):
10
  """
 
5
  from hazm import Normalizer
6
  import pypdf
7
 
8
+ from nltk.tokenize import sent_tokenize
9
+ from hazm import SentenceTokenizer # For Persian sentence tokenization
10
+
11
def smart_chunking(text, max_tokens=1024, tokenizer=None):
    """
    Split text into chunks along sentence boundaries.

    Sentences are packed greedily, in order, into chunks whose estimated
    token count stays within ``max_tokens``. Text containing any Persian
    letter is split with hazm's ``SentenceTokenizer``; all other text is
    split with NLTK's ``sent_tokenize``.

    Args:
        text: The text to split.
        max_tokens: Upper bound on the estimated token count of a chunk.
        tokenizer: Optional object exposing ``encode(str)``; the length of
            its result is used as a sentence's token count. When omitted,
            whitespace-separated words are counted instead.

    Returns:
        A list of chunk strings, each made of whole sentences joined by a
        single space. Empty input yields an empty list.

    Note:
        Sentences are never split, so a single sentence that is longer than
        ``max_tokens`` is emitted as its own (oversized) chunk.
    """
    # Nothing to split; also avoids invoking the sentence tokenizers at all.
    if not text:
        return []

    # Step 1: Split text into sentences, choosing the splitter by script.
    persian_letters = "ابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهی"
    if any(letter in text for letter in persian_letters):
        # Use hazm for Persian sentence tokenization
        sentences = SentenceTokenizer().tokenize(text)
    else:
        # Use NLTK for English sentence tokenization
        sentences = sent_tokenize(text)

    # Step 2: Greedily pack sentences into chunks.
    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in sentences:
        # Estimate the sentence length in tokens. Compare against None
        # explicitly: a tokenizer object may define __len__, which would make
        # a plain truthiness test depend on its size.
        if tokenizer is not None:
            sentence_length = len(tokenizer.encode(sentence))
        else:
            sentence_length = len(sentence.split())

        # Start a new chunk when this sentence would overflow the current
        # one. Only flush a non-empty chunk; otherwise an oversized first
        # sentence would emit a spurious "" chunk.
        if current_chunk and current_length + sentence_length > max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_length = 0

        current_chunk.append(sentence)
        current_length += sentence_length

    # Add any remaining sentences as the last chunk
    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks
52
+
53
 
54
  def is_meaningful(text):
55
  """