Update app.py
app.py CHANGED
@@ -178,7 +178,7 @@ def extract_text_tables_pdfplumber(pdf_file):
         print("No text extracted. The PDF might be image-based.")
         return None, None
 
-def split_text_into_chunks(text, tokenizer, max_tokens=
+def split_text_into_chunks(text, tokenizer, max_tokens=512):
     sentences = nltk.sent_tokenize(text)
     chunks = []
     current_chunk = ''
@@ -331,7 +331,7 @@ else:
     st.write(f"Original text length: {input_length} words")
 
     # Define the maximum number of tokens the model can handle
-    max_input_tokens =
+    max_input_tokens = 512
 
     # Function to split text into chunks based on tokens (modified to avoid overlaps)
     def split_text_into_chunks(text, tokenizer, max_tokens=max_input_tokens):
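Only the signature and the first three lines of the rewritten chunker appear in the diff, and the old default after `max_tokens=` is cut off in this view. For context, a minimal sketch of how a sentence-based chunker of this shape typically continues, assuming a Hugging Face-style tokenizer that exposes `.encode()`; everything past `current_chunk = ''` is an assumption, not code from this commit:

```python
import nltk

nltk.download('punkt', quiet=True)  # sentence model required by sent_tokenize

def split_text_into_chunks(text, tokenizer, max_tokens=512):
    sentences = nltk.sent_tokenize(text)
    chunks = []
    current_chunk = ''
    for sentence in sentences:
        candidate = (current_chunk + ' ' + sentence).strip()
        # Measure the candidate chunk with the model's own tokenizer
        if len(tokenizer.encode(candidate)) <= max_tokens:
            current_chunk = candidate
        else:
            if current_chunk:
                chunks.append(current_chunk)
            # A single sentence longer than max_tokens still overflows;
            # this sketch keeps it as its own chunk for simplicity.
            current_chunk = sentence
    if current_chunk:
        chunks.append(current_chunk)
    return chunks
```

Accumulating whole sentences rather than raw token windows is what keeps the chunks free of mid-sentence overlaps, which matches the "modified to avoid overlaps" comment in the second hunk.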
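Both hunks pin the limit to 512 tokens, a common maximum input size for encoder-decoder summarization models. A hedged sketch of how `max_input_tokens` plausibly feeds the chunker and a summarization pipeline in a Streamlit app like this one; the checkpoint name and the pipeline wiring are illustrative assumptions, not taken from the commit:

```python
from transformers import AutoTokenizer, pipeline

model_name = 'sshleifer/distilbart-cnn-12-6'  # illustrative checkpoint, not named in the commit
tokenizer = AutoTokenizer.from_pretrained(model_name)
summarizer = pipeline('summarization', model=model_name, tokenizer=tokenizer)

max_input_tokens = 512  # mirrors the value hard-coded in both hunks

text = 'long text extracted from the PDF goes here'  # placeholder for the pdfplumber output
chunks = split_text_into_chunks(text, tokenizer, max_tokens=max_input_tokens)

# Summarize each chunk independently, then stitch the pieces together
summaries = [summarizer(chunk, max_length=150, min_length=30)[0]['summary_text']
             for chunk in chunks]
final_summary = ' '.join(summaries)
```

Counting tokens with the same tokenizer the model uses is the detail that makes the hard-coded 512 safe: a word-based count could still overflow the model's true input window.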