Update app.py
app.py CHANGED
@@ -178,7 +178,7 @@ def extract_text_tables_pdfplumber(pdf_file):
         print("No text extracted. The PDF might be image-based.")
         return None, None
 
-def split_text_into_chunks(text, tokenizer, max_tokens=
+def split_text_into_chunks(text, tokenizer, max_tokens=512):
     sentences = nltk.sent_tokenize(text)
     chunks = []
     current_chunk = ''
@@ -331,7 +331,7 @@ else:
     st.write(f"Original text length: {input_length} words")
 
     # Define the maximum number of tokens the model can handle
-    max_input_tokens =
+    max_input_tokens = 512
 
     # Function to split text into chunks based on tokens (modified to avoid overlaps)
     def split_text_into_chunks(text, tokenizer, max_tokens=max_input_tokens):
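Only the signature and the first three lines of the rewritten chunker appear in the diff, and the old default after `max_tokens=` is cut off in this view. For context, a minimal sketch of how a sentence-based chunker of this shape typically continues, assuming a Hugging Face-style tokenizer that exposes `.encode()`; everything past `current_chunk = ''` is an assumption, not code from this commit:

```python
import nltk

nltk.download('punkt', quiet=True)  # sentence model required by sent_tokenize

def split_text_into_chunks(text, tokenizer, max_tokens=512):
    sentences = nltk.sent_tokenize(text)
    chunks = []
    current_chunk = ''
    for sentence in sentences:
        candidate = (current_chunk + ' ' + sentence).strip()
        # Measure the candidate chunk with the model's own tokenizer
        if len(tokenizer.encode(candidate)) <= max_tokens:
            current_chunk = candidate
        else:
            if current_chunk:
                chunks.append(current_chunk)
            # A single sentence longer than max_tokens still overflows;
            # this sketch keeps it as its own chunk for simplicity.
            current_chunk = sentence
    if current_chunk:
        chunks.append(current_chunk)
    return chunks
```

Accumulating whole sentences rather than raw token windows is what keeps the chunks free of mid-sentence overlaps, which matches the "modified to avoid overlaps" comment in the second hunk.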
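Both hunks pin the limit to 512 tokens, a common maximum input size for encoder-decoder summarization models. A hedged sketch of how `max_input_tokens` plausibly feeds the chunker and a summarization pipeline in a Streamlit app like this one; the checkpoint name and the pipeline wiring are illustrative assumptions, not taken from the commit:

```python
from transformers import AutoTokenizer, pipeline

model_name = 'sshleifer/distilbart-cnn-12-6'  # illustrative checkpoint, not named in the commit
tokenizer = AutoTokenizer.from_pretrained(model_name)
summarizer = pipeline('summarization', model=model_name, tokenizer=tokenizer)

max_input_tokens = 512  # mirrors the value hard-coded in both hunks

text = 'long text extracted from the PDF goes here'  # placeholder for the pdfplumber output
chunks = split_text_into_chunks(text, tokenizer, max_tokens=max_input_tokens)

# Summarize each chunk independently, then stitch the pieces together
summaries = [summarizer(chunk, max_length=150, min_length=30)[0]['summary_text']
             for chunk in chunks]
final_summary = ' '.join(summaries)
```

Counting tokens with the same tokenizer the model uses is the detail that makes the hard-coded 512 safe: a word-based count could still overflow the model's true input window.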