update code
app.py
CHANGED
@@ -38,9 +38,10 @@ def file_preprocessing(file, skipfirst, skiplast):
     else:
         pages = pages
     print("")
-    print("# pages after
+    print("# pages after skip(s) ##########")
     print("")
     print(pages)
+    print("")
     text_splitter = RecursiveCharacterTextSplitter(
         chunk_size=1000,  # number of characters
         chunk_overlap=100,
@@ -49,16 +50,26 @@ def file_preprocessing(file, skipfirst, skiplast):
     )
     # https://dev.to/eteimz/understanding-langchains-recursivecharactertextsplitter-2846
     texts = text_splitter.split_documents(pages)
+    print("Number of tokens:" + str(len(texts)))
+    print("")
+    print("First three tokens:")
+    print(texts[0])
+    print("")
+    print(texts[1])
+    print("")
+    print(texts[2])
+    print("")
     final_texts = ""
     for text in texts:
         final_texts = final_texts + text.page_content
     return final_texts


+# function to count words in the input
 def preproc_count(filepath, skipfirst, skiplast):
     input_text = file_preprocessing(filepath, skipfirst, skiplast)
     text_length = len(input_text)
-    print("
+    print("Input word count: " f"{text_length:,}")
     return input_text, text_length

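A side note on the chunking shown above: the objects the new debug prints label as "tokens" are the chunks returned by split_documents, each a langchain Document holding up to chunk_size characters of text, with chunk_overlap characters repeated between neighbouring chunks. A minimal sketch of that behaviour, assuming only the RecursiveCharacterTextSplitter import app.py already uses (the sample text and variable names are illustrative):

from langchain.text_splitter import RecursiveCharacterTextSplitter

# roughly 3,000 characters of dummy text, purely for illustration
sample = "word " * 600

splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,    # maximum characters per chunk
    chunk_overlap=100,  # characters shared between neighbouring chunks
)
# create_documents wraps plain strings into Document objects,
# the same type that split_documents returns for PDF pages
chunks = splitter.create_documents([sample])

print(len(chunks))                  # number of chunks (not tokens)
print(len(chunks[0].page_content))  # at most 1,000 characters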
@@ -80,9 +91,10 @@ def llm_pipeline(tokenizer, base_model, input_text, model_source):
     return summary


+# function to count words in the summary
 def postproc_count(summary):
     text_length = len(summary)
-    print("
+    print("Summary word count: " f"{text_length:,}")
     return text_length

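One caveat about the two counter helpers: len() on a Python string returns the number of characters, so the new "Input word count" and "Summary word count" prints (and the "Number of words" figure in st.info further down) actually report character counts. An illustrative snippet, not part of the commit, showing the difference:

def count_words(text: str) -> int:
    # len(text) counts characters; splitting on whitespace counts words
    return len(text.split())

sample_summary = "PDF summarization keeps the key points."
print(len(sample_summary))          # 39 characters
print(count_words(sample_summary))  # 6 words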
@@ -143,7 +155,6 @@ def main():
         truncation=True,
         legacy=False,
         model_max_length=1000,
-        #cache_dir="model_cache"
     )
     if model_source == "Download model":
         base_model = AutoModelForSeq2SeqLM.from_pretrained(
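On the deleted #cache_dir="model_cache" line: cache_dir is a standard keyword of the Hugging Face from_pretrained methods and points downloads at a local folder so they can be reused across runs. Re-enabling it would look roughly like the sketch below, using a small public checkpoint for illustration rather than whatever model app.py actually loads:

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

checkpoint = "t5-small"  # illustrative checkpoint, not necessarily the app's model

tokenizer = AutoTokenizer.from_pretrained(
    checkpoint,
    model_max_length=1000,
    cache_dir="model_cache",  # store and reuse downloads under ./model_cache
)
base_model = AutoModelForSeq2SeqLM.from_pretrained(
    checkpoint,
    cache_dir="model_cache",
)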
@@ -185,13 +196,13 @@ def main():
     postproc_text_length = postproc_count(summary)
     end = time.time()
     duration = end - start
+    print("Duration: " f"{duration:.0f}" + " seconds")
     st.info(
         "PDF Summary | Number of words: "
         f"{postproc_text_length:,}"
         + " | Summarization time: "
         f"{duration:.0f}" + " seconds"
     )
-    #st.code("\n".join(tw.wrap(summary, width=80)), language='md')
     st.success(summary)

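A final note on the string formatting in the new print("Duration: ...") line and the st.info call: the adjacent string literals rely on Python's implicit concatenation, so mixing them with "+" produces the same result as a single f-string. Illustrative only:

duration = 42.0
print("Duration: " f"{duration:.0f}" + " seconds")  # implicit concatenation mixed with "+"
print(f"Duration: {duration:.0f} seconds")          # identical output from one f-string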