init commit
Browse files- README.md +54 -1
- app.py +192 -0
- requirements.txt +16 -0
README.md
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
emoji: 📈
|
4 |
colorFrom: purple
|
5 |
colorTo: purple
|
@@ -10,3 +10,56 @@ pinned: false
|
|
10 |
---
|
11 |
|
12 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
---
|
2 |
+
title: RASA
|
3 |
emoji: 📈
|
4 |
colorFrom: purple
|
5 |
colorTo: purple
|
|
|
10 |
---
|
11 |
|
12 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
13 |
+
|
14 |
+
# RASA: Research Article Summarization App
|
15 |
+
|
16 |
+
## Description
|
17 |
+
|
18 |
+
This application summarizes an uploaded research article PDF using one of two large language models: "LaMini-Flan-T5-77M" (a fine-tuned version of google/flan-t5-small) or "LaMini-GPT-124M" (a fine-tuned version of gpt2). Both models were instruction fine-tuned on the LaMini-instruction dataset, which contains 2.58M samples.
|
19 |
+
|
20 |
+
https://huggingface.co/MBZUAI/LaMini-Flan-T5-77M
|
21 |
+
|
22 |
+
https://huggingface.co/MBZUAI/LaMini-GPT-124M
|
23 |
+
|
24 |
+
## Table of Contents
|
25 |
+
|
26 |
+
- [Installation](#installation)
|
27 |
+
- [Usage](#usage)
|
28 |
+
- [Credits](#credits)
|
29 |
+
- [License](#license)
|
30 |
+
|
31 |
+
## Installation
|
32 |
+
|
33 |
+
Create a virtual python environment. To install the required python application packages, type "pip install -r requirements.txt" in a terminal window within the virtual python environment.
|
34 |
+
|
35 |
+
## Usage
|
36 |
+
|
37 |
+
To run locally, navigate to the project folder and in a terminal window type "streamlit run app.py".
|
38 |
+
|
39 |
+
## Credits
|
40 |
+
|
41 |
+
Written by Walter Jessen
|
42 |
+
|
43 |
+
Based on https://www.youtube.com/watch?v=GIbar_kZzwk
|
44 |
+
|
45 |
+
## MIT License
|
46 |
+
|
47 |
+
Copyright (c) 2023 Walter Jessen
|
48 |
+
|
49 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
50 |
+
of this software and associated documentation files (the "Software"), to deal
|
51 |
+
in the Software without restriction, including without limitation the rights
|
52 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
53 |
+
copies of the Software, and to permit persons to whom the Software is
|
54 |
+
furnished to do so, subject to the following conditions:
|
55 |
+
|
56 |
+
The above copyright notice and this permission notice shall be included in all
|
57 |
+
copies or substantial portions of the Software.
|
58 |
+
|
59 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
60 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
61 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
62 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
63 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
64 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
65 |
+
SOFTWARE.
|
app.py
ADDED
@@ -0,0 +1,192 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import base64
import os

import streamlit as st
import torch
from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document
from langchain.document_loaders.pdf import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from PyPDF2 import PdfReader
from transformers import AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer, pipeline


# notes
# https://huggingface.co/docs/transformers/pad_truncation
11 |
+
|
12 |
+
|
13 |
+
# notes
|
14 |
+
# https://huggingface.co/docs/transformers/pad_truncation
|
15 |
+
|
16 |
+
|
17 |
+
# file loader and preprocessor
def file_preprocessing(file, skipfirst, skiplast):
    """Load a PDF, optionally drop its first/last page, and return its text.

    Args:
        file: Path to the PDF file on disk.
        skipfirst: Truthy to drop the first page (e.g. a title page).
        skiplast: Truthy to drop the last page (e.g. references).

    Returns:
        The text of the remaining pages, split into ~1000-character chunks
        and re-joined into a single string.
    """
    loader = PyMuPDFLoader(file)
    pages = loader.load_and_split()
    # Drop pages by slicing instead of `del` with `&`-combined conditions:
    # slicing is a no-op on short/empty documents, whereas the original
    # `del pages[0]; del pages[-1]` raised IndexError on a one-page PDF
    # when both skips were selected. Debug prints removed.
    start = 1 if skipfirst else 0
    end = len(pages) - 1 if skiplast else len(pages)
    pages = pages[start:max(start, end)]
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,  # number of characters per chunk
        chunk_overlap=100,
        length_function=len,
        separators=["\n\n", "\n", " ", ""],  # default list
    )
    # https://dev.to/eteimz/understanding-langchains-recursivecharactertextsplitter-2846
    texts = text_splitter.split_documents(pages)
    # str.join avoids the quadratic cost of repeated string concatenation
    return "".join(text.page_content for text in texts)
55 |
+
|
56 |
+
|
57 |
+
def preproc_count(filepath, skipfirst, skiplast):
    """Extract text from the PDF at *filepath* and report its length.

    Returns a ``(text, length)`` tuple, where *length* is the character
    count of the extracted text.
    """
    extracted = file_preprocessing(filepath, skipfirst, skiplast)
    return extracted, len(extracted)
61 |
+
|
62 |
+
|
63 |
+
def postproc_count(summary):
    """Return the character count of the generated *summary*."""
    return len(summary)
66 |
+
|
67 |
+
|
68 |
+
# llm pipeline
def llm_pipeline(tokenizer, base_model, input_text):
    """Summarize *input_text* using the supplied model/tokenizer pair.

    Builds a transformers summarization pipeline (300-600 token output,
    truncating over-long input) and returns the generated summary string.
    """
    summarizer = pipeline(
        "summarization",
        model=base_model,
        tokenizer=tokenizer,
        max_length=600,
        min_length=300,
        truncation=True,
    )
    output = summarizer(input_text)
    return output[0]["summary_text"]
81 |
+
|
82 |
+
|
83 |
+
@st.cache_data
def displayPDF(file):
    """Render the PDF at *file* inline as a base64-encoded iframe."""
    with open(file, "rb") as pdf_file:
        encoded = base64.b64encode(pdf_file.read()).decode("utf-8")
    # embed the PDF in an HTML iframe and let Streamlit render the raw HTML
    frame = f'<iframe src="data:application/pdf;base64,{encoded}" width="100%" height="600" type="application/pdf"></iframe>'
    st.markdown(frame, unsafe_allow_html=True)
92 |
+
|
93 |
+
|
94 |
+
# streamlit code
# Use the full browser width so the side-by-side result columns fit.
st.set_page_config(layout="wide")
96 |
+
|
97 |
+
|
98 |
+
def main():
    """Streamlit entry point: upload a PDF, pick a model, and summarize it."""
    st.title("RASA: Research Article Summarization App")
    uploaded_file = st.file_uploader("Upload your PDF file", type=["pdf"])
    if uploaded_file is not None:
        st.subheader("Options")
        col1, col2, col3 = st.columns([1, 1, 2])
        with col1:
            model_names = [
                "T5-Small",
                "BART",
            ]
            selected_model = st.radio("Select a model to use:", model_names)
            if selected_model == "BART":
                checkpoint = "ccdv/lsg-bart-base-16384-pubmed"
                # NOTE(review): trust_remote_code executes model-repo code;
                # required by this checkpoint's custom LSG attention.
                tokenizer = AutoTokenizer.from_pretrained(
                    checkpoint,
                    truncation=True,
                    legacy=False,
                    model_max_length=1000,
                    trust_remote_code=True,
                )
                base_model = AutoModelForSeq2SeqLM.from_pretrained(
                    checkpoint, torch_dtype=torch.float32, trust_remote_code=True
                )
            else:  # default Flan T5 small
                checkpoint = "MBZUAI/LaMini-Flan-T5-77M"
                tokenizer = AutoTokenizer.from_pretrained(
                    checkpoint,
                    truncation=True,
                    legacy=False,
                    model_max_length=1000,
                )
                base_model = AutoModelForSeq2SeqLM.from_pretrained(
                    checkpoint, torch_dtype=torch.float32
                )
        with col2:
            st.write("Skip any pages?")
            skipfirst = st.checkbox("Skip first page")
            skiplast = st.checkbox("Skip last page")
        with col3:
            st.write("Background information (links open in a new window)")
            # Fix: the class/model pairings were swapped — LaMini-Flan-T5-77M
            # is a T5-class model; lsg-bart-base-16384-pubmed is BART-class.
            st.write(
                "Model class: [T5-Small](https://huggingface.co/docs/transformers/main/en/model_doc/t5)"
                " | Specific model: [MBZUAI/LaMini-Flan-T5-77M](https://huggingface.co/MBZUAI/LaMini-Flan-T5-77M)"
            )
            st.write(
                "Model class: [BART](https://huggingface.co/docs/transformers/main/en/model_doc/bart)"
                " | Specific model: [ccdv/lsg-bart-base-16384-pubmed](https://huggingface.co/ccdv/lsg-bart-base-16384-pubmed)"
            )
        if st.button("Summarize"):
            col1, col2 = st.columns(2)
            # Fix: create the scratch directory so the write below cannot
            # fail with FileNotFoundError on a fresh checkout.
            os.makedirs("data", exist_ok=True)
            filepath = "data/" + uploaded_file.name
            with open(filepath, "wb") as temp_file:
                temp_file.write(uploaded_file.read())
            with col1:
                input_text, text_length = preproc_count(filepath, skipfirst, skiplast)
                # Fix: preproc_count returns a character count, so the label
                # now says "characters" instead of the incorrect "words".
                st.info(
                    "Uploaded PDF | Number of characters: "
                    f"{text_length:,}"
                )
                displayPDF(filepath)  # returns None; no point binding it
            with col2:
                with st.spinner("Please wait..."):
                    summary = llm_pipeline(tokenizer, base_model, input_text)
                    text_length = postproc_count(summary)
                    st.info(
                        "PDF Summary | Number of characters: "
                        f"{text_length:,}"
                    )
                    st.success(summary)
168 |
+
|
169 |
+
|
170 |
+
# Global CSS overrides to tighten the spacing of the widgets used above
# (radio labels, markdown paragraphs, checkboxes) and underline links.
st.markdown(
    """<style>
div[class*="stRadio"] > label > div[data-testid="stMarkdownContainer"] > p {
    font-size: 1rem;
    font-weight: 400;
}
div[class*="stMarkdown"] > div[data-testid="stMarkdownContainer"] > p {
    margin-bottom: -15px;
}
div[class*="stCheckbox"] > label {
    margin-bottom: -15px;
}
body > a {
    text-decoration: underline;
}
</style>
""",
    unsafe_allow_html=True,
)


# standard script entry point
if __name__ == "__main__":
    main()
requirements.txt
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
langchain
|
2 |
+
sentence_transformers
|
3 |
+
torch
|
4 |
+
sentencepiece
|
5 |
+
transformers==4.34.0
|
6 |
+
accelerate
|
7 |
+
chromadb
|
8 |
+
pypdf
|
9 |
+
tiktoken
|
10 |
+
streamlit
|
11 |
+
fastapi
|
12 |
+
uvicorn
|
13 |
+
python-multipart
|
14 |
+
aiofiles
|
15 |
+
PyPDF2
|
16 |
+
PyMuPDF
|