adinarayana committed on
Commit
de71dbe
·
verified ·
1 Parent(s): e610de2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +41 -19
app.py CHANGED
@@ -4,32 +4,54 @@ from transformers import pipeline
4
 
5
  import streamlit as st
6
 
 
 
 
7
 
8
- def get_openai_response(df):
9
- summarizer = pipeline("summarization")
10
- return summarizer(df)
11
 
 
 
 
 
12
 
 
 
13
 
14
- ## streamlit app
15
 
16
- st.set_page_config(page_title="Trail Demo")
17
- st.header("Sample")
18
- st.write("UPDATE: This app uses the 'gpt-3.5-turbo-instruct' model through Langchain")
 
 
19
 
 
 
20
 
21
- # input = st.text_input("Enter your query: ", key=input)
22
- st.write(pdfminer.__version__)
 
 
23
 
24
- uploaded_file = st.file_uploader("Choose a file", "pdf")
 
25
  if uploaded_file is not None:
26
- for page_layout in extract_pages(uploaded_file):
27
- for element in page_layout:
28
- df = element
29
- response = get_openai_response(df)
30
-
 
 
 
 
 
 
 
 
 
 
 
31
 
32
- submit = st.button("Generate")
33
- if submit:
34
- st.subheader("The response is")
35
- st.write(response)
 
4
 
5
  import streamlit as st
6
 
7
def preprocess_text(element):
    """Return the cleaned text carried by one PDF layout element.

    The element's ``get_text()`` result is stripped of surrounding
    whitespace, then every character that is neither a word character nor
    whitespace (punctuation and symbols) is removed.

    Args:
        element: An object yielded while iterating a page layout. It is
            expected to expose ``get_text()`` when it carries text.

    Returns:
        The cleaned string, or ``""`` when the element has no callable
        ``get_text`` attribute, so callers can pass every element through
        without an ``AttributeError``.
    """
    # Local import: keeps the function usable even if the module header
    # does not import ``re``.
    import re

    get_text = getattr(element, "get_text", None)
    if not callable(get_text):
        # Not a text-bearing element (presumably a drawn shape or image in
        # pdfminer's layout tree - verify against the caller's input).
        return ""

    text = get_text().strip()

    # Strip punctuation/symbols; word characters and whitespace are kept.
    # NOTE(review): this also removes sentence-ending periods, which may
    # affect summary quality - confirm this is intended.
    # Stop-word removal and lower-casing were considered as optional steps
    # and are intentionally not applied.
    return re.sub(r'[^\w\s]', '', text)
23
 
24
def get_openai_response(text, length=100, model="gpt-3.5-turbo-instruct"):
    """Summarize *text* with a transformers ``summarization`` pipeline.

    Args:
        text: The text to summarize.
        length: Upper bound passed to the pipeline as ``max_length``.
        model: Model identifier handed to ``pipeline(..., model=model)``.
            NOTE(review): the default looks like an OpenAI API model name
            rather than a Hugging Face Hub id; loading it through
            ``pipeline`` presumably fails - confirm and pick a Hub model.

    Returns:
        The pipeline result: a list whose first item is a dict with a
        ``"summary_text"`` key. Blank input yields
        ``[{"summary_text": ""}]`` without loading any model.
    """
    # Nothing to summarize: keep the return shape callers index into.
    if not text or not text.strip():
        return [{"summary_text": ""}]

    summarizer = pipeline("summarization", model=model)
    # truncation=True keeps inputs longer than the model's maximum input
    # size (e.g. a whole PDF) from failing inside the model.
    return summarizer(text, max_length=length, truncation=True)
27
+
28
## Streamlit app
# Flow: choose settings -> upload a PDF -> show the extracted text ->
# summarize on demand when the button is pressed.

st.set_page_config(page_title="Trail Demo")
st.header("PDF Summarizer")

# User options
st.subheader("Settings")
summary_length = st.slider("Summary Length", min_value=50, max_value=500, value=100)
# NOTE(review): "gpt-3.5-turbo-instruct" looks like an OpenAI API model name,
# not a Hugging Face Hub id; selecting it presumably fails when the pipeline
# is loaded - confirm and replace with a Hub summarization model.
summarization_model = st.selectbox("Summarization Model", ["gpt-3.5-turbo-instruct", "t5-small"])

# File upload and processing
# Restrict the picker to PDFs, since the file goes straight to extract_pages.
uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
if uploaded_file is not None:
    with st.spinner("Processing..."):
        pieces = []
        for page_layout in extract_pages(uploaded_file):
            for element in page_layout:
                # Only elements exposing get_text() carry text.
                if not hasattr(element, "get_text"):
                    continue
                cleaned = preprocess_text(element)
                if cleaned:
                    pieces.append(cleaned)
        # Join once; empty pieces are dropped so `text` is falsy when the
        # PDF yields no text, making the error branch below reachable.
        text = "\n".join(pieces)

    if text:
        st.subheader("Extracted Text")
        st.write(text)
        submit = st.button("Generate Summary")
        if submit:
            # st.spinner only shows while used as a context manager.
            with st.spinner("Summarizing..."):
                response = get_openai_response(text, length=summary_length, model=summarization_model)
            st.subheader("Summary")
            st.write(response[0]["summary_text"])
    else:
        st.error("No text found in the PDF.")
57