adinarayana committed on
Commit
a8825e5
·
verified ·
1 Parent(s): df6d309

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +5 -66
app.py CHANGED
@@ -2,7 +2,7 @@ import os
2
  import re
3
  import pdfminer
4
  from pdfminer.high_level import extract_pages
5
- from transformers import pipeline, QuestionAnsweringModel, QuestionAnsweringProcessor
6
 
7
  import streamlit as st
8
 
@@ -43,10 +43,10 @@ def answer_question(text, question):
43
  """
44
  qa_model_name = "deepset/roberta-base-squad2" # Replace with your chosen model
45
 
46
- qa_model = QuestionAnsweringModel.from_pretrained(qa_model_name)
47
- qa_processor = QuestionAnsweringProcessor.from_pretrained(qa_model_name)
48
 
49
- inputs = qa_processor(question, text, return_tensors="pt")
50
  outputs = qa_model(**inputs)
51
 
52
  start_scores, end_scores = outputs.start_logits, outputs.end_logits
@@ -82,7 +82,7 @@ if uploaded_file is not None:
82
  summarize_button = st.button("Generate Summary")
83
  if summarize_button:
84
  with st.spinner("Summarizing..."):
85
- summary_response = get_openai_response(text, min_length=min_summary_length, model=summarization_model)
86
  st.subheader("Summary")
87
  st.write(summary_response[0]["summary_text"])
88
  if question:
@@ -92,64 +92,3 @@ if uploaded_file is not None:
92
  st.write(answer)
93
  else:
94
  st.error("No text found in the PDF.")
95
-
96
-
97
-
98
-
99
- # import os
100
- # import re
101
- # import pdfminer
102
- # from pdfminer.high_level import extract_pages
103
- # from transformers import pipeline
104
-
105
- # import streamlit as st
106
-
107
- # def preprocess_text(element):
108
- # if isinstance(element, pdfminer.layout.LTTextBoxHorizontal): # Check for text elements
109
- # text = element.get_text().strip()
110
- # # Remove non-textual elements
111
- # text = re.sub(r'[^\w\s]', '', text) # Replace with your preferred regular expression
112
-
113
- # # Remove stop words (optional)
114
- # # from nltk.corpus import stopwords
115
- # # stop_words = set(stopwords.words('english'))
116
- # # text = " ".join([word for word in text.split() if word not in stop_words])
117
-
118
- # # Convert to lowercase (optional)
119
- # # text = text.lower()
120
- # return text
121
- # else:
122
- # return ""
123
-
124
- # def get_openai_response(text, min_length=100, model="t5-small"):
125
- # summarizer = pipeline("summarization", model=model)
126
- # return summarizer(text, min_length=min_length)
127
-
128
- # ## Streamlit app
129
-
130
- # st.set_page_config(page_title="Trail Demo")
131
- # st.header("PDF Summarizer")
132
-
133
- # # User options
134
- # st.subheader("Settings")
135
- # min_summary_length = st.slider("Minimum Summary Length", min_value=50, max_value=500, value=100)
136
- # # max_summary_length = st.slider("Maximum Summary Length", min_value=50, max_value=500, value=100)
137
- # summarization_model = st.selectbox("Summarization Model", ["t5-small", "facebook/bart-large-cnn"])
138
-
139
- # # File upload and processing
140
- # uploaded_file = st.file_uploader("Choose a PDF file")
141
- # if uploaded_file is not None:
142
- # with st.spinner("Processing..."):
143
- # text = ""
144
- # for page_layout in extract_pages(uploaded_file):
145
- # for element in page_layout:
146
- # text += preprocess_text(element) + "\n"
147
- # if text:
148
- # submit = st.button("Generate Summary")
149
- # if submit:
150
- # with st.spinner("Summarizing..."):
151
- # response = get_openai_response(text, min_length=min_summary_length, model=summarization_model)
152
- # st.subheader("Summary")
153
- # st.write(response[0]["summary_text"])
154
- # else:
155
- # st.error("No text found in the PDF.")
 
2
  import re
3
  import pdfminer
4
  from pdfminer.high_level import extract_pages
5
+ from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer
6
 
7
  import streamlit as st
8
 
 
43
  """
44
  qa_model_name = "deepset/roberta-base-squad2" # Replace with your chosen model
45
 
46
+ qa_model = AutoModelForQuestionAnswering.from_pretrained(qa_model_name)
47
+ tokenizer = AutoTokenizer.from_pretrained(qa_model_name)
48
 
49
+ inputs = tokenizer(question, text, return_tensors="pt") # Tokenize inputs
50
  outputs = qa_model(**inputs)
51
 
52
  start_scores, end_scores = outputs.start_logits, outputs.end_logits
 
82
  summarize_button = st.button("Generate Summary")
83
  if summarize_button:
84
  with st.spinner("Summarizing..."):
85
+ summary_response = pipeline("summarization", model=summarization_model)(text, min_length=min_summary_length)
86
  st.subheader("Summary")
87
  st.write(summary_response[0]["summary_text"])
88
  if question:
 
92
  st.write(answer)
93
  else:
94
  st.error("No text found in the PDF.")