m. polinsky
committed on
Update digestor.py
Browse files- digestor.py +3 -43
digestor.py
CHANGED
@@ -174,6 +174,7 @@ class Digestor:
|
|
174 |
# API CALLS: consider placing the code from query() into here. * * * *
|
175 |
for chunk in chunklist:
|
176 |
safe = False
|
|
|
177 |
with Timer(name=f"{stubhead}_query_time", logger=None):
|
178 |
while not safe and repeat < 4:
|
179 |
try: # make these digest params.
|
@@ -190,7 +191,8 @@ class Digestor:
|
|
190 |
print("Summarization error, repeating...")
|
191 |
print(e)
|
192 |
repeat+=1
|
193 |
-
|
|
|
194 |
return collection_bin
|
195 |
|
196 |
|
@@ -207,46 +209,4 @@ class Digestor:
|
|
207 |
for each in self.summaries:
|
208 |
digest.append(' '.join(each.summary_text))
|
209 |
|
210 |
-
# Create dict to write out digest data for analysis
|
211 |
-
out_data = {}
|
212 |
-
datetime_str = f"""{dt.now()}"""
|
213 |
-
choices_str = ', '.join(self.user_choices)
|
214 |
-
digest_str = '\n\n'.join(digest)
|
215 |
-
|
216 |
-
|
217 |
-
# This is a long comprehension to store all the fields and values in each summary.
|
218 |
-
# integer: {
|
219 |
-
# name_of_field:value except for source,
|
220 |
-
# which is unhashable so needs explicit handling.
|
221 |
-
# }
|
222 |
-
summaries = { # k is a summary tuple, i,p = enumerate(k)
|
223 |
-
# Here we take the first dozen words of the first summary chunk as key
|
224 |
-
c: {
|
225 |
-
# field name : value unless its the source
|
226 |
-
k._fields[i]:p if k._fields[i]!='source'
|
227 |
-
else
|
228 |
-
{
|
229 |
-
'name': k.source.source_name,
|
230 |
-
'source_url': k.source.source_url,
|
231 |
-
'Summarization" Checkpoint': k.source.source_summarization_checkpoint,
|
232 |
-
'NER Checkpoint': k.source.source_ner_checkpoint,
|
233 |
-
} for i,p in enumerate(k)
|
234 |
-
} for c,k in enumerate(self.summaries)}
|
235 |
-
|
236 |
-
out_data['timestamp'] = datetime_str
|
237 |
-
out_data['choices'] = choices_str
|
238 |
-
out_data['digest_text'] = digest_str
|
239 |
-
out_data['article_count'] = len(self.summaries)
|
240 |
-
out_data['digest_length'] = len(digest_str.split(" "))
|
241 |
-
out_data['digest_time'] = self.timer.timers['digest_time']
|
242 |
-
out_data['sum_params'] = {
|
243 |
-
'token_limit':self.token_limit,
|
244 |
-
'word_limit':self.word_limit,
|
245 |
-
'params':self.SUMMARIZATION_PARAMETERS,
|
246 |
-
}
|
247 |
-
out_data['summaries'] = summaries
|
248 |
-
|
249 |
-
|
250 |
self.text = digest_str
|
251 |
-
|
252 |
-
return out_data
|
|
|
174 |
# API CALLS: consider placing the code from query() into here. * * * *
|
175 |
for chunk in chunklist:
|
176 |
safe = False
|
177 |
+
summarized_chunk = None
|
178 |
with Timer(name=f"{stubhead}_query_time", logger=None):
|
179 |
while not safe and repeat < 4:
|
180 |
try: # make these digest params.
|
|
|
191 |
print("Summarization error, repeating...")
|
192 |
print(e)
|
193 |
repeat+=1
|
194 |
+
if summarized_chunk is not None:
|
195 |
+
collection_bin.append(summarized_chunk)
|
196 |
return collection_bin
|
197 |
|
198 |
|
|
|
209 |
for each in self.summaries:
|
210 |
digest.append(' '.join(each.summary_text))
|
211 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
212 |
self.text = digest_str
|
|
|
|