m. polinsky committed · fed8c31
1 Parent(s): 3195fcb
adding data gathering to digestor.py

digestor.py CHANGED (+39 −0)
@@ -210,3 +210,42 @@ class Digestor:
             digest.append(' '.join(each.summary_text))
 
         self.text = '\n\n'.join(digest)
+
+        # Create dict to write out digest data for analysis
+        out_data = {}
+        t = dt.now()
+        datetime_str = f"{t.hour:02}:{t.minute:02}:{t.second:02}"
+        choices_str = ', '.join(self.user_choices)
+        digest_str = '\n\t'.join(digest)
+
+
+        # This is a long comprehension to store all the fields and values in each summary:
+        # integer index: {
+        #     name_of_field: value, except for source,
+        #     which is unhashable so needs explicit handling
+        # }
+        summaries = {  # k is a summary namedtuple; i, p are each field's index and value
+            # The key c is the summary's position in self.summaries
+            c: {
+                # field name: value, unless the field is the source
+                k._fields[i]: p if k._fields[i] != 'source'
+                else
+                {
+                    'name': k.source.source_name,
+                    'source_url': k.source.source_url,
+                    'Summarization Checkpoint': k.source.source_summarization_checkpoint,
+                    'NER Checkpoint': k.source.source_ner_checkpoint,
+                } for i, p in enumerate(k)
+            } for c, k in enumerate(self.summaries)}
+
+        out_data['timestamp'] = datetime_str
+        out_data['article_count'] = len(self.summaries)
+        out_data['digest_length'] = len(digest_str.split(" "))
+        out_data['sum_params'] = {
+            'token_limit': self.token_limit,
+            'word_limit': self.word_limit,
+            'params': self.SUMMARIZATION_PARAMETERS,
+        }
+        out_data['summaries'] = summaries
+
+        return out_data
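
For reference, the comprehension above flattens each summary namedtuple into a plain dict keyed by its index, expanding the unhashable source object by hand. The following is a minimal, self-contained sketch of that structure; the Summary and Source namedtuples and their field values here are hypothetical stand-ins loosely inferred from the diff, not the project's actual definitions.

import json
from collections import namedtuple
from datetime import datetime as dt

# Hypothetical shapes standing in for the project's real summary objects.
Source = namedtuple('Source', ['source_name', 'source_url',
                               'source_summarization_checkpoint',
                               'source_ner_checkpoint'])
Summary = namedtuple('Summary', ['source', 'cluster_list', 'summary_text'])

summaries_in = [
    Summary(
        source=Source('Example Wire', 'https://example.com/feed',
                      'summarization-checkpoint-name', 'ner-checkpoint-name'),
        cluster_list=['example topic'],
        summary_text=['First sentence of the summary.', 'Second sentence.'],
    ),
]

# Same shape as the commit's comprehension: index -> {field: value},
# with the unhashable Source expanded into a plain dict.
out_summaries = {
    c: {
        k._fields[i]: p if k._fields[i] != 'source'
        else {
            'name': k.source.source_name,
            'source_url': k.source.source_url,
            'Summarization Checkpoint': k.source.source_summarization_checkpoint,
            'NER Checkpoint': k.source.source_ner_checkpoint,
        } for i, p in enumerate(k)
    } for c, k in enumerate(summaries_in)
}

t = dt.now()
out_data = {
    'timestamp': f"{t.hour:02}:{t.minute:02}:{t.second:02}",
    'article_count': len(summaries_in),
    'summaries': out_summaries,
}

# Everything here is plain dicts, lists, and strings, so it can be written
# straight out for analysis, e.g. as JSON.
print(json.dumps(out_data, indent=2))

Because the returned dict contains only plain dicts, lists, and strings, it serializes directly with json.dumps, which fits the "write out digest data for analysis" comment in the commit.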