m. polinsky
committed on
Create source.py
Browse files
source.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# source.py provides an abstract dataclass for a data source
|
| 2 |
+
from abc import ABC, abstractmethod
|
| 3 |
+
from dataclasses import dataclass
|
| 4 |
+
from collections import namedtuple
|
| 5 |
+
from typing import List, Optional
|
| 6 |
+
|
| 7 |
+
# Record type for one summarized article plus its metadata and timing stats.
# NOTE(review): an earlier revision of this docstring described a
# `cluster_num` entry, but the tuple has never had such a field.
Summary = namedtuple(
    'Summary',
    [
        'source',
        'cluster_list',
        'link_ext',
        'hed',
        'dek',
        'date',
        'authors',
        'original_length',
        'summary_text',
        'summary_length',
        'chunk_time',
        'query_time',
        'mean_query_time',
        'summary_time',
    ],
)
# Plain (non-f) string: the text contains no interpolated values.
Summary.__doc__ = """
Summary: a namedtuple for storing Summaries and relevant metadata.

• source: A Source object for the source of the summarized document.
• cluster_list: A list of the NER entities detected in this article's hed (headline).
• link_ext: The link extension of the article (on the base url, source's source_url)
• hed, dek: headline and subheader. These are standard industry terms.
    Dek is None if not applicable.
• date: Date of publication/update listed in article.
• authors: list of authors, currently a string containing the byline.
• original_length: length of the original article
• summary_text: List of summarized chunks.
• summary_length: Length of summary text
• chunk_time, query_time, mean_query_time, summary_time: timing statistics
    recorded while producing the summary (units are not specified here).
"""
|
| 24 |
+
|
| 25 |
+
@dataclass
class Source(ABC):
    """Abstract dataclass describing a data source.

    Concrete sources subclass this and implement both retrieval methods;
    the class cannot be instantiated until they are overridden.
    """

    # Name of the source.
    source_name: Optional[str] = ""
    # Base URL of the source.
    source_url: Optional[str] = ""
    # Checkpoint str encourages use of source-appropriate models.
    # Presumably these are model checkpoint identifiers for the
    # summarization and NER models -- confirm against callers.
    source_summarization_checkpoint: Optional[str] = ""
    source_ner_checkpoint: Optional[str] = ""

    @abstractmethod
    def retrieve_cluster_data(self) -> List[namedtuple]:
        """Retrieve the data used to create clusters.

        User must implement this source-dependent method.
        This gets called when clustering is performed.

        Returns:
            A list of namedtuple records.
        """

    @abstractmethod
    def retrieve_article(self) -> List[namedtuple]:
        """Retrieve texts for summarization.

        User must implement this source-dependent method.
        This gets called once topics for digestion have been selected.

        Returns:
            A list of namedtuple records.
        """
|