m. polinsky
committed on
Create source.py
Browse files
source.py
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# source.py provides an abstract dataclass for a data source
|
2 |
+
from abc import ABC, abstractmethod
|
3 |
+
from dataclasses import dataclass
|
4 |
+
from collections import namedtuple
|
5 |
+
from typing import List, Optional
|
6 |
+
|
7 |
+
# Field order is part of the interface: callers may construct or unpack a
# Summary positionally, so do not reorder these names.
Summary = namedtuple(
    'Summary',
    [
        'source',
        'cluster_list',
        'link_ext',
        'hed',
        'dek',
        'date',
        'authors',
        'original_length',
        'summary_text',
        'summary_length',
        'chunk_time',
        'query_time',
        'mean_query_time',
        'summary_time',
    ],
)
# Plain string (no f-prefix needed: nothing is interpolated). Every bullet
# below corresponds to a field that actually exists on the tuple.
Summary.__doc__ = """
Summary: a namedtuple for storing Summaries and relevant metadata.

• source: A Source object for the source of the summarized document.
• cluster_list: A list of the NER entities detected in this article's hed (headline).
• link_ext: The link extension of the article (on the base url, source's source_url)
• hed, dek: headline and subheader. These are standard industry terms.
    Dek is None if not applicable.
• date: Date of publication/update listed in article.
• authors: list of authors, currently a string containing the byline.
• original_length: length of the original article
• summary_text: List of summarized chunks.
• summary_length: Length of summary text
• chunk_time, query_time, mean_query_time, summary_time: stats for stats
    (timing figures, judging by the names; units are not defined here).
"""
|
24 |
+
|
25 |
+
@dataclass
class Source(ABC):
    """Abstract dataclass describing a data source.

    Concrete sources subclass this and implement both
    ``retrieve_cluster_data`` and ``retrieve_article``; the class cannot be
    instantiated until both are provided.

    Attributes:
        source_name: Name of the source. Defaults to an empty string.
        source_url: Base url of the source. Defaults to an empty string.
        source_summarization_checkpoint: Checkpoint string for the
            summarization model. Defaults to an empty string.
        source_ner_checkpoint: Checkpoint string for the NER model.
            Defaults to an empty string.
    """

    source_name: Optional[str] = ""
    source_url: Optional[str] = ""
    # Checkpoint str encourages use of source-appropriate models.
    source_summarization_checkpoint: Optional[str] = ""
    source_ner_checkpoint: Optional[str] = ""

    @abstractmethod
    def retrieve_cluster_data(self) -> List[namedtuple]:
        """Retrieve the data used to create clusters.

        User must implement this source-dependent method.
        This gets called when clustering is performed.

        Returns:
            A list of namedtuples; the exact tuple type is defined by the
            implementing source.
        """

    @abstractmethod
    def retrieve_article(self) -> List[namedtuple]:
        """Retrieve texts for summarization.

        User must implement this source-dependent method.
        This gets called once topics for digestion have been selected.

        Returns:
            A list of namedtuples; the exact tuple type is defined by the
            implementing source.
        """
|