TopicDig / source.py
m. polinsky
Create source.py
c6e9812 unverified
# source.py provides an abstract dataclass for a data source
from abc import ABC, abstractmethod
from dataclasses import dataclass
from collections import namedtuple
from typing import List, Optional
Summary = namedtuple('Summary',['source','cluster_list','link_ext','hed','dek','date','authors','original_length','summary_text','summary_length','chunk_time', 'query_time', 'mean_query_time', 'summary_time'])
Summary.__doc__ = f"""
Summary: a namedtuple for storing Summaries and relevant metadata.
• Source: A Source object for the source of the summarized document.
• cluster_list: A list of the NER entities detected in this article's hed (headline).
• link_ext: The link extension of the article (on the base url, source's source_url)
• hed, dek: headline and subheader. These are standard industry terms.
Dek is None if not applicable.
• date: Date of publication/update listed in article.
• authors: list of authors, currently a string containing the byline.
• original_length: length of the original article
• cluster_num: Number of clusters the source article appears in
• summary_text: List of summarized chunks.
• summary_length: Length of summary text
• stats for stats
"""
@dataclass
class Source(ABC):
source_name: Optional[str] = ""
source_url: Optional[str] = ""
# Checkpoint str encourages use of source-appropriate models.
source_summarization_checkpoint: Optional[str] = ""
source_ner_checkpoint: Optional[str] = ""
"""
User must implement a source-dependent method
to retrieve data used to create clusters.
This gets called when clustering is performed.
"""
@abstractmethod
def retrieve_cluster_data(self) -> List[namedtuple]:
pass
"""
User must implement a source-dependent method
to retrieve texts for summarization.
This gets called once topics for digestion have been selected.
"""
@abstractmethod
def retrieve_article(self) -> List[namedtuple]:
pass