from typing import List, Set |
from collections import namedtuple |
import random |
import requests |
import json |
from codetiming import Timer |
import streamlit as st |
from digestor import Digestor |
from source import Source |
from scrape_sources import NPRLite, CNNText, stub |
def initialize(limit, rando, use_cache=True):
    """Fetch article stubs from every source, tag them, and cluster them.

    Args:
        limit: Total number of articles to ask for. It is divided evenly
            (integer division) between the sources. When ``None`` each
            source's ``retrieve_cluster_data`` is called without arguments.
        rando: Not used by this function; kept so existing callers that
            pass a second positional argument keep working.
        use_cache: Forwarded to ``perform_ner`` as its ``cache`` argument.

    Returns:
        A tuple ``(article_dict, clusters)``. ``article_dict`` maps each
        stub's ``hed`` to the stub itself; ``clusters`` maps each entity
        string to the list of stubs that ``generate_clusters`` filed
        under it.
    """
    clusters: dict[str, list] = {}
    sources: List[Source] = [
        NPRLite(
            'npr',
            'https://text.npr.org/1001',
            'sshleifer/distilbart-cnn-12-6',
            'dbmdz/bert-large-cased-finetuned-conll03-english'
        ),
        CNNText(
            'cnn',
            'https://lite.cnn.com',
            'sshleifer/distilbart-cnn-12-6',
            'dbmdz/bert-large-cased-finetuned-conll03-english'
        ),
    ]

    # One entry per source, each holding that source's article stubs.
    per_source = []
    for data_source in sources:
        # The second value returned by the source is not needed here.
        if limit is not None:
            c_data, _ = data_source.retrieve_cluster_data(limit // len(sources))
        else:
            c_data, _ = data_source.retrieve_cluster_data()
        per_source.append(c_data)
        st.info(f"Number of clusters from source: {data_source.source_name}\n\t{len(c_data)}")
    print("Finished...moving on to clustering...")

    # Flatten across *all* sources (not just the first two) so adding or
    # removing a source above needs no further change here.
    cluster_data = [article for batch in per_source for article in batch]
    for tup in cluster_data:
        perform_ner(tup, cache=use_cache)
        generate_clusters(clusters, tup)
    st.write(f"""Total number of clusters: {len(clusters)}""")

    # Later stubs overwrite earlier ones that share the same headline.
    article_dict = {article.hed: article for article in cluster_data}
    return article_dict, clusters
def perform_ner(tup:namedtuple('article',['link','hed','entities', 'source']), cache=True):
    """Run entity recognition on a stub's headline and record the entities.

    Args:
        tup: Article stub; its ``hed`` is sent to the NER endpoint and the
            resulting entity strings are appended to its ``entities`` list
            in place.
        cache: Sent to the inference endpoint as the ``use_cache`` option.

    Returns:
        None. ``tup.entities`` is mutated.
    """
    payload = {
        "inputs": tup.hed,
        # The Inference API reads ``use_cache`` from the "options" object;
        # the previous, misspelled "paramters" key is not a recognised field.
        "options": {
            "use_cache": cache,
        },
    }
    # Only the remote query and its post-processing are timed.
    with Timer(name="ner_query_time", logger=None):
        result = ner_results(ner_query(payload))
    tup.entities.extend(result)
@st.cache()
def ner_query(payload):
    """Send ``payload`` to the NER endpoint and return the parsed JSON reply.

    The payload is serialised with ``json.dumps`` and POSTed to
    ``NER_API_URL`` using the module-level ``headers``. The response body
    is decoded as UTF-8 and parsed with ``json.loads``.
    """
    print("making a query....")
    reply = requests.post(NER_API_URL, headers=headers, data=json.dumps(payload))
    body = reply.content.decode("utf-8")
    return json.loads(body)
def generate_clusters(
    the_dict: dict,
    tup : namedtuple('article_stub',[ 'link','hed','entities', 'source'])
) -> dict:
    """File an article stub under each of its entities.

    Args:
        the_dict: Mapping of entity -> list of stubs. Updated in place.
        tup: Article stub whose ``entities`` are used as the keys.

    Returns:
        ``the_dict`` itself, so the declared ``dict`` return type holds.
        Callers that ignore the return value are unaffected.
    """
    for entity in tup.entities:
        # Create the bucket on first sight of an entity, then add the stub.
        the_dict.setdefault(entity, []).append(tup)
    return the_dict
def ner_results(ner_object, groups=True, NER_THRESHOLD=0.5) -> List[str]:
    """Extract the usable entity strings from a token-classification reply.

    Args:
        ner_object: Iterable of dicts, each with a ``'word'``, a ``'score'``
            and a label stored under ``'entity_group'`` (``groups=True``)
            or ``'entity'`` (``groups=False``).
        groups: Selects which label key is read. When False, a leading
            ``'B-'`` or ``'I-'`` tag prefix on the label is stripped.
        NER_THRESHOLD: Items whose score is not strictly greater than this
            are dropped.

    Returns:
        Entity words longer than two characters, de-duplicated within each
        category in first-seen order, listed as people, then places, then
        organisations, then miscellaneous. Words containing ``'#'`` and
        items with an unrecognised label are skipped.
    """
    label_key = 'entity_group' if groups else 'entity'
    # Dicts keep insertion order, which gives a de-duplication whose result
    # does not depend on string hash randomisation (unlike ``set``).
    buckets = {'PER': {}, 'LOC': {}, 'ORG': {}, 'MISC': {}}

    for item in ner_object:
        word = item['word']
        if '#' in word or not item['score'] > NER_THRESHOLD:
            continue
        label = item[label_key]
        if not groups and label[:2] in ('B-', 'I-'):
            label = label[2:]
        bucket = buckets.get(label)
        if bucket is not None and len(word) > 2:
            bucket[word] = None

    return [word for bucket in buckets.values() for word in bucket]
# Endpoint that ner_query() posts headlines to.
NER_API_URL = "https://api-inference.huggingface.co/models/dbmdz/bert-large-cased-finetuned-conll03-english"
# Bearer token is read from the Streamlit secret named 'ato'.
headers = {"Authorization": f"Bearer {st.secrets['ato']}"}

# Run configuration: LIMIT is the total article budget handed to
# initialize(); USE_CACHE is the cache flag handed to initialize() and Digestor.
LIMIT = 20
USE_CACHE = True

# The limit is only echoed to stdout when caching is switched off.
if not USE_CACHE and LIMIT is not None:
    print(f"LIMIT: {LIMIT}")
# Not read anywhere in this part of the script; kept as module-level names.
digests = dict()
out_dicts = []

print("Initializing....")
# initialize() does not use its second positional parameter (`rando`), so the
# cache flag has to be given by keyword as well to actually reach the NER step.
article_dict, clusters = initialize(LIMIT, USE_CACHE, use_cache=USE_CACHE)
if st.button("Refresh topics!"):
    article_dict, clusters = initialize(LIMIT, USE_CACHE, use_cache=USE_CACHE)

selections = []
# The literal string 'None' is the "nothing selected" entry of each dropdown.
choices = list(clusters.keys())
choices.insert(0, 'None')

with st.form(key='columns_in_form'):
    cols = st.columns(3)
    # One dropdown per column; the column index doubles as the widget key.
    for i, col in enumerate(cols):
        selections.append(col.selectbox('Make a Selection', choices, key=i))
    submitted = st.form_submit_button('Submit')
    if submitted:
        selections = [i for i in selections if i is not None]
        with st.spinner(text="Digesting...please wait, this will take a few moments...Maybe check some messages or start reading the latest papers on summarization with transformers...."):
            # Gather the stubs of every chosen cluster, without repeats and
            # in first-seen order.
            chosen = []
            for i in selections:
                if i == 'None':
                    continue
                for j in clusters[i]:
                    if j not in chosen:
                        chosen.append(j)
            digestor = Digestor(timer=Timer(), cache=USE_CACHE, stubs=chosen, user_choices=list(selections))
            digestor.digest()
            digestor.build_digest()
        if len(digestor.text) == 0:
            st.write("You didn't select a topic!")
        else:
            st.write("Your digest is ready:\n")
            st.write(digestor.text)

# Bare expression statement; presumably relies on Streamlit "magic" to render
# the session state as debug output -- confirm before removing.
"st.session_state object:", st.session_state