|
import spacy |
|
import pytextrank |
|
import tiktoken |
|
from nltk import download, sent_tokenize |
|
from math import ceil, log |
|
from typing import Literal |
|
|
|
""" |
|
Ensure Spacy, pytextrank, nltk, and tiktoken are installed. |
|
Download spacy model: |
|
spacy download en_core_web_sm |
|
""" |
|
|
|
|
|
# Tuning knobs for fitting a summary to a token budget (see get_textrank_summary).
MAX_RETRY_COUNT = 5
SENTENCE_BUFFER_INCREMENT = 5
ALLOWED_TOKEN_DEVIATION = 250

# Fetch the NLTK "punkt_tab" resource at import time
# (presumably required by sent_tokenize -- confirm against the NLTK version in use).
download("punkt_tab")

# Shared spaCy pipeline with the pytextrank "textrank" component appended.
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("textrank")

# Shared tokenizer used for every token count in this module.
encoder = tiktoken.encoding_for_model("gpt-4o")
|
|
|
def get_tokens_with_count(text: str):
    """
    Encode ``text`` with the module-level encoder.

    Args:
        text (str): The text to encode.

    Returns:
        tuple: ``(tokens, count)`` where ``tokens`` is the encoder output
        for ``text`` and ``count`` is its length.
    """
    encoded = encoder.encode(text)
    token_total = len(encoded)
    return encoded, token_total
|
|
|
def calculate_avg_tokens_per_sentence(text: str):
    """
    Compute the mean number of tokens per sentence in ``text``.

    Sentences are counted with ``sent_tokenize`` and tokens with
    ``get_tokens_with_count`` over the whole text.

    Args:
        text (str): The input text.

    Returns:
        float: Total tokens divided by the sentence count, or 0 when no
        sentences are found.
    """
    sentence_total = len(sent_tokenize(text))
    token_total = get_tokens_with_count(text)[1]

    # Guard against dividing by zero when the text yields no sentences.
    if sentence_total <= 0:
        return 0
    return token_total / sentence_total
|
|
|
def calculate_summary_length(num_sentences: int):
    """
    Determine the target summary sentence count for a given input size.

    Inputs of up to 12 sentences keep about two thirds of their sentences
    (at least one). Larger inputs are divided by ``10 + log2(n)``, with a
    floor of 15 sentences. The result is never larger than the number of
    sentences available.

    Args:
        num_sentences (int): Number of sentences available for the summary.

    Returns:
        int: Target number of sentences for the summary.

    Raises:
        ValueError: If ``num_sentences`` is not greater than zero.
    """
    if num_sentences <= 0:
        raise ValueError("Number of sentences must be greater than zero.")

    if num_sentences <= 12:
        return max(1, ceil(num_sentences * 2 / 3))

    scaling_factor = 10 + log(num_sentences, 2)
    target = max(15, ceil(num_sentences / scaling_factor))

    # The floor of 15 would otherwise ask for more sentences than exist when
    # the input has 13 or 14 of them. ceil() also keeps the result an int if
    # a whole-number float is passed in.
    return min(ceil(num_sentences), target)
|
|
|
def get_sentence_count(text: str, token_count: int):
    """
    Estimate the initial number of summary sentences for a token budget.

    The budget is divided by the text's average tokens per sentence to
    estimate how many sentences fit, capped at the number of sentences in
    the text, and the estimate is then mapped through
    ``calculate_summary_length``.

    Args:
        text (str): The input text.
        token_count (int): Target token limit for the summary.

    Returns:
        int: Suggested number of sentences for the summary.

    Raises:
        ValueError: Raised by ``calculate_summary_length`` when ``text``
            contains no sentences (the estimate is then zero).
    """
    avg_tokens_per_sentence = calculate_avg_tokens_per_sentence(text)
    if avg_tokens_per_sentence < 1:
        # Fall back to a nominal sentence length so the division below is safe.
        avg_tokens_per_sentence = 15

    total_sentences = len(sent_tokenize(text))

    # Floor division by a float average yields a float; convert so the
    # sentence estimate stays an integer as the helper's signature expects.
    sentences_in_budget = max(1, int(token_count // avg_tokens_per_sentence))
    estimated_sentences = min(total_sentences, sentences_in_budget)

    return calculate_summary_length(num_sentences=estimated_sentences)
|
|
|
def summarize(tr, sentence_count: int, level: str):
    """
    Generate a summary string using pytextrank.

    Args:
        tr: Object exposing pytextrank's ``summary`` method (callers in this
            file pass ``doc._.textrank``).
        sentence_count (int): Passed to ``summary`` as ``limit_sentences``.
        level (str): Passed to ``summary`` as ``level``.

    Returns:
        str: The selected units in their original order, separated by a
        single space. A period is inserted between two units only when the
        earlier one does not already end in terminal punctuation. Empty
        string when nothing is selected.
    """
    terminal_marks = (".", "!", "?", ":", ";")
    closing_marks = "\"')]}"

    pieces = []
    for unit in tr.summary(
        limit_sentences=sentence_count, preserve_order=True, level=level
    ):
        unit_text = str(unit).strip()
        if not unit_text:
            continue
        if pieces:
            # Units normally carry their own end punctuation; joining them
            # with ". " unconditionally would produce "First.. Second.".
            previous = pieces[-1].rstrip(closing_marks)
            pieces.append(" " if previous.endswith(terminal_marks) else ". ")
        pieces.append(unit_text)

    return "".join(pieces)
|
|
|
def get_textrank_summary(
    text: str,
    token_count: int = 100,
    level: Literal["sentence", "paragraph"] = "sentence",
    verbose: bool = True,
):
    """
    Generate a textrank-based summary close to the requested token count.

    The text is run through the spaCy pipeline once. The summary is then
    regenerated with a larger or smaller sentence limit until its token
    count is within ``ALLOWED_TOKEN_DEVIATION`` of ``token_count``, for at
    most ``MAX_RETRY_COUNT + 1`` attempts. A warning is printed if no
    attempt lands within tolerance; the last summary is returned anyway.

    Args:
        text (str): The input text.
        token_count (int): Desired token count for the summary.
        level (Literal["sentence", "paragraph"]): Granularity of the summary.
        verbose (bool): Whether to return a diagnostic message describing
            the final attempt.

    Returns:
        tuple: ``(summary, message)``. ``message`` is a formatted
        diagnostic string when ``verbose`` is true, otherwise ``""``.

    Raises:
        ValueError: Propagated from ``get_sentence_count`` when ``text``
            contains no sentences.
    """
    doc = nlp(text)
    tr = doc._.textrank

    # Start a little above the estimate; the loop below trims or grows it.
    sentence_count = get_sentence_count(text=text, token_count=token_count)
    sentence_count += SENTENCE_BUFFER_INCREMENT

    within_tolerance = False
    for _ in range(MAX_RETRY_COUNT + 1):
        # Remember the limit that produced this attempt so the diagnostic
        # message describes the summary actually returned.
        used_sentence_count = sentence_count
        summary_content = summarize(
            tr=tr, sentence_count=used_sentence_count, level=level
        )
        _, summary_token_count = get_tokens_with_count(text=summary_content)
        deviation = abs(token_count - summary_token_count)

        if deviation <= ALLOWED_TOKEN_DEVIATION:
            within_tolerance = True
            break

        if summary_token_count > token_count:
            sentence_count = max(1, sentence_count - SENTENCE_BUFFER_INCREMENT)
        else:
            sentence_count += SENTENCE_BUFFER_INCREMENT

        if sentence_count == used_sentence_count:
            # Already at the one-sentence minimum and still too long:
            # further attempts would reproduce the same summary.
            break

    if not within_tolerance:
        print("Warning: Max retries reached. Summary may not meet token requirements.")

    if verbose:
        verbose_message = f"**Token Count:** {token_count} | **Sentence Count:** {used_sentence_count} | **Summary Token Count:** {summary_token_count} | **Token count deviation:** {deviation}"
        return summary_content, verbose_message

    return summary_content, ""
|
|