"""Token-budgeted extractive summarization using pytextrank.

Ensure spacy, pytextrank, nltk, and tiktoken are installed.
Download the spaCy model first: ``spacy download en_core_web_sm``
"""
import spacy
import pytextrank
import tiktoken
from nltk import download, sent_tokenize
from math import ceil, log
from typing import Literal

# Install nltk sentence-tokenizer data
download('punkt_tab')

# Load the spaCy model and add the pytextrank pipeline
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("textrank")

# Constants
MAX_RETRY_COUNT = 5            # max re-summarization attempts
SENTENCE_BUFFER_INCREMENT = 5  # sentences added/removed per adjustment
ALLOWED_TOKEN_DEVIATION = 250  # acceptable |target - actual| token gap

# Initialize the token encoder for the specified model
encoder = tiktoken.encoding_for_model("gpt-4o")  # Change model name if needed


def get_tokens_with_count(text: str):
    """Encode *text* and return ``(tokens, token_count)``."""
    tokens = encoder.encode(text)
    return tokens, len(tokens)


def calculate_avg_tokens_per_sentence(text: str) -> float:
    """Calculate the average number of tokens per sentence in *text*.

    Args:
        text (str): The input text.

    Returns:
        float: The average number of tokens per sentence; 0 for empty or
        malformed text (avoids division by zero).
    """
    sentences = sent_tokenize(text)
    _, total_tokens = get_tokens_with_count(text)
    if sentences:
        return total_tokens / len(sentences)
    return 0


def calculate_summary_length(num_sentences: int) -> int:
    """Determine the target summary sentence count for *num_sentences*.

    Raises:
        ValueError: If ``num_sentences`` is not positive.
    """
    if num_sentences <= 0:
        raise ValueError("Number of sentences must be greater than zero.")
    if num_sentences <= 12:
        # Smaller corpus: keep roughly two thirds of the sentences
        return max(1, ceil(num_sentences * 2 / 3))
    # Larger corpus: shrink by a logarithmically growing scaling factor
    scaling_factor = 10 + log(num_sentences, 2)
    return max(15, ceil(num_sentences / scaling_factor))


def get_sentence_count(text: str, token_count: int) -> int:
    """Estimate the initial number of summary sentences for a token budget.

    Args:
        text (str): The input text.
        token_count (int): Target token limit for the summary.

    Returns:
        int: Suggested number of sentences for the summary.
    """
    avg_tokens_per_sentence = calculate_avg_tokens_per_sentence(text)
    if avg_tokens_per_sentence < 1:
        avg_tokens_per_sentence = 15  # Fallback default

    total_sentences = len(sent_tokenize(text))

    # Clamp the token-budget estimate to the sentences actually available.
    # int() guards against the float produced by // with a float divisor.
    estimated_sentences = int(min(
        total_sentences,                                   # never exceed total
        max(1, token_count // avg_tokens_per_sentence),    # token-limit based
    ))
    return calculate_summary_length(num_sentences=estimated_sentences)


def summarize(tr, sentence_count: int, level: str) -> str:
    """Generate a summary of *sentence_count* units using pytextrank."""
    summaries = [
        str(sent)
        for sent in tr.summary(
            limit_sentences=sentence_count, preserve_order=True, level=level
        )
    ]
    # NOTE(review): extracted sentences usually already end with '.', so this
    # join can produce doubled periods — kept as-is for output compatibility.
    return ". ".join(summaries)


def get_textrank_summary(
    text: str,
    token_count: int = 100,
    level: Literal["sentence", "paragraph"] = "sentence",
    verbose: bool = True,
):
    """Generate a textrank-based summary near the specified token limit.

    Retries up to MAX_RETRY_COUNT times, growing or shrinking the sentence
    count until the summary's token count is within ALLOWED_TOKEN_DEVIATION
    of the target.

    Args:
        text (str): The input text.
        token_count (int): Desired token limit for the summary.
        level (Literal["sentence", "paragraph"]): Granularity of the summary.
        verbose (bool): Whether to also return retry information for debugging.

    Returns:
        tuple[str, str]: The generated summary and, when *verbose* is true,
        a diagnostic message (empty string otherwise).
    """
    # Analyze the text with spaCy and extract textrank data
    doc = nlp(text)
    tr = doc._.textrank

    # Initial sentence count plus a safety buffer
    sentence_count = get_sentence_count(text=text, token_count=token_count)
    sentence_count += SENTENCE_BUFFER_INCREMENT

    retry_count = 0
    summary_content = ""
    while retry_count <= MAX_RETRY_COUNT:
        summary_content = summarize(tr=tr, sentence_count=sentence_count, level=level)
        _, summary_token_count = get_tokens_with_count(text=summary_content)
        deviation = abs(token_count - summary_token_count)

        if deviation <= ALLOWED_TOKEN_DEVIATION:
            break
        elif summary_token_count > token_count:
            # Too long: drop some sentences (never below one)
            sentence_count = max(1, sentence_count - SENTENCE_BUFFER_INCREMENT)
        else:
            # Too short: add more sentences
            sentence_count += SENTENCE_BUFFER_INCREMENT
        retry_count += 1

    if retry_count > MAX_RETRY_COUNT:
        print("Warning: Max retries reached. Summary may not meet token requirements.")

    if verbose:
        verbose_message = (
            f"**Token Count:** {token_count} | **Sentence Count:** {sentence_count} | "
            f"**Summary Token Count:** {summary_token_count} | **Token count deviation:** {deviation}"
        )
        return summary_content, verbose_message
    return summary_content, ""