"""General prompt helper that can help deal with token limitations.

The helper can split text. It can also concatenate text from Node
structs while keeping token limitations in mind.

"""

from typing import Callable, List, Optional

from gpt_index.constants import MAX_CHUNK_OVERLAP
from gpt_index.data_structs.data_structs import Node
from gpt_index.langchain_helpers.chain_wrapper import LLMPredictor
from gpt_index.langchain_helpers.text_splitter import TokenTextSplitter
from gpt_index.prompts.base import Prompt
from gpt_index.utils import globals_helper


class PromptHelper:
    """Prompt helper.

    This utility helps us fill in the prompt, split the text,
    and fill in context information according to necessary token limitations.

    Args:
        max_input_size (int): Maximum input size for the LLM.
        num_output (int): Number of outputs for the LLM.
        max_chunk_overlap (int): Maximum chunk overlap for the LLM.
        embedding_limit (Optional[int]): Maximum number of embeddings to use.
        chunk_size_limit (Optional[int]): Maximum chunk size to use.
        tokenizer (Optional[Callable[[str], List]]): Tokenizer to use.

    """

    def __init__(
        self,
        max_input_size: int,
        num_output: int,
        max_chunk_overlap: int,
        embedding_limit: Optional[int] = None,
        chunk_size_limit: Optional[int] = None,
        tokenizer: Optional[Callable[[str], List]] = None,
        separator: str = " ",
    ) -> None:
        """Init params."""
        self.max_input_size = max_input_size
        self.num_output = num_output
        self.max_chunk_overlap = max_chunk_overlap
        self.embedding_limit = embedding_limit
        self.chunk_size_limit = chunk_size_limit
        # TODO: make configurable
        self._tokenizer = tokenizer or globals_helper.tokenizer
        self._separator = separator
        self.use_chunk_size_limit = chunk_size_limit is not None

    @classmethod
    def from_llm_predictor(
        cls,
        llm_predictor: LLMPredictor,
        max_chunk_overlap: Optional[int] = None,
        embedding_limit: Optional[int] = None,
        chunk_size_limit: Optional[int] = None,
        tokenizer: Optional[Callable[[str], List]] = None,
    ) -> "PromptHelper":
        """Create from llm predictor.

        This will autofill values like max_input_size and num_output.
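
        Example (a sketch; ``llm_predictor`` is assumed to be an
        already-configured LLMPredictor)::

            prompt_helper = PromptHelper.from_llm_predictor(llm_predictor)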

        """
        llm_metadata = llm_predictor.get_llm_metadata()
        max_chunk_overlap = max_chunk_overlap or min(
            MAX_CHUNK_OVERLAP,
            llm_metadata.max_input_size // 10,
        )
        if chunk_size_limit is not None:
            max_chunk_overlap = min(max_chunk_overlap, chunk_size_limit // 10)

        return cls(
            llm_metadata.max_input_size,
            llm_metadata.num_output,
            max_chunk_overlap,
            embedding_limit=embedding_limit,
            chunk_size_limit=chunk_size_limit,
            tokenizer=tokenizer,
        )

    def get_chunk_size_given_prompt(
        self, prompt_text: str, num_chunks: int, padding: Optional[int] = 1
    ) -> int:
        """Get chunk size making sure we can also fit the prompt in.

        Chunk size is computed based on a function of the total input size,
        the prompt length, the number of outputs, and the number of chunks.

        If padding is specified, then we subtract that from the chunk size.
        By default we assume there is a padding of 1 (for the newline between chunks).

        Limit by embedding_limit and chunk_size_limit if specified.

        """
        prompt_tokens = self._tokenizer(prompt_text)
        num_prompt_tokens = len(prompt_tokens)

        # NOTE: if embedding limit is specified, then chunk_size must not be larger than
        # embedding_limit
        result = (
            self.max_input_size - num_prompt_tokens - self.num_output
        ) // num_chunks
        if padding is not None:
            result -= padding

        if self.embedding_limit is not None:
            result = min(result, self.embedding_limit)
        if self.chunk_size_limit is not None and self.use_chunk_size_limit:
            result = min(result, self.chunk_size_limit)

        return result

    def _get_empty_prompt_txt(self, prompt: Prompt) -> str:
        """Get empty prompt text.

        Substitute empty strings in parts of the prompt that have
        not yet been filled out. Skip variables that have already
        been partially formatted. This is used to compute the initial tokens.

        """
        fmt_dict = {
            v: "" for v in prompt.input_variables if v not in prompt.partial_dict
        }
        # TODO: change later from llm=None
        empty_prompt_txt = prompt.format(llm=None, **fmt_dict)
        return empty_prompt_txt

    def get_biggest_prompt(self, prompts: List[Prompt]) -> Prompt:
        """Get biggest prompt.

        Oftentimes we need to fetch the biggest prompt, in order to
        be the most conservative about chunking text. This
        is a helper utility for that.

        """
        empty_prompt_txts = [self._get_empty_prompt_txt(prompt) for prompt in prompts]
        empty_prompt_txt_lens = [len(txt) for txt in empty_prompt_txts]
        biggest_prompt = prompts[
            empty_prompt_txt_lens.index(max(empty_prompt_txt_lens))
        ]
        return biggest_prompt

    def get_text_splitter_given_prompt(
        self, prompt: Prompt, num_chunks: int, padding: Optional[int] = 1
    ) -> TokenTextSplitter:
        """Get text splitter given initial prompt.

        Allows us to get the text splitter which will split up text according
        to the desired chunk size.

        """
        # generate empty_prompt_txt to compute initial tokens
        empty_prompt_txt = self._get_empty_prompt_txt(prompt)
        chunk_size = self.get_chunk_size_given_prompt(
            empty_prompt_txt, num_chunks, padding=padding
        )
        text_splitter = TokenTextSplitter(
            separator=self._separator,
            chunk_size=chunk_size,
            chunk_overlap=self.max_chunk_overlap // num_chunks,
            tokenizer=self._tokenizer,
        )
        return text_splitter

    def get_text_from_nodes(
        self, node_list: List[Node], prompt: Optional[Prompt] = None
    ) -> str:
        """Get text from nodes. Used by tree-structured indices."""
        num_nodes = len(node_list)
        text_splitter = None
        if prompt is not None:
            # add padding given the newline character
            text_splitter = self.get_text_splitter_given_prompt(
                prompt,
                num_nodes,
                padding=1,
            )
        results = []
        for node in node_list:
            text = (
                text_splitter.truncate_text(node.get_text())
                if text_splitter is not None
                else node.get_text()
            )
            results.append(text)

        return "\n".join(results)

    def get_numbered_text_from_nodes(
        self, node_list: List[Node], prompt: Optional[Prompt] = None
    ) -> str:
        """Get text from nodes in the format of a numbered list.

        Used by tree-structured indices.

        """
        num_nodes = len(node_list)
        text_splitter = None
        if prompt is not None:
            # add padding to account for the "(n) " numbering prefix and the newlines
            text_splitter = self.get_text_splitter_given_prompt(
                prompt,
                num_nodes,
                padding=5,
            )
        results = []
        number = 1
        for node in node_list:
            node_text = " ".join(node.get_text().splitlines())
            if text_splitter is not None:
                node_text = text_splitter.truncate_text(node_text)
            text = f"({number}) {node_text}"
            results.append(text)
            number += 1
        return "\n\n".join(results)

    def compact_text_chunks(self, prompt: Prompt, text_chunks: List[str]) -> List[str]:
        """Compact text chunks.

        This will combine text chunks into consolidated chunks
        that more fully "pack" the prompt template given the max_input_size.

        """
        combined_str = "\n\n".join([c.strip() for c in text_chunks if c.strip()])
        # resplit based on self.max_chunk_overlap
        text_splitter = self.get_text_splitter_given_prompt(prompt, 1, padding=1)
        return text_splitter.split_text(combined_str)
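

# Example usage (a minimal sketch; ``qa_prompt`` and ``nodes`` are illustrative
# assumptions, not objects defined in this module):
#
#     prompt_helper = PromptHelper(
#         max_input_size=4096,
#         num_output=256,
#         max_chunk_overlap=20,
#     )
#     # Truncate each node's text so that all nodes, the prompt, and the
#     # expected output fit within max_input_size.
#     context_str = prompt_helper.get_text_from_nodes(nodes, prompt=qa_prompt)
#     # Repack many small chunks into as few prompt-sized chunks as possible.
#     packed_chunks = prompt_helper.compact_text_chunks(
#         qa_prompt, ["chunk one", "chunk two"]
#     )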