File size: 4,743 Bytes
b699122
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
"""Default query for GPTPandasIndex."""

import logging
from typing import Any, Callable, Optional

import pandas as pd
from langchain.input import print_text

from gpt_index.data_structs.table_v2 import PandasStructTable
from gpt_index.indices.query.base import BaseGPTIndexQuery
from gpt_index.indices.query.schema import QueryBundle
from gpt_index.prompts.default_prompts import DEFAULT_PANDAS_PROMPT
from gpt_index.prompts.prompts import PandasPrompt
from gpt_index.response.schema import Response

logger = logging.getLogger(__name__)


# Default instruction text supplied to the Pandas prompt as `instruction_str`
# when the caller does not provide one. It asks the model for Python/Pandas
# code whose last line is an expression that `eval()` can evaluate.
DEFAULT_INSTRUCTION_STR = (
    "We wish to convert this query to executable Python code using Pandas.\n"
    "The final line of code should be a Python expression "
    "that can be called with the `eval()` function. "
    "This expression should represent a solution to the query."
)


def default_output_processor(
    output: str, df: pd.DataFrame, **output_kwargs: Any
) -> str:
    """Execute generated Pandas code and return its result as a string.

    Every statement except the last is run with ``exec``; the last statement
    is then evaluated with ``eval`` and its value converted with ``str``.
    The dataframe is visible to the executed code under the name ``df``.

    Args:
        output: Python source code to run (e.g. the LLM's reply).
        df: Dataframe exposed to the executed code as ``df``.
        **output_kwargs: Accepted for interface compatibility; not used by
            this default implementation.

    Returns:
        The stringified value of the final expression. Any failure (syntax
        error, runtime error, a final statement that is not an expression)
        is reported as an error-message string rather than raised. On
        Python < 3.9, where ``ast.unparse`` does not exist, nothing is
        executed and ``output`` is returned unchanged.

    """
    import ast
    import sys
    import traceback

    if sys.version_info < (3, 9):
        logger.warning(
            "Python version must be >= 3.9 in order to use "
            "the default output processor, which executes "
            "the Python query. Instead, we will return the "
            "raw Python instructions as a string."
        )
        return output

    # SECURITY: `output` is model-generated code and is run with exec/eval
    # without any sandboxing. Only use this processor with trusted inputs.
    #
    # A single dict serves as both globals and locals. With separate dicts,
    # functions/lambdas defined by the generated code resolve free names
    # against the (empty) globals and cannot see `df` or earlier variables.
    namespace = {"df": df}

    # NOTE: inspired from langchain's tool
    # see langchain.tools.python.tool (PythonAstREPLTool)
    try:
        tree = ast.parse(output)
        # Run everything but the last statement for its side effects ...
        setup = ast.Module(tree.body[:-1], type_ignores=[])
        exec(ast.unparse(setup), namespace)  # type: ignore
        # ... then evaluate the last statement to obtain the answer.
        final = ast.Module(tree.body[-1:], type_ignores=[])
        return str(eval(ast.unparse(final), namespace))  # type: ignore
    except Exception as e:
        err_string = (
            "There was an error running the output as Python code. "
            f"Error message: {e}"
        )
        traceback.print_exc()
        return err_string


class GPTNLPandasIndexQuery(BaseGPTIndexQuery[PandasStructTable]):
    """Query class that answers natural-language questions over a DataFrame.

    The query string is sent to the LLM together with a preview of the
    dataframe and an instruction string. The LLM's reply (expected to be
    Pandas code) is handed to an output processor, whose return value
    becomes the response.

    .. code-block:: python

        response = index.query("<query_str>", mode="default")

    Args:
        index_struct (PandasStructTable): Index struct forwarded to the base
            query class.
        df (pd.DataFrame): Pandas dataframe to query. Required despite the
            ``None`` default; a ``ValueError`` is raised when it is missing.
        instruction_str (Optional[str]): Instruction string passed to the
            prompt. Falls back to ``DEFAULT_INSTRUCTION_STR``.
        output_processor (Optional[Callable]): Callable invoked as
            ``output_processor(llm_output, df, **output_kwargs)``; its return
            value is used as the response. Falls back to
            ``default_output_processor``.
        pandas_prompt (Optional[PandasPrompt]): Prompt to use. Falls back to
            ``DEFAULT_PANDAS_PROMPT``.
        output_kwargs (Optional[dict]): Extra keyword arguments for the
            output processor. Defaults to an empty dict.
        head (int): Number of leading rows shown in the table context.

    """

    def __init__(
        self,
        index_struct: PandasStructTable,
        df: Optional[pd.DataFrame] = None,
        instruction_str: Optional[str] = None,
        output_processor: Optional[Callable] = None,
        pandas_prompt: Optional[PandasPrompt] = None,
        output_kwargs: Optional[dict] = None,
        head: int = 5,
        **kwargs: Any,
    ) -> None:
        """Store the dataframe and resolve defaults for unset options."""
        super().__init__(index_struct=index_struct, **kwargs)
        if df is None:
            raise ValueError("df must be provided.")

        # Data to query and how much of it to preview.
        self.df = df
        self._head = head

        # Any falsy option falls back to its module-level default.
        self._output_kwargs = output_kwargs if output_kwargs else {}
        self._output_processor = (
            output_processor if output_processor else default_output_processor
        )
        self._instruction_str = (
            instruction_str if instruction_str else DEFAULT_INSTRUCTION_STR
        )
        self._pandas_prompt = (
            pandas_prompt if pandas_prompt else DEFAULT_PANDAS_PROMPT
        )

    def _get_table_context(self) -> str:
        """Return the first ``head`` rows of the dataframe as text."""
        preview = self.df.head(self._head)
        return str(preview)

    def _query(self, query_bundle: QueryBundle) -> Response:
        """Generate Pandas instructions with the LLM and process them."""
        table_preview = self._get_table_context()

        predictor = self._service_context.llm_predictor
        # The predictor returns a pair; only the first element is used here.
        instructions, _unused = predictor.predict(
            self._pandas_prompt,
            df_str=table_preview,
            query_str=query_bundle.query_str,
            instruction_str=self._instruction_str,
        )
        if self._verbose:
            print_text(f"> Pandas Instructions:\n" f"```\n{instructions}\n```\n")

        # Hand the generated instructions to the configured processor.
        processed = self._output_processor(
            instructions,
            self.df,
            **self._output_kwargs,
        )
        if self._verbose:
            print_text(f"> Pandas Output: {processed}\n")

        # Keep the raw instructions alongside the processed answer.
        return Response(
            response=processed,
            extra_info={"pandas_instruction_str": instructions},
        )