Spaces:
Runtime error
Runtime error
"""Tabular parser. | |
Contains parsers for tabular data files. | |
""" | |
from pathlib import Path | |
from typing import Any, Dict, List, Union | |
import pandas as pd | |
from gpt_index.readers.file.base_parser import BaseParser | |
class CSVParser(BaseParser):
    """CSV parser.

    Reads a CSV file with the standard-library ``csv`` reader and renders
    every row as its fields joined by ", ".

    Args:
        concat_rows (bool): whether to concatenate all rows into one document.
            If set to False, a Document will be created for each row.
            True by default.

    """

    def __init__(self, *args: Any, concat_rows: bool = True, **kwargs: Any) -> None:
        """Init params."""
        super().__init__(*args, **kwargs)
        self._concat_rows = concat_rows

    def _init_parser(self) -> Dict:
        """Init parser."""
        return {}

    def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]:
        """Parse file.

        Args:
            file (Path): path of the CSV file to read.
            errors (str): decoding error handler forwarded to ``open``.
                "ignore" by default.

        Returns:
            Union[str, List[str]]: a single string with rows joined by a
                newline when ``concat_rows`` is True, otherwise a list with
                one string per row.

        """
        # csv is part of the standard library; imported locally because the
        # module-level imports do not include it.
        import csv

        # newline="" lets the csv module do its own newline handling, which
        # keeps newlines embedded in quoted fields intact. `errors` is passed
        # through so undecodable bytes follow the caller's chosen policy.
        with open(file, "r", newline="", errors=errors) as fp:
            text_list = [", ".join(row) for row in csv.reader(fp)]

        if self._concat_rows:
            return "\n".join(text_list)
        return text_list
class PandasCSVParser(BaseParser):
    r"""Pandas-based CSV parser.

    Parses CSVs using the separator detection from Pandas `read_csv` function.
    If special parameters are required, use the `pandas_config` dict.

    Args:
        concat_rows (bool): whether to concatenate all rows into one document.
            If set to False, a Document will be created for each row.
            True by default.

        col_joiner (str): Separator to use for joining cols per row.
            Set to ", " by default.

        row_joiner (str): Separator to use for joining each row.
            Only used when `concat_rows=True`.
            Set to "\n" by default.

        pandas_config (dict): Options for the `pandas.read_csv` function call.
            Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html
            for more information.
            None by default, which is treated as an empty dict; this means
            pandas will try to figure out the separators, table head, etc.
            on its own.

    """

    def __init__(
        self,
        *args: Any,
        concat_rows: bool = True,
        col_joiner: str = ", ",
        row_joiner: str = "\n",
        pandas_config: Union[dict, None] = None,
        **kwargs: Any
    ) -> None:
        """Init params."""
        super().__init__(*args, **kwargs)
        self._concat_rows = concat_rows
        self._col_joiner = col_joiner
        self._row_joiner = row_joiner
        # A None default (instead of a `{}` literal) avoids sharing a single
        # dict between every instance; the copy keeps later changes to the
        # caller's dict from affecting this parser.
        self._pandas_config = {} if pandas_config is None else dict(pandas_config)

    def _init_parser(self) -> Dict:
        """Init parser."""
        return {}

    def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]:
        """Parse file.

        Args:
            file (Path): path of the CSV file, handed to `pandas.read_csv`
                together with the stored `pandas_config` options.
            errors (str): accepted as part of the method signature but not
                used by this parser.

        Returns:
            Union[str, List[str]]: a single string with rows joined by
                `row_joiner` when `concat_rows` is True, otherwise a list
                with one string per row.

        """
        df = pd.read_csv(file, **self._pandas_config)

        # Render each row as its cell values converted to str and joined by
        # the column separator.
        text_list = df.apply(
            lambda row: self._col_joiner.join(row.astype(str).tolist()), axis=1
        ).tolist()

        if self._concat_rows:
            return self._row_joiner.join(text_list)
        return text_list