SOAPAssist / gpt_index /readers /file /tabular_parser.py
AbeerTrial's picture
Upload folder using huggingface_hub
8a58cf3
raw
history blame
3.36 kB
"""Tabular parser.
Contains parsers for tabular data files.
"""
from pathlib import Path
from typing import Any, Dict, List, Union
import pandas as pd
from gpt_index.readers.file.base_parser import BaseParser
class CSVParser(BaseParser):
"""CSV parser.
Args:
concat_rows (bool): whether to concatenate all rows into one document.
If set to False, a Document will be created for each row.
True by default.
"""
def __init__(self, *args: Any, concat_rows: bool = True, **kwargs: Any) -> None:
"""Init params."""
super().__init__(*args, **kwargs)
self._concat_rows = concat_rows
def _init_parser(self) -> Dict:
"""Init parser."""
return {}
def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]:
"""Parse file.
Returns:
Union[str, List[str]]: a string or a List of strings.
"""
try:
import csv
except ImportError:
raise ImportError("csv module is required to read CSV files.")
text_list = []
with open(file, "r") as fp:
csv_reader = csv.reader(fp)
for row in csv_reader:
text_list.append(", ".join(row))
if self._concat_rows:
return "\n".join(text_list)
else:
return text_list
class PandasCSVParser(BaseParser):
r"""Pandas-based CSV parser.
Parses CSVs using the separator detection from Pandas `read_csv`function.
If special parameters are required, use the `pandas_config` dict.
Args:
concat_rows (bool): whether to concatenate all rows into one document.
If set to False, a Document will be created for each row.
True by default.
col_joiner (str): Separator to use for joining cols per row.
Set to ", " by default.
row_joiner (str): Separator to use for joining each row.
Only used when `concat_rows=True`.
Set to "\n" by default.
pandas_config (dict): Options for the `pandas.read_csv` function call.
Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html
for more information.
Set to empty dict by default, this means pandas will try to figure
out the separators, table head, etc. on its own.
"""
def __init__(
self,
*args: Any,
concat_rows: bool = True,
col_joiner: str = ", ",
row_joiner: str = "\n",
pandas_config: dict = {},
**kwargs: Any
) -> None:
"""Init params."""
super().__init__(*args, **kwargs)
self._concat_rows = concat_rows
self._col_joiner = col_joiner
self._row_joiner = row_joiner
self._pandas_config = pandas_config
def _init_parser(self) -> Dict:
"""Init parser."""
return {}
def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]:
"""Parse file."""
df = pd.read_csv(file, **self._pandas_config)
text_list = df.apply(
lambda row: (self._col_joiner).join(row.astype(str).tolist()), axis=1
).tolist()
if self._concat_rows:
return (self._row_joiner).join(text_list)
else:
return text_list