|
from typing import Dict, List, Optional
|
|
from pathlib import Path
|
|
import pandas as pd
|
|
from llama_index.readers.file import CSVReader
|
|
from llama_index.schema import Document
|
|
|
|
class EnhancedCSVReader:
|
|
"""Enhanced CSV reader with metadata extraction capabilities."""
|
|
|
|
def __init__(self):
|
|
self.csv_reader = CSVReader()
|
|
|
|
def load_data(self, file_path: str) -> List[Document]:
|
|
"""Load CSV file and extract documents with metadata."""
|
|
|
|
documents = self.csv_reader.load_data(file_path)
|
|
|
|
|
|
csv_metadata = self._extract_metadata(file_path)
|
|
|
|
|
|
for doc in documents:
|
|
doc.metadata.update(csv_metadata)
|
|
|
|
return documents
|
|
|
|
def _extract_metadata(self, file_path: str) -> Dict:
|
|
"""Extract useful metadata from CSV file."""
|
|
df = pd.read_csv(file_path)
|
|
filename = Path(file_path).name
|
|
|
|
|
|
columns = df.columns.tolist()
|
|
dtypes = {col: str(df[col].dtype) for col in columns}
|
|
|
|
|
|
samples = {}
|
|
for col in columns:
|
|
non_null_values = df[col].dropna().head(3).tolist()
|
|
samples[col] = [str(val) for val in non_null_values]
|
|
|
|
|
|
row_count = len(df)
|
|
|
|
return {
|
|
"filename": filename,
|
|
"columns": columns,
|
|
"dtypes": dtypes,
|
|
"samples": samples,
|
|
"row_count": row_count
|
|
}
|
|
|