Spaces:

Chamin09
/

ChatCSV

Sleeping

App Files Files Community

Chamin09 committed on Apr 22

Commit

5bc018f

verified ·

1 Parent(s): fd61747

Create csv_index_builder.py

Browse files

Files changed (1) hide show

indexes/csv_index_builder.py +62 -0

indexes/csv_index_builder.py ADDED Viewed

	@@ -0,0 +1,62 @@

+from typing import Dict, List, Optional
+from pathlib import Path
+import pandas as pd
class EnhancedCSVReader:
    """Read a CSV file into document-like dictionaries with file metadata.

    Every document produced is a dict with two keys: ``"content"`` (a string)
    and ``"metadata"`` (a dict describing the whole CSV file: file name,
    column names, dtypes, sample values and row count).
    """

    def __init__(self):
        """Initialize the CSV reader; no state is kept between calls."""
        pass

    def load_data(self, file_path: str, sample_rows: int = 10) -> List[Dict]:
        """Load a CSV file and return sample-row documents plus a schema document.

        Args:
            file_path: Path of the CSV file to read.
            sample_rows: Maximum number of leading rows turned into documents.
                Defaults to 10, the previously hard-coded sample size.

        Returns:
            One document per sampled row (content is ``row.to_string()``),
            followed by a final schema document listing the column names.
            Each document owns an independent copy of the file metadata.

        Raises:
            ValueError: If ``sample_rows`` is negative.
            Exception: Anything raised by ``pandas.read_csv`` propagates
                unchanged (missing file, empty file, parse errors, ...).
        """
        # Local import keeps this block self-contained without touching the
        # file-level import section.
        from copy import deepcopy

        if sample_rows < 0:
            # DataFrame.head(-n) means "all but the last n rows", which is
            # never what a sample-size argument intends.
            raise ValueError("sample_rows must be non-negative")

        # Parse the file once and share the frame with the metadata helper
        # instead of reading the CSV from disk twice.
        df = pd.read_csv(file_path)
        csv_metadata = self._extract_metadata(file_path, df=df)

        # The metadata holds nested lists/dicts; a shallow copy would leave
        # every document sharing (and able to corrupt) the same objects.
        documents = [
            {"content": row.to_string(), "metadata": deepcopy(csv_metadata)}
            for _, row in df.head(sample_rows).iterrows()
        ]

        # Trailing schema document; str() guards against non-string labels.
        documents.append(
            {
                "content": f"CSV Schema: {', '.join(map(str, df.columns))}",
                "metadata": deepcopy(csv_metadata),
            }
        )
        return documents

    def _extract_metadata(
        self, file_path: str, df: Optional[pd.DataFrame] = None
    ) -> Dict:
        """Extract descriptive metadata for a CSV file.

        Args:
            file_path: Path of the CSV file; its final component becomes the
                ``"filename"`` entry.
            df: Already-parsed contents of ``file_path``. When omitted the
                file is read here with ``pandas.read_csv``.

        Returns:
            A dict with the keys ``"filename"``, ``"columns"`` (list of column
            labels), ``"dtypes"`` (column -> dtype name), ``"samples"``
            (column -> up to three non-null values rendered as strings) and
            ``"row_count"``.
        """
        if df is None:
            df = pd.read_csv(file_path)

        columns = df.columns.tolist()
        dtypes = {col: str(df[col].dtype) for col in columns}

        # First three non-null values per column, stringified so the metadata
        # stays plain and serializable regardless of the column dtype.
        samples = {
            col: [str(val) for val in df[col].dropna().head(3).tolist()]
            for col in columns
        }

        return {
            "filename": Path(file_path).name,
            "columns": columns,
            "dtypes": dtypes,
            "samples": samples,
            "row_count": len(df),
        }