Chamin09 committed on
Commit
5bc018f
·
verified ·
1 Parent(s): fd61747

Create csv_index_builder.py

Browse files
Files changed (1) hide show
  1. indexes/csv_index_builder.py +62 -0
indexes/csv_index_builder.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, List, Optional
2
+ from pathlib import Path
3
+ import pandas as pd
4
+
5
class EnhancedCSVReader:
    """Enhanced CSV reader with metadata extraction capabilities.

    Converts a CSV file into a list of document-like dictionaries, each of
    the form ``{"content": str, "metadata": dict}``. The metadata describes
    the file as a whole (file name, columns, dtypes, sample values and row
    count) and is attached to every document.
    """

    def __init__(self):
        """Initialize the CSV reader. The reader keeps no state."""

    def load_data(self, file_path: str, sample_rows: Optional[int] = 10) -> List[Dict]:
        """Load a CSV file and return sample-row documents plus a schema document.

        Args:
            file_path: Path of the CSV file to read.
            sample_rows: Number of leading rows to turn into documents.
                Defaults to 10; ``None`` means every row.

        Returns:
            One document per sampled row (``content`` is the row rendered
            with ``Series.to_string()``), followed by a final schema document
            whose ``content`` lists the column names. Every document carries
            its own independent copy of the file-level metadata.

        Raises:
            ValueError: If ``sample_rows`` is negative.
        """
        # Local import keeps this block self-contained: ``copy`` is not among
        # the module-level imports of this file.
        import copy

        if sample_rows is not None and sample_rows < 0:
            raise ValueError("sample_rows must be non-negative or None")

        # Parse the file once and share the frame with the metadata step
        # instead of reading the CSV a second time.
        df = pd.read_csv(file_path)
        csv_metadata = self._extract_metadata(file_path, df=df)

        sampled = df if sample_rows is None else df.head(sample_rows)

        # Deep-copy the metadata: it contains nested lists/dicts, and a
        # shallow copy would let a change made through one document leak
        # into all the others.
        documents = [
            {
                "content": row.to_string(),
                "metadata": copy.deepcopy(csv_metadata),
            }
            for _, row in sampled.iterrows()
        ]

        # Column labels are passed through str() so that non-string labels
        # cannot make the join raise.
        documents.append(
            {
                "content": f"CSV Schema: {', '.join(map(str, df.columns))}",
                "metadata": copy.deepcopy(csv_metadata),
            }
        )

        return documents

    def _extract_metadata(self, file_path: str, df: Optional["pd.DataFrame"] = None) -> Dict:
        """Extract file-level metadata from a CSV file.

        Args:
            file_path: Path of the CSV file; its final component is reported
                as ``filename``.
            df: Already-parsed contents of ``file_path``. When omitted, the
                file is read here.

        Returns:
            A dictionary with the keys ``filename``, ``columns`` (list of
            column labels), ``dtypes`` (column -> dtype name), ``samples``
            (column -> up to three non-null values as strings) and
            ``row_count``.
        """
        if df is None:
            df = pd.read_csv(file_path)

        columns = df.columns.tolist()

        return {
            "filename": Path(file_path).name,
            "columns": columns,
            "dtypes": {col: str(df[col].dtype) for col in columns},
            # First three non-null values per column, stringified.
            "samples": {
                col: [str(val) for val in df[col].dropna().head(3).tolist()]
                for col in columns
            },
            "row_count": len(df),
        }