Chamin09 committed
Commit e13d87a · verified · 1 Parent(s): 18f7294

Upload 12 files
.gitattributes CHANGED
@@ -1,35 +0,0 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,217 @@
+ import os
+ import gradio as gr
+ import tempfile
+ from pathlib import Path
+ import base64
+ from PIL import Image
+ import io
+ import time
+
+ # Import our components
+ from models.llm_setup import setup_llm
+ from indexes.csv_index_builder import EnhancedCSVReader
+ from indexes.index_manager import CSVIndexManager
+ from indexes.query_engine import CSVQueryEngine
+ from tools.data_tools import PandasDataTools
+ from tools.visualization import VisualizationTools
+ from tools.export import ExportTools
+
+ # Set up temporary directories for uploaded files and exports
+ UPLOAD_DIR = Path(tempfile.mkdtemp())
+ EXPORT_DIR = Path(tempfile.mkdtemp())
+
+ class CSVChatApp:
+     """Main application class for the CSV chatbot."""
+
+     def __init__(self):
+         """Initialize the application components."""
+         # Initialize the language model
+         self.llm = setup_llm()
+
+         # Initialize the index manager
+         self.index_manager = CSVIndexManager()
+
+         # Initialize tools
+         self.data_tools = PandasDataTools(str(UPLOAD_DIR))
+         self.viz_tools = VisualizationTools(str(UPLOAD_DIR))
+         self.export_tools = ExportTools(str(EXPORT_DIR))
+
+         # Initialize the query engine
+         self.query_engine = self._setup_query_engine()
+
+         # Track conversation history
+         self.chat_history = []
+         self.uploaded_files = []
+
+     def _setup_query_engine(self):
+         """Set up the query engine."""
+         # Collect all tools. NOTE: CSVQueryEngine does not consume these
+         # yet; they are assembled here but not wired into the engine.
+         tools = (
+             self.data_tools.get_tools() +
+             self.viz_tools.get_tools() +
+             self.export_tools.get_tools()
+         )
+
+         # Create the query engine
+         query_engine = CSVQueryEngine(self.index_manager, self.llm)
+
+         return query_engine
+
+     def handle_file_upload(self, files):
+         """Process uploaded CSV files."""
+         file_info = []
+
+         for file in files:
+             if file is None:
+                 continue
+
+             # Get the file path
+             file_path = Path(file.name)
+
+             # Only process CSV files
+             if file_path.suffix.lower() != '.csv':
+                 continue
+
+             # Copy to the upload directory
+             dest_path = UPLOAD_DIR / file_path.name
+             with open(dest_path, 'wb') as f:
+                 f.write(file_path.read_bytes())
+
+             # Create an index for this file
+             try:
+                 self.index_manager.create_index(str(dest_path))
+                 file_info.append(f"✅ Indexed: {file_path.name}")
+                 self.uploaded_files.append(str(dest_path))
+             except Exception as e:
+                 file_info.append(f"❌ Failed to index {file_path.name}: {str(e)}")
+
+         # Return information about the processed files
+         if file_info:
+             return "\n".join(file_info)
+         else:
+             return "No CSV files were uploaded."
+
+     def process_query(self, query, history):
+         """Process a user query and return the updated Chatbot history."""
+         history = history or []
+
+         if not self.uploaded_files:
+             history.append((query, "Please upload CSV files before asking questions."))
+             return history
+
+         # Add the user message to the internal transcript
+         self.chat_history.append({"role": "user", "content": query})
+
+         # Process the query
+         try:
+             response = self.query_engine.query(query)
+             answer = response["answer"]
+
+             # Check whether the response contains an image
+             if isinstance(answer, dict) and "image" in answer:
+                 # Decode and save the image to the export directory
+                 img_data = answer["image"]
+                 img = Image.open(io.BytesIO(base64.b64decode(img_data)))
+                 img_path = EXPORT_DIR / f"viz_{int(time.time())}.png"
+                 img.save(img_path)
+
+                 # Pair the text response with the image path
+                 text_response = answer.get("text", "Generated visualization")
+                 answer = (text_response, str(img_path))
+
+             # Add the assistant message to the internal transcript
+             self.chat_history.append({"role": "assistant", "content": answer})
+
+             # gr.Chatbot expects a list of (user, assistant) pairs; an
+             # image is shown as a follow-up (filepath,) message
+             if isinstance(answer, tuple):
+                 text, img_path = answer
+                 history.append((query, text))
+                 history.append((None, (img_path,)))
+             else:
+                 history.append((query, str(answer)))
+
+             return history
+
+         except Exception as e:
+             error_msg = f"Error processing query: {str(e)}"
+             self.chat_history.append({"role": "assistant", "content": error_msg})
+             history.append((query, error_msg))
+             return history
+
+     def export_conversation(self):
+         """Export the conversation as a report."""
+         if not self.chat_history:
+             return "No conversation to export."
+
+         # Extract content for the report
+         title = "CSV Chat Conversation Report"
+         content = ""
+         images = []
+
+         for msg in self.chat_history:
+             role = msg["role"]
+             content_text = msg["content"]
+
+             # Handle content that might contain images
+             if isinstance(content_text, tuple) and len(content_text) == 2:
+                 text, img_path = content_text
+                 content += f"\n\n{'User' if role == 'user' else 'Assistant'}: {text}"
+
+                 # Add the image to the report
+                 try:
+                     with open(img_path, "rb") as img_file:
+                         img_data = base64.b64encode(img_file.read()).decode('utf-8')
+                         images.append(img_data)
+                 except Exception:
+                     pass
+             else:
+                 content += f"\n\n{'User' if role == 'user' else 'Assistant'}: {content_text}"
+
+         # Generate the report
+         result = self.export_tools.generate_report(title, content, images)
+
+         if result["success"]:
+             return f"Report exported to: {result['report_path']}"
+         else:
+             return "Failed to export report."
+
+ # Create the Gradio interface
+ def create_interface():
+     """Create the Gradio web interface."""
+     app = CSVChatApp()
+
+     with gr.Blocks(title="CSV Chat Assistant") as interface:
+         gr.Markdown("# CSV Chat Assistant")
+         gr.Markdown("Upload CSV files and ask questions in natural language.")
+
+         with gr.Row():
+             with gr.Column(scale=1):
+                 file_upload = gr.File(
+                     label="Upload CSV Files",
+                     file_count="multiple",
+                     type="file"
+                 )
+                 upload_button = gr.Button("Process Files")
+                 file_status = gr.Textbox(label="File Status")
+
+                 export_button = gr.Button("Export Conversation")
+                 export_status = gr.Textbox(label="Export Status")
+
+             with gr.Column(scale=2):
+                 chatbot = gr.Chatbot(label="Conversation")
+                 msg = gr.Textbox(label="Your Question")
+                 submit_button = gr.Button("Submit")
+
+         # Set up event handlers
+         upload_button.click(
+             fn=app.handle_file_upload,
+             inputs=[file_upload],
+             outputs=[file_status]
+         )
+
+         submit_button.click(
+             fn=app.process_query,
+             inputs=[msg, chatbot],
+             outputs=[chatbot]
+         )
+
+         export_button.click(
+             fn=app.export_conversation,
+             inputs=[],
+             outputs=[export_status]
+         )
+
+     return interface
+
+ # Launch the app
+ if __name__ == "__main__":
+     interface = create_interface()
+     interface.launch()
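
A minimal headless smoke test of the class above (not part of the commit; sales.csv is a hypothetical file, and the package layout above must be importable):

from types import SimpleNamespace
from app import CSVChatApp

app = CSVChatApp()
# handle_file_upload expects objects exposing a .name attribute, as
# Gradio's file wrappers do; SimpleNamespace stands in for one here.
print(app.handle_file_upload([SimpleNamespace(name="sales.csv")]))
print(app.process_query("How many rows does sales.csv have?", []))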
indexes/csv_index_builder.py ADDED
@@ -0,0 +1,51 @@
+ from typing import Dict, List, Optional
+ from pathlib import Path
+ import pandas as pd
+ from llama_index.readers.file import CSVReader
+ from llama_index.schema import Document
+
+ class EnhancedCSVReader:
+     """Enhanced CSV reader with metadata extraction capabilities."""
+
+     def __init__(self):
+         self.csv_reader = CSVReader()
+
+     def load_data(self, file_path: str) -> List[Document]:
+         """Load a CSV file and extract documents with metadata."""
+         # Load the CSV file
+         documents = self.csv_reader.load_data(file_path)
+
+         # Extract metadata from the file
+         csv_metadata = self._extract_metadata(file_path)
+
+         # Enhance documents with metadata.
+         # NOTE: some vector stores only accept flat (scalar) metadata
+         # values, so the lists/dicts below may need serializing first.
+         for doc in documents:
+             doc.metadata.update(csv_metadata)
+
+         return documents
+
+     def _extract_metadata(self, file_path: str) -> Dict:
+         """Extract useful metadata from a CSV file."""
+         df = pd.read_csv(file_path)
+         filename = Path(file_path).name
+
+         # Extract column information
+         columns = df.columns.tolist()
+         dtypes = {col: str(df[col].dtype) for col in columns}
+
+         # Extract sample values (first 3 non-null values per column)
+         samples = {}
+         for col in columns:
+             non_null_values = df[col].dropna().head(3).tolist()
+             samples[col] = [str(val) for val in non_null_values]
+
+         # Basic statistics
+         row_count = len(df)
+
+         return {
+             "filename": filename,
+             "columns": columns,
+             "dtypes": dtypes,
+             "samples": samples,
+             "row_count": row_count
+         }
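
A short usage sketch for the reader (assumes a local sales.csv; the column names in the comments are illustrative):

from indexes.csv_index_builder import EnhancedCSVReader

reader = EnhancedCSVReader()
docs = reader.load_data("sales.csv")
meta = docs[0].metadata
print(meta["filename"], meta["row_count"])
print(meta["columns"])   # e.g. ['date', 'region', 'revenue']
print(meta["samples"])   # first three non-null values per column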
indexes/index_manager.py ADDED
@@ -0,0 +1,104 @@
+ from typing import Dict, List, Optional
+ from pathlib import Path
+ import os
+
+ from llama_index import VectorStoreIndex, StorageContext
+ from llama_index.vector_stores import ChromaVectorStore
+ from llama_index.embeddings import HuggingFaceEmbedding
+ import chromadb
+
+ from indexes.csv_index_builder import EnhancedCSVReader
+
+ class CSVIndexManager:
+     """Manages creation and retrieval of indexes for CSV files."""
+
+     def __init__(self, embedding_model_name: str = "all-MiniLM-L6-v2"):
+         self.csv_reader = EnhancedCSVReader()
+         self.embed_model = HuggingFaceEmbedding(model_name=embedding_model_name)
+         self.chroma_client = chromadb.Client()
+         self.indexes = {}
+
+     def create_index(self, file_path: str) -> VectorStoreIndex:
+         """Create a vector index for a CSV file."""
+         # Extract the filename stem as identifier
+         file_id = Path(file_path).stem
+
+         # Load documents with metadata
+         documents = self.csv_reader.load_data(file_path)
+
+         # Create (or reuse) a Chroma collection for this CSV, so
+         # re-uploading the same file does not raise
+         chroma_collection = self.chroma_client.get_or_create_collection(file_id)
+         vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
+         storage_context = StorageContext.from_defaults(vector_store=vector_store)
+
+         # Create the vector index with our embedding model
+         index = VectorStoreIndex.from_documents(
+             documents,
+             storage_context=storage_context,
+             embed_model=self.embed_model
+         )
+
+         # Store in our registry
+         self.indexes[file_id] = {
+             "index": index,
+             "metadata": documents[0].metadata if documents else {}
+         }
+
+         return index
+
+     def index_directory(self, directory_path: str) -> Dict[str, VectorStoreIndex]:
+         """Index all CSV files in a directory."""
+         indexed_files = {}
+
+         # Get all CSV files in the directory
+         csv_files = [f for f in os.listdir(directory_path)
+                      if f.lower().endswith('.csv')]
+
+         # Create an index for each CSV file
+         for csv_file in csv_files:
+             file_path = os.path.join(directory_path, csv_file)
+             file_id = Path(file_path).stem
+             index = self.create_index(file_path)
+             indexed_files[file_id] = index
+
+         return indexed_files
+
+     def find_relevant_csvs(self, query: str, top_k: int = 3) -> List[str]:
+         """Find the most relevant CSV files for a given query."""
+         if not self.indexes:
+             return []
+
+         # Embed the query
+         query_embedding = self.embed_model.get_text_embedding(query)
+
+         # Calculate similarity with each CSV's metadata
+         similarities = {}
+         for file_id, index_info in self.indexes.items():
+             # Get the metadata description; skip files with none
+             metadata = index_info["metadata"]
+             if not metadata:
+                 continue
+
+             # Create a rich description of the CSV
+             csv_description = f"CSV file {metadata['filename']} with columns: {', '.join(metadata['columns'])}. "
+             csv_description += f"Contains {metadata['row_count']} rows. "
+             csv_description += "Sample data: "
+             for col, samples in metadata['samples'].items():
+                 if samples:
+                     csv_description += f"{col}: {', '.join(str(s) for s in samples[:2])}; "
+
+             # Get the embedding for this description
+             csv_embedding = self.embed_model.get_text_embedding(csv_description)
+
+             # Calculate cosine similarity
+             similarity = self._cosine_similarity(query_embedding, csv_embedding)
+             similarities[file_id] = similarity
+
+         # Sort by similarity and return the top_k file ids
+         sorted_files = sorted(similarities.items(), key=lambda x: x[1], reverse=True)
+         return [file_id for file_id, _ in sorted_files[:top_k]]
+
+     def _cosine_similarity(self, vec1, vec2):
+         """Calculate cosine similarity between two vectors."""
+         dot_product = sum(a * b for a, b in zip(vec1, vec2))
+         norm_a = sum(a * a for a in vec1) ** 0.5
+         norm_b = sum(b * b for b in vec2) ** 0.5
+         return dot_product / (norm_a * norm_b) if norm_a * norm_b != 0 else 0
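
A sketch of the manager in isolation (assumes a hypothetical ./data directory holding one or more CSV files):

from indexes.index_manager import CSVIndexManager

manager = CSVIndexManager()
manager.index_directory("./data")
# Returns up to three file stems, ranked by cosine similarity of the
# query embedding against each file's metadata description
print(manager.find_relevant_csvs("Which region had the highest revenue?"))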
indexes/query_engine.py ADDED
@@ -0,0 +1,124 @@
+ from typing import Dict, List, Optional, Any
+ from llama_index.query_engine import RetrieverQueryEngine
+ from llama_index.retrievers import VectorIndexRetriever
+ from llama_index.response_synthesizers import ResponseMode
+ from llama_index.llms import HuggingFaceLLM
+ from llama_index import ServiceContext, QueryBundle
+ from llama_index.prompts import PromptTemplate
+
+ class CSVQueryEngine:
+     """Query engine for CSV data with multi-file support."""
+
+     def __init__(self, index_manager, llm, response_mode="compact"):
+         """Initialize with an index manager and language model."""
+         self.index_manager = index_manager
+         self.llm = llm
+         self.service_context = ServiceContext.from_defaults(llm=llm)
+         self.response_mode = response_mode
+
+         # Set up custom prompts
+         self._setup_prompts()
+
+     def _setup_prompts(self):
+         """Set up custom prompts for CSV querying."""
+         self.csv_query_prompt = PromptTemplate(
+             """You are an AI assistant specialized in analyzing CSV data.
+ Answer the following query using the provided CSV information.
+ If calculations are needed, explain your process.
+
+ CSV Context: {context_str}
+ Query: {query_str}
+
+ Answer:"""
+         )
+
+     def query(self, query_text: str) -> Dict[str, Any]:
+         """Process a natural language query across CSV files."""
+         # Find relevant CSV files
+         relevant_csvs = self.index_manager.find_relevant_csvs(query_text)
+
+         if not relevant_csvs:
+             return {
+                 "answer": "No relevant CSV files found for your query.",
+                 "sources": []
+             }
+
+         # Prepare the response
+         responses = []
+         sources = []
+
+         # Query each relevant CSV
+         for csv_id in relevant_csvs:
+             index_info = self.index_manager.indexes.get(csv_id)
+             if not index_info:
+                 continue
+
+             index = index_info["index"]
+             metadata = index_info["metadata"]
+
+             # Create a retriever for this index
+             retriever = VectorIndexRetriever(
+                 index=index,
+                 similarity_top_k=5
+             )
+
+             # Create a query engine
+             query_engine = RetrieverQueryEngine.from_args(
+                 retriever=retriever,
+                 service_context=self.service_context,
+                 text_qa_template=self.csv_query_prompt,
+                 response_mode=self.response_mode
+             )
+
+             # Execute the query
+             response = query_engine.query(query_text)
+
+             responses.append({
+                 "csv_id": csv_id,
+                 "filename": metadata["filename"],
+                 "response": response
+             })
+
+             # Collect source information
+             if hasattr(response, "source_nodes"):
+                 for node in response.source_nodes:
+                     sources.append({
+                         "csv": metadata["filename"],
+                         "content": node.node.get_content()[:100] + "..."
+                     })
+
+         # Combine responses if multiple CSVs were queried
+         if len(responses) > 1:
+             combined_response = self._combine_responses(query_text, responses)
+             return {
+                 "answer": combined_response,
+                 "sources": sources
+             }
+         elif len(responses) == 1:
+             return {
+                 "answer": responses[0]["response"],
+                 "sources": sources
+             }
+         else:
+             return {
+                 "answer": "Failed to process query with the available CSV data.",
+                 "sources": []
+             }
+
+     def _combine_responses(self, query_text: str, responses: List[Dict]) -> str:
+         """Combine responses from multiple CSV files."""
+         # Build a prompt that merges the per-file answers (chr(10) is a
+         # newline; backslashes are not allowed in f-string expressions
+         # before Python 3.12)
+         combine_prompt = f"""
+ I need to answer this question: {query_text}
+
+ I've analyzed multiple CSV files and found these results:
+
+ {chr(10).join([f"From {r['filename']}: {str(r['response'])}" for r in responses])}
+
+ Please provide a unified answer that combines these insights.
+ """
+
+         # Use the LLM to generate a combined response
+         combined_response = self.llm.complete(combine_prompt)
+
+         return combined_response.text
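
Wiring the pieces together outside the Gradio app might look like this (a sketch, assuming indexed CSVs as above and enough memory for the model):

from models.llm_setup import setup_llm
from indexes.index_manager import CSVIndexManager
from indexes.query_engine import CSVQueryEngine

manager = CSVIndexManager()
manager.index_directory("./data")
engine = CSVQueryEngine(manager, setup_llm())
result = engine.query("What is the average revenue per region?")
print(result["answer"])    # Response object or combined string
print(result["sources"])   # filename plus 100-character snippets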
models/llm_setup.py ADDED
@@ -0,0 +1,64 @@
+ from typing import Optional
+ from llama_index.llms import HuggingFaceLLM
+ import torch
+ from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
+
+ def setup_llm(model_name: str = "microsoft/phi-3-mini-4k-instruct",
+               device: str = None,
+               context_window: int = 4096,
+               max_new_tokens: int = 512) -> HuggingFaceLLM:
+     """
+     Set up the language model for the CSV chatbot.
+
+     Args:
+         model_name: Name of the Hugging Face model to use
+         device: Device to run the model on ('cuda', 'cpu', etc.)
+         context_window: Maximum context window size
+         max_new_tokens: Maximum number of new tokens to generate
+
+     Returns:
+         Configured LLM instance
+     """
+     # Determine the device
+     if device is None:
+         device = "cuda" if torch.cuda.is_available() else "cpu"
+
+     # Configure 4-bit quantization for memory efficiency (GPU only)
+     if device == "cuda":
+         quantization_config = BitsAndBytesConfig(
+             load_in_4bit=True,
+             bnb_4bit_compute_dtype=torch.float16
+         )
+     else:
+         quantization_config = None
+
+     # Pre-load the tokenizer to warm the local cache; HuggingFaceLLM
+     # loads its own copy from tokenizer_name below
+     tokenizer = AutoTokenizer.from_pretrained(
+         model_name,
+         trust_remote_code=True
+     )
+
+     # Configure the model with appropriate parameters for HF Spaces
+     model_kwargs = {
+         "trust_remote_code": True,
+         "torch_dtype": torch.float16,
+     }
+
+     if quantization_config:
+         model_kwargs["quantization_config"] = quantization_config
+
+     # Initialize the LLM
+     llm = HuggingFaceLLM(
+         model_name=model_name,
+         tokenizer_name=model_name,
+         context_window=context_window,
+         max_new_tokens=max_new_tokens,
+         generate_kwargs={"temperature": 0.7, "top_p": 0.95},
+         device_map=device,
+         tokenizer_kwargs={"trust_remote_code": True},
+         model_kwargs=model_kwargs,
+         # Cache the model to avoid reloading
+         cache_folder="./model_cache"
+     )
+
+     return llm
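
A stand-alone sanity check (the first call downloads several GB of weights; CPU inference will be slow):

from models.llm_setup import setup_llm

llm = setup_llm(max_new_tokens=64)
print(llm.complete("List two uses of a CSV file:").text)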
requirements.txt ADDED
@@ -0,0 +1,14 @@
+ llama-index
+ transformers
+ gradio
+ pandas
+ numpy
+ matplotlib
+ plotly
+ sentence-transformers
+ chromadb
+ torch
+ pillow
+ chardet
+ bitsandbytes
+ accelerate
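
None of these are pinned, so pip install -r requirements.txt pulls the latest releases; the llama_index and gradio imports used above may require pinning versions from the era of this code.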
tools/data_tools.py ADDED
@@ -0,0 +1,158 @@
+ from typing import Dict, List, Any, Optional, Callable
+ import pandas as pd
+ from llama_index.tools import FunctionTool
+ from pathlib import Path
+
+ class PandasDataTools:
+     """Tools for data analysis operations on CSV files."""
+
+     def __init__(self, csv_directory: str):
+         """Initialize with the directory containing CSV files."""
+         self.csv_directory = csv_directory
+         self.dataframes = {}
+         self.tools = self._create_tools()
+
+     def _load_dataframe(self, filename: str) -> pd.DataFrame:
+         """Load a CSV file as a DataFrame, with caching."""
+         if filename not in self.dataframes:
+             file_path = Path(self.csv_directory) / filename
+             if not file_path.exists() and not filename.endswith('.csv'):
+                 file_path = Path(self.csv_directory) / f"{filename}.csv"
+
+             if file_path.exists():
+                 self.dataframes[filename] = pd.read_csv(file_path)
+             else:
+                 raise ValueError(f"CSV file not found: {filename}")
+
+         return self.dataframes[filename]
+
+     def _create_tools(self) -> List[FunctionTool]:
+         """Create LlamaIndex function tools for data operations."""
+         tools = [
+             FunctionTool.from_defaults(
+                 name="describe_csv",
+                 description="Get statistical description of a CSV file",
+                 fn=self.describe_csv
+             ),
+             FunctionTool.from_defaults(
+                 name="filter_data",
+                 description="Filter CSV data based on conditions",
+                 fn=self.filter_data
+             ),
+             FunctionTool.from_defaults(
+                 name="group_and_aggregate",
+                 description="Group data and calculate aggregate statistics",
+                 fn=self.group_and_aggregate
+             ),
+             FunctionTool.from_defaults(
+                 name="sort_data",
+                 description="Sort data by specified columns",
+                 fn=self.sort_data
+             ),
+             FunctionTool.from_defaults(
+                 name="calculate_correlation",
+                 description="Calculate correlation between columns",
+                 fn=self.calculate_correlation
+             )
+         ]
+         return tools
+
+     def get_tools(self) -> List[FunctionTool]:
+         """Get all available data tools."""
+         return self.tools
+
+     # Tool implementations
+     def describe_csv(self, filename: str) -> Dict[str, Any]:
+         """Get a statistical description of CSV data."""
+         df = self._load_dataframe(filename)
+         description = df.describe().to_dict()
+
+         # Add additional info
+         result = {
+             "statistics": description,
+             "shape": df.shape,
+             "columns": df.columns.tolist(),
+             "dtypes": {col: str(dtype) for col, dtype in df.dtypes.items()}
+         }
+
+         return result
+
+     def filter_data(self, filename: str, column: str, condition: str, value: Any) -> Dict[str, Any]:
+         """Filter data based on a condition (==, >, <, >=, <=, !=, contains)."""
+         df = self._load_dataframe(filename)
+
+         if condition == "==":
+             filtered = df[df[column] == value]
+         elif condition == ">":
+             filtered = df[df[column] > float(value)]
+         elif condition == "<":
+             filtered = df[df[column] < float(value)]
+         elif condition == ">=":
+             filtered = df[df[column] >= float(value)]
+         elif condition == "<=":
+             filtered = df[df[column] <= float(value)]
+         elif condition == "!=":
+             filtered = df[df[column] != value]
+         elif condition.lower() == "contains":
+             filtered = df[df[column].astype(str).str.contains(str(value))]
+         else:
+             return {"error": f"Unsupported condition: {condition}"}
+
+         return {
+             "result_count": len(filtered),
+             "results": filtered.head(10).to_dict(orient="records"),
+             "total_count": len(df)
+         }
+
+     def group_and_aggregate(self, filename: str, group_by: str, agg_column: str,
+                             agg_function: str = "mean") -> Dict[str, Any]:
+         """Group by a column and calculate an aggregate statistic."""
+         df = self._load_dataframe(filename)
+
+         # pandas accepts these aggregations by name; passing the string
+         # avoids wrapping numpy functions (deprecated in recent pandas)
+         agg_functions = {"mean", "sum", "min", "max", "count", "median"}
+
+         if agg_function not in agg_functions:
+             return {"error": f"Unsupported aggregation function: {agg_function}"}
+
+         grouped = df.groupby(group_by)[agg_column].agg(agg_function)
+
+         return {
+             "group_by": group_by,
+             "aggregated_column": agg_column,
+             "aggregation": agg_function,
+             "results": grouped.to_dict()
+         }
+
+     def sort_data(self, filename: str, sort_by: str, ascending: bool = True) -> Dict[str, Any]:
+         """Sort data by a column."""
+         df = self._load_dataframe(filename)
+
+         sorted_df = df.sort_values(by=sort_by, ascending=ascending)
+
+         return {
+             "sorted_by": sort_by,
+             "ascending": ascending,
+             "results": sorted_df.head(10).to_dict(orient="records")
+         }
+
+     def calculate_correlation(self, filename: str, column1: str, column2: str) -> Dict[str, Any]:
+         """Calculate the correlation between two columns."""
+         df = self._load_dataframe(filename)
+
+         try:
+             correlation = df[column1].corr(df[column2])
+             return {
+                 "correlation": correlation,
+                 "column1": column1,
+                 "column2": column2
+             }
+         except Exception as e:
+             return {"error": f"Could not calculate correlation: {str(e)}"}
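
Direct use of the tools, bypassing the LLM (assumes a hypothetical sales.csv in the upload directory; column names are illustrative):

from tools.data_tools import PandasDataTools

tools = PandasDataTools("./uploads")
print(tools.describe_csv("sales.csv")["shape"])
print(tools.filter_data("sales.csv", "revenue", ">", 1000)["result_count"])
print(tools.group_and_aggregate("sales.csv", "region", "revenue", "sum"))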
tools/export.py ADDED
@@ -0,0 +1,161 @@
+ from typing import Dict, List, Any, Optional, Union
+ import pandas as pd
+ # smtplib / email.mime are kept for a real SMTP implementation;
+ # sending is simulated in send_email below
+ import smtplib
+ from email.mime.multipart import MIMEMultipart
+ from email.mime.text import MIMEText
+ from email.mime.image import MIMEImage
+ import base64
+ import io
+ from pathlib import Path
+ import json
+ import datetime
+ from llama_index.tools import FunctionTool
+
+ class ExportTools:
+     """Tools for exporting data, generating reports, and sending emails."""
+
+     def __init__(self, output_directory: str = "./exports"):
+         """Initialize with the directory for saved exports."""
+         self.output_directory = Path(output_directory)
+         self.output_directory.mkdir(exist_ok=True, parents=True)
+         self.tools = self._create_tools()
+
+     def _create_tools(self) -> List[FunctionTool]:
+         """Create LlamaIndex function tools for export operations."""
+         tools = [
+             FunctionTool.from_defaults(
+                 name="generate_report",
+                 description="Generate a report from conversation and results",
+                 fn=self.generate_report
+             ),
+             FunctionTool.from_defaults(
+                 name="save_results_to_csv",
+                 description="Save query results to a CSV file",
+                 fn=self.save_results_to_csv
+             ),
+             FunctionTool.from_defaults(
+                 name="send_email",
+                 description="Send results via email",
+                 fn=self.send_email
+             )
+         ]
+         return tools
+
+     def get_tools(self) -> List[FunctionTool]:
+         """Get all available export tools."""
+         return self.tools
+
+     def generate_report(self, title: str, content: str,
+                         images: List[str] = None) -> Dict[str, Any]:
+         """Generate an HTML report from content and images."""
+         timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+         filename = f"{title.replace(' ', '_')}_{timestamp}.html"
+         file_path = self.output_directory / filename
+
+         # Replace newlines outside the f-string: backslashes are not
+         # allowed inside f-string expressions before Python 3.12
+         content_html = content.replace("\n", "<br>")
+
+         # Basic HTML template
+         html = f"""
+ <!DOCTYPE html>
+ <html>
+ <head>
+     <title>{title}</title>
+     <style>
+         body {{ font-family: Arial, sans-serif; margin: 40px; line-height: 1.6; }}
+         h1 {{ color: #333366; }}
+         .report-container {{ max-width: 800px; margin: 0 auto; }}
+         .timestamp {{ color: #666; font-size: 0.8em; }}
+         img {{ max-width: 100%; height: auto; margin: 20px 0; }}
+     </style>
+ </head>
+ <body>
+     <div class="report-container">
+         <h1>{title}</h1>
+         <div class="timestamp">Generated on: {datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}</div>
+         <div class="content">
+             {content_html}
+         </div>
+ """
+
+         # Add images if provided
+         if images and len(images) > 0:
+             html += "<div class='images'>"
+             for i, img_base64 in enumerate(images):
+                 html += f"<img src='data:image/png;base64,{img_base64}' alt='Figure {i+1}'>"
+             html += "</div>"
+
+         html += """
+     </div>
+ </body>
+ </html>
+ """
+
+         # Write to file
+         with open(file_path, "w", encoding="utf-8") as f:
+             f.write(html)
+
+         return {
+             "success": True,
+             "report_path": str(file_path),
+             "title": title,
+             "timestamp": timestamp
+         }
+
+     def save_results_to_csv(self, data: List[Dict[str, Any]],
+                             filename: str = None) -> Dict[str, Any]:
+         """Save query results to a CSV file."""
+         if not data or len(data) == 0:
+             return {"success": False, "error": "No data provided"}
+
+         # Create a DataFrame from the data
+         df = pd.DataFrame(data)
+
+         # Generate a filename if not provided
+         if not filename:
+             timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+             filename = f"query_results_{timestamp}.csv"
+
+         # Ensure the filename has a .csv extension
+         if not filename.lower().endswith('.csv'):
+             filename += '.csv'
+
+         file_path = self.output_directory / filename
+
+         # Save to CSV
+         df.to_csv(file_path, index=False)
+
+         return {
+             "success": True,
+             "file_path": str(file_path),
+             "row_count": len(df),
+             "column_count": len(df.columns)
+         }
+
+     def send_email(self, to_email: str, subject: str, body: str,
+                    from_email: str = None, smtp_server: str = None,
+                    smtp_port: int = 587, username: str = None,
+                    password: str = None, images: List[str] = None) -> Dict[str, Any]:
+         """
+         Send email with results.
+         Note: in production, credentials should be securely managed.
+         For demo purposes, this logs the email content instead.
+         """
+         # For safety in a demo app, don't actually send emails;
+         # just log what would be sent and return success
+
+         email_content = {
+             "to": to_email,
+             "subject": subject,
+             "body": body[:100] + "..." if len(body) > 100 else body,
+             "images": f"{len(images) if images else 0} images would be attached",
+             "note": "Email sending is simulated for demo purposes"
+         }
+
+         # Log the email content
+         print(f"SIMULATED EMAIL: {json.dumps(email_content, indent=2)}")
+
+         return {
+             "success": True,
+             "to": to_email,
+             "subject": subject,
+             "simulated": True,
+             "timestamp": datetime.datetime.now().isoformat()
+         }
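
A quick sketch of the export path (files land under ./exports by default; the rows below are illustrative data):

from tools.export import ExportTools

exporter = ExportTools()
report = exporter.generate_report("Demo Report", "Line one\nLine two")
print(report["report_path"])
rows = [{"region": "EU", "revenue": 1200}, {"region": "US", "revenue": 3400}]
print(exporter.save_results_to_csv(rows)["file_path"])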
tools/visualization.py ADDED
@@ -0,0 +1,246 @@
+ from typing import Dict, List, Any, Optional, Tuple, Union
+ import pandas as pd
+ import matplotlib
+ # Configure matplotlib for non-interactive environments; the backend
+ # must be selected before pyplot is imported
+ matplotlib.use('Agg')
+ import matplotlib.pyplot as plt
+ import io
+ import base64
+ import numpy as np
+ from llama_index.tools import FunctionTool
+ from pathlib import Path
+
+ class VisualizationTools:
+     """Tools for creating visualizations from CSV data."""
+
+     def __init__(self, csv_directory: str):
+         """Initialize with the directory containing CSV files."""
+         self.csv_directory = csv_directory
+         self.dataframes = {}
+         self.figure_size = (10, 6)
+         self.dpi = 100
+         self.tools = self._create_tools()
+
+     def _load_dataframe(self, filename: str) -> pd.DataFrame:
+         """Load a CSV file as a DataFrame, with caching."""
+         if filename not in self.dataframes:
+             file_path = Path(self.csv_directory) / filename
+             if not file_path.exists() and not filename.endswith('.csv'):
+                 file_path = Path(self.csv_directory) / f"{filename}.csv"
+
+             if file_path.exists():
+                 self.dataframes[filename] = pd.read_csv(file_path)
+             else:
+                 raise ValueError(f"CSV file not found: {filename}")
+
+         return self.dataframes[filename]
+
+     def _create_tools(self) -> List[FunctionTool]:
+         """Create LlamaIndex function tools for visualizations."""
+         tools = [
+             FunctionTool.from_defaults(
+                 name="create_line_chart",
+                 description="Create a line chart from CSV data",
+                 fn=self.create_line_chart
+             ),
+             FunctionTool.from_defaults(
+                 name="create_bar_chart",
+                 description="Create a bar chart from CSV data",
+                 fn=self.create_bar_chart
+             ),
+             FunctionTool.from_defaults(
+                 name="create_scatter_plot",
+                 description="Create a scatter plot from CSV data",
+                 fn=self.create_scatter_plot
+             ),
+             FunctionTool.from_defaults(
+                 name="create_histogram",
+                 description="Create a histogram from CSV data",
+                 fn=self.create_histogram
+             ),
+             FunctionTool.from_defaults(
+                 name="create_pie_chart",
+                 description="Create a pie chart from CSV data",
+                 fn=self.create_pie_chart
+             )
+         ]
+         return tools
+
+     def get_tools(self) -> List[FunctionTool]:
+         """Get all available visualization tools."""
+         return self.tools
+
+     def _figure_to_base64(self, fig) -> str:
+         """Convert a matplotlib figure to a base64-encoded string."""
+         buf = io.BytesIO()
+         fig.savefig(buf, format='png', dpi=self.dpi)
+         buf.seek(0)
+         img_str = base64.b64encode(buf.read()).decode('utf-8')
+         plt.close(fig)
+         return img_str
+
+     # Visualization tool implementations
+     def create_line_chart(self, filename: str, x_column: str, y_column: str,
+                           title: str = None, limit: int = 50) -> Dict[str, Any]:
+         """Create a line chart visualization."""
+         df = self._load_dataframe(filename)
+
+         # Limit data points if needed
+         if len(df) > limit:
+             df = df.head(limit)
+
+         fig, ax = plt.subplots(figsize=self.figure_size)
+
+         # Create the line chart
+         ax.plot(df[x_column], df[y_column], marker='o', linestyle='-')
+
+         # Set labels and title
+         ax.set_xlabel(x_column)
+         ax.set_ylabel(y_column)
+         ax.set_title(title or f"{y_column} vs {x_column}")
+         ax.grid(True)
+
+         # Convert to base64
+         img_str = self._figure_to_base64(fig)
+
+         return {
+             "chart_type": "line",
+             "x_column": x_column,
+             "y_column": y_column,
+             "data_points": len(df),
+             "image": img_str
+         }
+
+     def create_bar_chart(self, filename: str, x_column: str, y_column: str,
+                          title: str = None, limit: int = 20) -> Dict[str, Any]:
+         """Create a bar chart visualization."""
+         df = self._load_dataframe(filename)
+
+         # Limit categories if needed
+         if len(df) > limit:
+             df = df.head(limit)
+
+         fig, ax = plt.subplots(figsize=self.figure_size)
+
+         # Create the bar chart
+         ax.bar(df[x_column], df[y_column])
+
+         # Set labels and title
+         ax.set_xlabel(x_column)
+         ax.set_ylabel(y_column)
+         ax.set_title(title or f"{y_column} by {x_column}")
+
+         # Rotate x labels if there are many categories
+         if len(df) > 5:
+             plt.xticks(rotation=45, ha='right')
+
+         plt.tight_layout()
+
+         # Convert to base64
+         img_str = self._figure_to_base64(fig)
+
+         return {
+             "chart_type": "bar",
+             "x_column": x_column,
+             "y_column": y_column,
+             "categories": len(df),
+             "image": img_str
+         }
+
+     def create_scatter_plot(self, filename: str, x_column: str, y_column: str,
+                             color_column: str = None, title: str = None) -> Dict[str, Any]:
+         """Create a scatter plot visualization."""
+         df = self._load_dataframe(filename)
+
+         fig, ax = plt.subplots(figsize=self.figure_size)
+
+         # Create the scatter plot
+         if color_column and color_column in df.columns:
+             scatter = ax.scatter(df[x_column], df[y_column], c=df[color_column], cmap='viridis', alpha=0.7)
+             plt.colorbar(scatter, ax=ax, label=color_column)
+         else:
+             ax.scatter(df[x_column], df[y_column], alpha=0.7)
+
+         # Set labels and title
+         ax.set_xlabel(x_column)
+         ax.set_ylabel(y_column)
+         ax.set_title(title or f"{y_column} vs {x_column}")
+         ax.grid(True, linestyle='--', alpha=0.7)
+
+         # Convert to base64
+         img_str = self._figure_to_base64(fig)
+
+         return {
+             "chart_type": "scatter",
+             "x_column": x_column,
+             "y_column": y_column,
+             "color_column": color_column,
+             "data_points": len(df),
+             "image": img_str
+         }
+
+     def create_histogram(self, filename: str, column: str, bins: int = 10,
+                          title: str = None) -> Dict[str, Any]:
+         """Create a histogram visualization."""
+         df = self._load_dataframe(filename)
+
+         fig, ax = plt.subplots(figsize=self.figure_size)
+
+         # Create the histogram
+         ax.hist(df[column], bins=bins, alpha=0.7, edgecolor='black')
+
+         # Set labels and title
+         ax.set_xlabel(column)
+         ax.set_ylabel('Frequency')
+         ax.set_title(title or f"Distribution of {column}")
+         ax.grid(True, linestyle='--', alpha=0.7)
+
+         # Convert to base64
+         img_str = self._figure_to_base64(fig)
+
+         return {
+             "chart_type": "histogram",
+             "column": column,
+             "bins": bins,
+             "data_points": len(df),
+             "image": img_str
+         }
+
+     def create_pie_chart(self, filename: str, label_column: str, value_column: str = None,
+                          title: str = None, limit: int = 10) -> Dict[str, Any]:
+         """Create a pie chart visualization."""
+         df = self._load_dataframe(filename)
+
+         # If no value column is provided, count occurrences of each label
+         if value_column is None:
+             data = df[label_column].value_counts().head(limit)
+             labels = data.index.tolist()
+             values = data.values.tolist()
+         else:
+             # Group by label and sum values
+             grouped = df.groupby(label_column)[value_column].sum().reset_index()
+             # Limit to the top categories
+             grouped = grouped.nlargest(limit, value_column)
+             labels = grouped[label_column].tolist()
+             values = grouped[value_column].tolist()
+
+         fig, ax = plt.subplots(figsize=self.figure_size)
+
+         # Create the pie chart
+         ax.pie(values, labels=labels, autopct='%1.1f%%', startangle=90, shadow=True)
+         ax.axis('equal')  # Equal aspect ratio ensures the pie is drawn as a circle
+
+         # Set the title
+         ax.set_title(title or f"Distribution of {label_column}")
+
+         # Convert to base64
+         img_str = self._figure_to_base64(fig)
+
+         return {
+             "chart_type": "pie",
+             "label_column": label_column,
+             "value_column": value_column,
+             "categories": len(labels),
+             "image": img_str
+         }
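
Charts come back as base64-encoded PNGs; a sketch of rendering one to disk (assumes a hypothetical sales.csv with region and revenue columns):

import base64
from tools.visualization import VisualizationTools

viz = VisualizationTools("./uploads")
result = viz.create_bar_chart("sales.csv", "region", "revenue")
with open("chart.png", "wb") as f:
    f.write(base64.b64decode(result["image"]))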
utils/csv_helper.py ADDED
@@ -0,0 +1,189 @@
+ from typing import Dict, List, Any, Optional, Tuple
+ import pandas as pd
+ import numpy as np
+ from pathlib import Path
+ import os
+ import chardet
+ import csv
+
+ class CSVHelpers:
+     """Helper utilities for CSV preprocessing and analysis."""
+
+     @staticmethod
+     def detect_encoding(file_path: str, sample_size: int = 10000) -> str:
+         """Detect the encoding of a CSV file."""
+         with open(file_path, 'rb') as f:
+             raw_data = f.read(sample_size)
+             result = chardet.detect(raw_data)
+             return result['encoding']
+
+     @staticmethod
+     def detect_delimiter(file_path: str, encoding: str = 'utf-8') -> str:
+         """Detect the delimiter used in a CSV file."""
+         with open(file_path, 'r', encoding=encoding) as csvfile:
+             sample = csvfile.read(4096)
+
+         # Check common delimiters
+         for delimiter in [',', ';', '\t', '|']:
+             sniffer = csv.Sniffer()
+             try:
+                 if delimiter in sample:
+                     dialect = sniffer.sniff(sample, delimiters=delimiter)
+                     return dialect.delimiter
+             except csv.Error:
+                 continue
+
+         # Default to comma if detection fails
+         return ','
+
+     @staticmethod
+     def preprocess_csv(file_path: str) -> Tuple[pd.DataFrame, Dict[str, Any]]:
+         """
+         Preprocess a CSV file with automatic encoding and delimiter detection.
+         Returns the DataFrame and metadata about the preprocessing.
+         """
+         # Detect encoding
+         try:
+             encoding = CSVHelpers.detect_encoding(file_path)
+         except Exception:
+             encoding = 'utf-8'  # Default to UTF-8 if detection fails
+
+         # Detect delimiter
+         try:
+             delimiter = CSVHelpers.detect_delimiter(file_path, encoding)
+         except Exception:
+             delimiter = ','  # Default to comma if detection fails
+
+         # Read the CSV with the detected parameters
+         df = pd.read_csv(file_path, encoding=encoding, delimiter=delimiter, low_memory=False)
+
+         # Basic preprocessing
+         metadata = {
+             "original_shape": df.shape,
+             "encoding": encoding,
+             "delimiter": delimiter,
+             "columns": list(df.columns),
+             "dtypes": {col: str(dtype) for col, dtype in df.dtypes.items()}
+         }
+
+         # Record missing values
+         missing_counts = df.isna().sum()
+         metadata["missing_values"] = {col: int(count) for col, count in missing_counts.items() if count > 0}
+
+         # Record duplicate rows
+         duplicates = df.duplicated().sum()
+         metadata["duplicate_rows"] = int(duplicates)
+
+         return df, metadata
+
+     @staticmethod
+     def infer_column_types(df: pd.DataFrame) -> Dict[str, str]:
+         """
+         Infer semantic types of columns (beyond pandas dtypes).
+         Examples: date, categorical, numeric, text, etc.
+         """
+         column_types = {}
+
+         for column in df.columns:
+             # Skip columns with all missing values
+             if df[column].isna().all():
+                 column_types[column] = "unknown"
+                 continue
+
+             # Get the pandas dtype
+             dtype = df[column].dtype
+
+             # Check if datetime
+             if pd.api.types.is_datetime64_dtype(df[column]):
+                 column_types[column] = "datetime"
+
+             # Try to convert to datetime if string
+             elif dtype == 'object':
+                 try:
+                     # Sample non-null values
+                     sample = df[column].dropna().head(10)
+                     pd.to_datetime(sample)
+                     column_types[column] = "potential_datetime"
+                 except Exception:
+                     # Check if categorical (few unique values)
+                     unique_ratio = df[column].nunique() / len(df)
+                     if unique_ratio < 0.1:  # Less than 10% unique values
+                         column_types[column] = "categorical"
+                     else:
+                         column_types[column] = "text"
+
+             # Numeric types
+             elif pd.api.types.is_numeric_dtype(dtype):
+                 # Check if potential ID column
+                 if df[column].nunique() == len(df) and df[column].min() >= 0:
+                     column_types[column] = "id"
+                 # Check if binary
+                 elif df[column].nunique() <= 2:
+                     column_types[column] = "binary"
+                 # Check if integer
+                 elif pd.api.types.is_integer_dtype(dtype):
+                     column_types[column] = "integer"
+                 else:
+                     column_types[column] = "float"
+
+             # Boolean type
+             elif pd.api.types.is_bool_dtype(dtype):
+                 column_types[column] = "boolean"
+
+             # Fallback
+             else:
+                 column_types[column] = "unknown"
+
+         return column_types
+
+     @staticmethod
+     def suggest_visualizations(df: pd.DataFrame) -> List[Dict[str, Any]]:
+         """
+         Suggest appropriate visualizations based on data types.
+         Returns a list of visualization suggestions.
+         """
+         suggestions = []
+         column_types = CSVHelpers.infer_column_types(df)
+         # ctype avoids shadowing the builtin type in these comprehensions
+         numeric_columns = [col for col, ctype in column_types.items()
+                            if ctype in ["integer", "float"]]
+         categorical_columns = [col for col, ctype in column_types.items()
+                                if ctype in ["categorical", "binary"]]
+         datetime_columns = [col for col, ctype in column_types.items()
+                             if ctype in ["datetime", "potential_datetime"]]
+
+         # Histograms for numeric columns
+         for col in numeric_columns[:3]:  # Limit to the first 3
+             suggestions.append({
+                 "chart_type": "histogram",
+                 "column": col,
+                 "title": f"Distribution of {col}"
+             })
+
+         # Bar charts for categorical columns
+         for col in categorical_columns[:3]:  # Limit to the first 3
+             suggestions.append({
+                 "chart_type": "bar",
+                 "x_column": col,
+                 "y_column": "count",
+                 "title": f"Count by {col}"
+             })
+
+         # Time series for datetime + numeric combinations
+         if datetime_columns and numeric_columns:
+             suggestions.append({
+                 "chart_type": "line",
+                 "x_column": datetime_columns[0],
+                 "y_column": numeric_columns[0],
+                 "title": f"{numeric_columns[0]} over Time"
+             })
+
+         # Scatter plots for numeric pairs
+         if len(numeric_columns) >= 2:
+             suggestions.append({
+                 "chart_type": "scatter",
+                 "x_column": numeric_columns[0],
+                 "y_column": numeric_columns[1],
+                 "title": f"{numeric_columns[1]} vs {numeric_columns[0]}"
+             })
+
+         return suggestions
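
A sketch of the helpers on an arbitrary file (sales.csv is hypothetical):

from utils.csv_helper import CSVHelpers

df, meta = CSVHelpers.preprocess_csv("sales.csv")
print(meta["encoding"], meta["delimiter"], meta["original_shape"])
for s in CSVHelpers.suggest_visualizations(df):
    print(s["chart_type"], s.get("title"))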
utils/prompt_template.py ADDED
File without changes