AnnsKhan commited on
Commit
b469f9b
·
1 Parent(s): b2f66f6
Files changed (4) hide show
  1. .github/workflows/deploy.yml +46 -0
  2. .gitignore +2 -1
  3. README.md +13 -1
  4. app.py +230 -0
.github/workflows/deploy.yml ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
name: Deploy to Hugging Face Spaces

on:
  push:
    branches:
      - main # Change this if your default branch is different

jobs:
  Deploy:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: "3.10"

      - name: Install dependencies
        run: |
          pip install huggingface_hub

      - name: Configure huggingface-cli
        run: |
          # FIX: do not echo the token. Even though GitHub masks registered
          # secrets in logs, printing credentials is an avoidable risk.
          huggingface-cli login --token ${{ secrets.HF_TOKEN }}

      - name: Set up Git
        run: |
          git config --global user.name "github-actions[bot]"
          git config --global user.email "github-actions[bot]@users.noreply.github.com"

      - name: Add Hugging Face remote
        run: |
          git remote add huggingface https://huggingface:${{ secrets.HF_TOKEN }}@huggingface.co/spaces/AnnsKhan/billion_row_challenge

      - name: Fetch and reset to main
        run: |
          git fetch huggingface
          # NOTE(review): this resets to origin/main, not huggingface/main --
          # confirm that is intended before the force push below.
          git reset --hard origin/main

      - name: Push to Hugging Face Hub
        run: |
          git push huggingface main --force
.gitignore CHANGED
@@ -25,7 +25,8 @@ share/python-wheels/
25
  .installed.cfg
26
  *.egg
27
  MANIFEST
28
-
 
29
  # PyInstaller
30
  # Usually these files are written by a python script from a template
31
  # before PyInstaller builds the exe, so as to inject date/other infos into it.
 
25
  .installed.cfg
26
  *.egg
27
  MANIFEST
28
+ wandb/
29
+ data/
30
  # PyInstaller
31
  # Usually these files are written by a python script from a template
32
  # before PyInstaller builds the exe, so as to inject date/other infos into it.
README.md CHANGED
@@ -1 +1,13 @@
1
- # billionrows_
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Billion Row Challenge
3
+ emoji: 🌖
4
+ colorFrom: yellow
5
+ colorTo: purple
6
+ sdk: gradio
7
+ sdk_version: 5.16.0
8
+ app_file: app.py
9
+ pinned: false
10
+ short_description: Benchmark data loading libraries on 1B rows
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,230 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI
2
+ from fastapi.middleware.cors import CORSMiddleware
3
+ import gradio as gr
4
+ import time
5
+ import psutil
6
+ import tracemalloc
7
+ import gc
8
+ import pandas as pd
9
+ import dask.dataframe as dd
10
+ import polars as pl
11
+ import duckdb
12
+ import seaborn as sns
13
+ import matplotlib.pyplot as plt
14
+ import io
15
+ import os
16
+ from PIL import Image
17
+ import numpy as np
18
+ import matplotlib
19
+ import wandb
20
+
21
# Start a Weights & Biases run; every wandb.log() call below attaches to it.
wandb.init(project="billion-row-analysis", name="benchmarking")


# Select the Dask backend for Modin.
# NOTE(review): modin is never imported anywhere in this file, so this
# setting appears unused -- confirm whether a modin loader was planned.
os.environ["MODIN_ENGINE"] = "dask"

# Initialize FastAPI app
app = FastAPI()

# Allow cross-origin requests from any host.
# NOTE(review): browsers reject allow_origins=["*"] combined with
# allow_credentials=True; confirm credentials are actually required.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
36
+
37
# Performance measurement function
def measure_performance(load_function, *args):
    """Run ``load_function(*args)`` and measure its resource usage.

    Parameters
    ----------
    load_function : callable
        Zero-or-more-argument function that loads a dataset.
    *args
        Forwarded to ``load_function``.

    Returns
    -------
    tuple
        ``(data, elapsed_seconds, cpu_delta_pct, rss_delta_pct, peak_pct)``
        where the three percentages are relative to total system memory
        (RSS/peak) or total CPU (cpu delta).
    """
    gc.collect()
    tracemalloc.start()

    total_memory = psutil.virtual_memory().total  # total system memory, bytes

    # FIX: take the blocking 1-second CPU sample *before* starting the clock.
    # The original started the timer first, so every measured load time was
    # inflated by the sampling interval (~1 s).
    start_cpu = psutil.cpu_percent(interval=1)
    start_memory = psutil.Process().memory_info().rss / total_memory * 100  # % of RAM

    start_time = time.time()
    data = load_function(*args)
    end_time = time.time()

    end_memory = psutil.Process().memory_info().rss / total_memory * 100  # % of RAM
    end_cpu = psutil.cpu_percent(interval=1)

    _, peak_memory = tracemalloc.get_traced_memory()
    tracemalloc.stop()

    peak_memory_percentage = peak_memory / total_memory * 100  # % of RAM

    # Deltas are clamped at 0 because CPU% and RSS can legitimately decrease.
    return (
        data,
        end_time - start_time,
        max(end_cpu - start_cpu, 0),
        max(end_memory - start_memory, 0),
        peak_memory_percentage,
    )
59
+
60
# Data loading functions
def load_data_python_vectorized():
    """Load the parquet file and return its numeric columns as NumPy arrays.

    Returns a dict mapping each numeric column name to a NumPy array,
    ready for vectorized operations.
    """
    frame = pd.read_parquet('data/raw/jan_2024.parquet')
    numeric_columns = frame.select_dtypes(include=['number']).columns
    return {column: frame[column].to_numpy() for column in numeric_columns}
68
+
69
def load_data_pandas():
    """Eagerly load the benchmark parquet file as a pandas DataFrame."""
    frame = pd.read_parquet('data/raw/jan_2024.parquet')
    return frame
71
+
72
def load_data_dask():
    """Lazily load the benchmark parquet file as a Dask DataFrame."""
    lazy_frame = dd.read_parquet('data/raw/jan_2024.parquet')
    return lazy_frame
74
+
75
def load_data_polars():
    """Eagerly load the benchmark parquet file as a Polars DataFrame."""
    frame = pl.read_parquet('data/raw/jan_2024.parquet')
    return frame
77
+
78
def load_data_duckdb():
    """Open the benchmark parquet file as a DuckDB relation (lazy)."""
    relation = duckdb.read_parquet('data/raw/jan_2024.parquet')
    return relation
80
+
81
# Loaders list
# (loader callable, display name) pairs consumed by run_benchmark();
# list order determines the order bars appear in the result plots.
loaders = [
    (load_data_pandas, "Pandas"),
    (load_data_dask, "Dask"),
    (load_data_polars, "Polars"),
    (load_data_duckdb, "DuckDB"),
    (load_data_python_vectorized, "Python Vectorized"),
]
89
+
90
def run_benchmark():
    """Benchmark every registered loader and plot the results.

    Returns
    -------
    tuple
        ``(markdown_table, image_array)`` on success, or
        ``(error_text, None)`` if any loader raised.
    """
    benchmark_results = []
    error_messages = []

    for loader, lib_name in loaders:
        try:
            _, load_time, cpu_load, mem_load, peak_mem_load = measure_performance(loader)

            # Build the metrics record once; log it to W&B and keep it locally
            # (the original duplicated this dict literal verbatim).
            metrics = {
                "Library": lib_name,
                "Load Time (s)": load_time,
                "CPU Load (%)": cpu_load,
                "Memory Load (%)": mem_load,
                "Peak Memory (%)": peak_mem_load,
            }
            wandb.log(metrics)
            benchmark_results.append(metrics)

        except Exception as e:
            error_messages.append(f"{lib_name} Error: {str(e)}")

    # Preserve original behavior: any failure aborts the whole report.
    if error_messages:
        return '\n'.join(error_messages), None

    benchmark_df = pd.DataFrame(benchmark_results)

    sns.set(style="whitegrid")
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    fig.suptitle("Benchmark Results", fontsize=16)

    sns.barplot(x="Library", y="Load Time (s)", data=benchmark_df, ax=axes[0, 0])
    sns.barplot(x="Library", y="CPU Load (%)", data=benchmark_df, ax=axes[0, 1])
    sns.barplot(x="Library", y="Memory Load (%)", data=benchmark_df, ax=axes[1, 0])
    sns.barplot(x="Library", y="Peak Memory (%)", data=benchmark_df, ax=axes[1, 1])

    buf = io.BytesIO()
    plt.savefig(buf, format='png')
    plt.close(fig)  # FIX: figure was never closed, leaking one figure per run
    buf.seek(0)

    # Convert plot to an image and log it to wandb
    image = Image.open(buf)
    wandb.log({"Benchmark Results": wandb.Image(image)})

    image_array = np.array(image)

    return benchmark_df.to_markdown(), image_array  # Return NumPy array
143
+
144
+
145
# Headless backend: figures are rendered to buffers, no display required.
matplotlib.use("Agg")
def explore_dataset():
    """Summarise the raw parquet dataset and render a two-panel overview.

    Returns
    -------
    tuple
        ``(markdown_summary, image_array)`` on success, or
        ``(error_message, None)`` if the file cannot be read.
    """
    try:
        df = pd.read_parquet('data/raw/jan_2024.parquet')

        # Generate dataset summary (per-column stats, missing/unique counts)
        summary = df.describe(include='all').T
        summary["missing_values"] = df.isnull().sum()
        summary["unique_values"] = df.nunique()
        summary_text = summary.to_markdown()

        # Log dataset summary as text in Weights & Biases
        wandb.log({"Dataset Summary": wandb.Html(summary_text)})

        # Prepare for visualization
        fig, axes = plt.subplots(1, 2, figsize=(14, 5))
        fig.suptitle("Dataset Overview", fontsize=16)

        # Plot data type distribution
        data_types = df.dtypes.value_counts()
        sns.barplot(x=data_types.index.astype(str), y=data_types.values, ax=axes[0])
        axes[0].set_title("Column Count by Data Type")
        axes[0].set_ylabel("Count")

        # Plot mean values of numeric columns
        num_cols = df.select_dtypes(include=['number']).columns
        if len(num_cols) > 0:
            mean_values = df[num_cols].mean()
            sns.barplot(x=mean_values.index, y=mean_values.values, ax=axes[1])
            axes[1].set_title("Mean Values of Numeric Columns")
            axes[1].tick_params(axis='x', rotation=45)

            # Log mean values to Weights & Biases
            # NOTE(review): this loop must remain inside the if-block --
            # mean_values is undefined when there are no numeric columns.
            for col, mean_val in mean_values.items():
                wandb.log({f"Mean Values/{col}": mean_val})

        # Save figure to buffer
        buf = io.BytesIO()
        plt.tight_layout()
        plt.savefig(buf, format='png', bbox_inches='tight')
        plt.close(fig)
        buf.seek(0)

        # Convert figure to NumPy array
        image = Image.open(buf)
        image_array = np.array(image)

        # Log image to Weights & Biases
        wandb.log({"Dataset Overview": wandb.Image(image)})

        return summary_text, image_array

    except Exception as e:
        return f"Error loading data: {str(e)}", None
199
+
200
+
201
# Gradio interface setup
def gradio_interface():
    """Build the Gradio Blocks UI wiring the explore and benchmark actions.

    Returns the assembled ``gr.Blocks`` app (not yet launched).
    """
    # Thin wrappers kept under their original names so the auto-generated
    # Gradio endpoint names stay the same.
    def explore_data():
        return explore_dataset()

    def run_and_plot():
        return run_benchmark()

    with gr.Blocks() as demo:
        gr.Markdown("## Explore Dataset")
        explore_button = gr.Button("Explore Data")
        summary_text = gr.Textbox(label="Dataset Summary")
        explore_image = gr.Image(label="Feature Distributions")
        explore_button.click(explore_data, outputs=[summary_text, explore_image])

        gr.Markdown("## Benchmarking Different Data Loading Libraries")

        run_button = gr.Button("Run Benchmark")
        result_text = gr.Textbox(label="Benchmark Results")
        plot_image = gr.Image(label="Performance Graph")

        run_button.click(run_and_plot, outputs=[result_text, plot_image])
    return demo
226
+
227
# Build the UI once at import time so Hugging Face Spaces can serve it.
demo = gradio_interface()

# Run the Gradio app
demo.launch(share=False)  # local serving is sufficient; Spaces handles hosting