AnnsKhan commited on
Commit
b469f9b
·
1 Parent(s): b2f66f6
Files changed (4) hide show
  1. .github/workflows/deploy.yml +46 -0
  2. .gitignore +2 -1
  3. README.md +13 -1
  4. app.py +230 -0
.github/workflows/deploy.yml ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
name: Deploy to Hugging Face Spaces

on:
  push:
    branches:
      - main # Change this if your default branch is different

jobs:
  Deploy:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: "3.10"

      - name: Install dependencies
        run: |
          pip install huggingface_hub

      - name: Configure huggingface-cli
        run: |
          # FIX: do not echo the token. Even though GitHub masks registered
          # secrets in logs, printing credentials is an avoidable risk.
          huggingface-cli login --token ${{ secrets.HF_TOKEN }}

      - name: Set up Git
        run: |
          git config --global user.name "github-actions[bot]"
          git config --global user.email "github-actions[bot]@users.noreply.github.com"

      - name: Add Hugging Face remote
        run: |
          git remote add huggingface https://huggingface:${{ secrets.HF_TOKEN }}@huggingface.co/spaces/AnnsKhan/billion_row_challenge

      - name: Fetch and reset to main
        run: |
          git fetch huggingface
          # NOTE(review): this resets to origin/main, not huggingface/main --
          # confirm that is intended before the force push below.
          git reset --hard origin/main

      - name: Push to Hugging Face Hub
        run: |
          git push huggingface main --force
.gitignore CHANGED
@@ -25,7 +25,8 @@ share/python-wheels/
25
  .installed.cfg
26
  *.egg
27
  MANIFEST
28
-
 
29
  # PyInstaller
30
  # Usually these files are written by a python script from a template
31
  # before PyInstaller builds the exe, so as to inject date/other infos into it.
 
25
  .installed.cfg
26
  *.egg
27
  MANIFEST
28
+ wandb/
29
+ data/
30
  # PyInstaller
31
  # Usually these files are written by a python script from a template
32
  # before PyInstaller builds the exe, so as to inject date/other infos into it.
README.md CHANGED
@@ -1 +1,13 @@
1
- # billionrows_
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Billion Row Challenge
3
+ emoji: 🌖
4
+ colorFrom: yellow
5
+ colorTo: purple
6
+ sdk: gradio
7
+ sdk_version: 5.16.0
8
+ app_file: app.py
9
+ pinned: false
10
+ short_description: Benchmark data loading libraries on 1B rows
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,230 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI
2
+ from fastapi.middleware.cors import CORSMiddleware
3
+ import gradio as gr
4
+ import time
5
+ import psutil
6
+ import tracemalloc
7
+ import gc
8
+ import pandas as pd
9
+ import dask.dataframe as dd
10
+ import polars as pl
11
+ import duckdb
12
+ import seaborn as sns
13
+ import matplotlib.pyplot as plt
14
+ import io
15
+ import os
16
+ from PIL import Image
17
+ import numpy as np
18
+ import matplotlib
19
+ import wandb
20
+
21
# Start a Weights & Biases run; every wandb.log() call below attaches to it.
wandb.init(project="billion-row-analysis", name="benchmarking")


# Select the Dask backend for Modin.
# NOTE(review): modin is never imported anywhere in this file, so this
# setting appears unused -- confirm whether a modin loader was planned.
os.environ["MODIN_ENGINE"] = "dask"

# Initialize FastAPI app
app = FastAPI()

# Allow cross-origin requests from any host.
# NOTE(review): browsers reject allow_origins=["*"] combined with
# allow_credentials=True; confirm credentials are actually required.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
36
+
37
# Performance measurement function
def measure_performance(load_function, *args):
    """Run ``load_function(*args)`` and measure its resource usage.

    Parameters
    ----------
    load_function : callable
        Zero-or-more-argument function that loads a dataset.
    *args
        Forwarded to ``load_function``.

    Returns
    -------
    tuple
        ``(data, elapsed_seconds, cpu_delta_pct, rss_delta_pct, peak_pct)``
        where the three percentages are relative to total system memory
        (RSS/peak) or total CPU (cpu delta).
    """
    gc.collect()
    tracemalloc.start()

    total_memory = psutil.virtual_memory().total  # total system memory, bytes

    # FIX: take the blocking 1-second CPU sample *before* starting the clock.
    # The original started the timer first, so every measured load time was
    # inflated by the sampling interval (~1 s).
    start_cpu = psutil.cpu_percent(interval=1)
    start_memory = psutil.Process().memory_info().rss / total_memory * 100  # % of RAM

    start_time = time.time()
    data = load_function(*args)
    end_time = time.time()

    end_memory = psutil.Process().memory_info().rss / total_memory * 100  # % of RAM
    end_cpu = psutil.cpu_percent(interval=1)

    _, peak_memory = tracemalloc.get_traced_memory()
    tracemalloc.stop()

    peak_memory_percentage = peak_memory / total_memory * 100  # % of RAM

    # Deltas are clamped at 0 because CPU% and RSS can legitimately decrease.
    return (
        data,
        end_time - start_time,
        max(end_cpu - start_cpu, 0),
        max(end_memory - start_memory, 0),
        peak_memory_percentage,
    )
59
+
60
# Data loading functions
def load_data_python_vectorized():
    """Load the parquet file and return its numeric columns as NumPy arrays.

    Returns a dict mapping each numeric column name to a NumPy array,
    ready for vectorized operations.
    """
    frame = pd.read_parquet('data/raw/jan_2024.parquet')
    numeric_columns = frame.select_dtypes(include=['number']).columns
    return {column: frame[column].to_numpy() for column in numeric_columns}
68
+
69
def load_data_pandas():
    """Eagerly load the benchmark parquet file as a pandas DataFrame."""
    frame = pd.read_parquet('data/raw/jan_2024.parquet')
    return frame
71
+
72
def load_data_dask():
    """Lazily load the benchmark parquet file as a Dask DataFrame."""
    lazy_frame = dd.read_parquet('data/raw/jan_2024.parquet')
    return lazy_frame
74
+
75
def load_data_polars():
    """Eagerly load the benchmark parquet file as a Polars DataFrame."""
    frame = pl.read_parquet('data/raw/jan_2024.parquet')
    return frame
77
+
78
def load_data_duckdb():
    """Open the benchmark parquet file as a DuckDB relation (lazy)."""
    relation = duckdb.read_parquet('data/raw/jan_2024.parquet')
    return relation
80
+
81
# Loaders list
# (loader callable, display name) pairs consumed by run_benchmark();
# list order determines the order bars appear in the result plots.
loaders = [
    (load_data_pandas, "Pandas"),
    (load_data_dask, "Dask"),
    (load_data_polars, "Polars"),
    (load_data_duckdb, "DuckDB"),
    (load_data_python_vectorized, "Python Vectorized"),
]
89
+
90
def run_benchmark():
    """Benchmark every registered loader and plot the results.

    Returns
    -------
    tuple
        ``(markdown_table, image_array)`` on success, or
        ``(error_text, None)`` if any loader raised.
    """
    benchmark_results = []
    error_messages = []

    for loader, lib_name in loaders:
        try:
            _, load_time, cpu_load, mem_load, peak_mem_load = measure_performance(loader)

            # Build the metrics record once; log it to W&B and keep it locally
            # (the original duplicated this dict literal verbatim).
            metrics = {
                "Library": lib_name,
                "Load Time (s)": load_time,
                "CPU Load (%)": cpu_load,
                "Memory Load (%)": mem_load,
                "Peak Memory (%)": peak_mem_load,
            }
            wandb.log(metrics)
            benchmark_results.append(metrics)

        except Exception as e:
            error_messages.append(f"{lib_name} Error: {str(e)}")

    # Preserve original behavior: any failure aborts the whole report.
    if error_messages:
        return '\n'.join(error_messages), None

    benchmark_df = pd.DataFrame(benchmark_results)

    sns.set(style="whitegrid")
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    fig.suptitle("Benchmark Results", fontsize=16)

    sns.barplot(x="Library", y="Load Time (s)", data=benchmark_df, ax=axes[0, 0])
    sns.barplot(x="Library", y="CPU Load (%)", data=benchmark_df, ax=axes[0, 1])
    sns.barplot(x="Library", y="Memory Load (%)", data=benchmark_df, ax=axes[1, 0])
    sns.barplot(x="Library", y="Peak Memory (%)", data=benchmark_df, ax=axes[1, 1])

    buf = io.BytesIO()
    plt.savefig(buf, format='png')
    plt.close(fig)  # FIX: figure was never closed, leaking one figure per run
    buf.seek(0)

    # Convert plot to an image and log it to wandb
    image = Image.open(buf)
    wandb.log({"Benchmark Results": wandb.Image(image)})

    image_array = np.array(image)

    return benchmark_df.to_markdown(), image_array  # Return NumPy array
143
+
144
+
145
# Headless backend: figures are rendered to buffers, no display required.
matplotlib.use("Agg")
def explore_dataset():
    """Summarise the raw parquet dataset and render a two-panel overview.

    Returns
    -------
    tuple
        ``(markdown_summary, image_array)`` on success, or
        ``(error_message, None)`` if the file cannot be read.
    """
    try:
        df = pd.read_parquet('data/raw/jan_2024.parquet')

        # Generate dataset summary (per-column stats, missing/unique counts)
        summary = df.describe(include='all').T
        summary["missing_values"] = df.isnull().sum()
        summary["unique_values"] = df.nunique()
        summary_text = summary.to_markdown()

        # Log dataset summary as text in Weights & Biases
        wandb.log({"Dataset Summary": wandb.Html(summary_text)})

        # Prepare for visualization
        fig, axes = plt.subplots(1, 2, figsize=(14, 5))
        fig.suptitle("Dataset Overview", fontsize=16)

        # Plot data type distribution
        data_types = df.dtypes.value_counts()
        sns.barplot(x=data_types.index.astype(str), y=data_types.values, ax=axes[0])
        axes[0].set_title("Column Count by Data Type")
        axes[0].set_ylabel("Count")

        # Plot mean values of numeric columns
        num_cols = df.select_dtypes(include=['number']).columns
        if len(num_cols) > 0:
            mean_values = df[num_cols].mean()
            sns.barplot(x=mean_values.index, y=mean_values.values, ax=axes[1])
            axes[1].set_title("Mean Values of Numeric Columns")
            axes[1].tick_params(axis='x', rotation=45)

            # Log mean values to Weights & Biases
            # NOTE(review): this loop must remain inside the if-block --
            # mean_values is undefined when there are no numeric columns.
            for col, mean_val in mean_values.items():
                wandb.log({f"Mean Values/{col}": mean_val})

        # Save figure to buffer
        buf = io.BytesIO()
        plt.tight_layout()
        plt.savefig(buf, format='png', bbox_inches='tight')
        plt.close(fig)
        buf.seek(0)

        # Convert figure to NumPy array
        image = Image.open(buf)
        image_array = np.array(image)

        # Log image to Weights & Biases
        wandb.log({"Dataset Overview": wandb.Image(image)})

        return summary_text, image_array

    except Exception as e:
        return f"Error loading data: {str(e)}", None
199
+
200
+
201
# Gradio interface setup
def gradio_interface():
    """Build the Gradio Blocks UI wiring the explore and benchmark actions.

    Returns the assembled ``gr.Blocks`` app (not yet launched).
    """
    # Thin wrappers kept under their original names so the auto-generated
    # Gradio endpoint names stay the same.
    def explore_data():
        return explore_dataset()

    def run_and_plot():
        return run_benchmark()

    with gr.Blocks() as demo:
        gr.Markdown("## Explore Dataset")
        explore_button = gr.Button("Explore Data")
        summary_text = gr.Textbox(label="Dataset Summary")
        explore_image = gr.Image(label="Feature Distributions")
        explore_button.click(explore_data, outputs=[summary_text, explore_image])

        gr.Markdown("## Benchmarking Different Data Loading Libraries")

        run_button = gr.Button("Run Benchmark")
        result_text = gr.Textbox(label="Benchmark Results")
        plot_image = gr.Image(label="Performance Graph")

        run_button.click(run_and_plot, outputs=[result_text, plot_image])
    return demo
226
+
227
# Build the UI once at import time so Hugging Face Spaces can serve it.
demo = gradio_interface()

# Run the Gradio app
demo.launch(share=False)  # local serving is sufficient; Spaces handles hosting