Spaces:
Sleeping
Sleeping
updated
Browse files
app.py
CHANGED
@@ -17,15 +17,17 @@ from PIL import Image
|
|
17 |
import numpy as np
|
18 |
import matplotlib
|
19 |
import wandb
|
20 |
-
|
21 |
|
22 |
# Load dataset once at the start to avoid redundant requests
|
23 |
# dataset = load_dataset("Chendi/NYC_TAXI_FARE_CLEANED")
|
24 |
|
25 |
wandb.login(key=os.getenv("WANDB_API_KEY"))
|
26 |
wandb.init(project="billion-row-analysis", name="benchmarking")
|
27 |
-
|
28 |
-
|
|
|
|
|
29 |
os.environ["MODIN_ENGINE"] = "dask"
|
30 |
|
31 |
# Initialize FastAPI app
|
@@ -83,7 +85,7 @@ def measure_performance(load_function, *args):
|
|
83 |
|
84 |
# Data loading functions
|
85 |
def load_data_python_vectorized():
|
86 |
-
df = pd.read_parquet(
|
87 |
|
88 |
# Convert numerical columns to NumPy arrays for vectorized operations
|
89 |
num_cols = df.select_dtypes(include=['number']).columns
|
@@ -91,16 +93,16 @@ def load_data_python_vectorized():
|
|
91 |
return np_data
|
92 |
|
93 |
def load_data_pandas():
|
94 |
-
return pd.read_parquet(
|
95 |
|
96 |
def load_data_dask():
|
97 |
-
return dd.read_parquet(
|
98 |
|
99 |
def load_data_polars():
|
100 |
-
return pl.read_parquet(
|
101 |
|
102 |
def load_data_duckdb():
|
103 |
-
return duckdb.read_parquet(
|
104 |
|
105 |
# Loaders list
|
106 |
loaders = [
|
|
|
17 |
import numpy as np
|
18 |
import matplotlib
|
19 |
import wandb
|
20 |
+
from datasets import load_dataset
|
21 |
|
22 |
# Load dataset once at the start to avoid redundant requests
|
23 |
# dataset = load_dataset("Chendi/NYC_TAXI_FARE_CLEANED")
|
24 |
|
25 |
wandb.login(key=os.getenv("WANDB_API_KEY"))
|
26 |
wandb.init(project="billion-row-analysis", name="benchmarking")
|
27 |
+
dataset = load_dataset("AnnsKhan/jan_2024_nyc", split="train")
|
28 |
+
parquet_path = "data/raw/jan_2024.parquet"
|
29 |
+
if not os.path.exists(parquet_path):
|
30 |
+
dataset.to_pandas().to_parquet(parquet_path) # Save to disk
|
31 |
os.environ["MODIN_ENGINE"] = "dask"
|
32 |
|
33 |
# Initialize FastAPI app
|
|
|
85 |
|
86 |
# Data loading functions
|
87 |
def load_data_python_vectorized():
|
88 |
+
df = pd.read_parquet(parquet_path)
|
89 |
|
90 |
# Convert numerical columns to NumPy arrays for vectorized operations
|
91 |
num_cols = df.select_dtypes(include=['number']).columns
|
|
|
93 |
return np_data
|
94 |
|
95 |
def load_data_pandas():
|
96 |
+
return pd.read_parquet(parquet_path)
|
97 |
|
98 |
def load_data_dask():
|
99 |
+
return dd.read_parquet(parquet_path)
|
100 |
|
101 |
def load_data_polars():
|
102 |
+
return pl.read_parquet(parquet_path)
|
103 |
|
104 |
def load_data_duckdb():
|
105 |
+
return duckdb.read_parquet(parquet_path)
|
106 |
|
107 |
# Loaders list
|
108 |
loaders = [
|