AnnsKhan committed on
Commit
05c5acf
·
1 Parent(s): d7a9790
Files changed (1) hide show
  1. app.py +10 -8
app.py CHANGED
@@ -17,15 +17,17 @@ from PIL import Image
17
  import numpy as np
18
  import matplotlib
19
  import wandb
20
- # from datasets import load_dataset
21
 
22
  # Load dataset once at the start to avoid redundant requests
23
  # dataset = load_dataset("Chendi/NYC_TAXI_FARE_CLEANED")
24
 
25
  wandb.login(key=os.getenv("WANDB_API_KEY"))
26
  wandb.init(project="billion-row-analysis", name="benchmarking")
27
-
28
-
 
 
29
  os.environ["MODIN_ENGINE"] = "dask"
30
 
31
  # Initialize FastAPI app
@@ -83,7 +85,7 @@ def measure_performance(load_function, *args):
83
 
84
  # Data loading functions
85
  def load_data_python_vectorized():
86
- df = pd.read_parquet('data/raw/jan_2024.parquet')
87
 
88
  # Convert numerical columns to NumPy arrays for vectorized operations
89
  num_cols = df.select_dtypes(include=['number']).columns
@@ -91,16 +93,16 @@ def load_data_python_vectorized():
91
  return np_data
92
 
93
  def load_data_pandas():
94
- return pd.read_parquet('data/raw/jan_2024.parquet')
95
 
96
  def load_data_dask():
97
- return dd.read_parquet('data/raw/jan_2024.parquet')
98
 
99
  def load_data_polars():
100
- return pl.read_parquet('data/raw/jan_2024.parquet')
101
 
102
  def load_data_duckdb():
103
- return duckdb.read_parquet('data/raw/jan_2024.parquet')
104
 
105
  # Loaders list
106
  loaders = [
 
17
  import numpy as np
18
  import matplotlib
19
  import wandb
20
+ from datasets import load_dataset
21
 
22
  # Load dataset once at the start to avoid redundant requests
23
  # dataset = load_dataset("Chendi/NYC_TAXI_FARE_CLEANED")
24
 
25
  wandb.login(key=os.getenv("WANDB_API_KEY"))
26
  wandb.init(project="billion-row-analysis", name="benchmarking")
27
+ dataset = load_dataset("AnnsKhan/jan_2024_nyc", split="train")
28
+ parquet_path = "data/raw/jan_2024.parquet"
29
+ if not os.path.exists(parquet_path):
30
+ dataset.to_pandas().to_parquet(parquet_path) # Save to disk
31
  os.environ["MODIN_ENGINE"] = "dask"
32
 
33
  # Initialize FastAPI app
 
85
 
86
  # Data loading functions
87
  def load_data_python_vectorized():
88
+ df = pd.read_parquet(parquet_path)
89
 
90
  # Convert numerical columns to NumPy arrays for vectorized operations
91
  num_cols = df.select_dtypes(include=['number']).columns
 
93
  return np_data
94
 
95
  def load_data_pandas():
96
+ return pd.read_parquet(parquet_path)
97
 
98
  def load_data_dask():
99
+ return dd.read_parquet(parquet_path)
100
 
101
  def load_data_polars():
102
+ return pl.read_parquet(parquet_path)
103
 
104
  def load_data_duckdb():
105
+ return duckdb.read_parquet(parquet_path)
106
 
107
  # Loaders list
108
  loaders = [