wayne-chi committed
Commit 78575a4 · verified · 1 Parent(s): 55c27cb

Upload 6 files

Files changed (6)
  1. download_models.py +28 -0
  2. inference.py +242 -0
  3. predictor.py +640 -0
  4. requirements.txt +13 -3
  5. setup.sh +2 -0
  6. streamlit_app.py +15 -0
download_models.py ADDED
@@ -0,0 +1,28 @@
+ import os
+ from huggingface_hub import hf_hub_download, snapshot_download
+
+ # Target directory for models
+ target_dir = "Models"
+ os.makedirs(target_dir, exist_ok=True)
+
+ # Download specific files (Folds 1–5) from willieseun/Eagle-Team-TabPFN
+ print("Downloading fold models from willieseun/Eagle-Team-TabPFN...")
+ for i in range(1, 6):
+     file_name = f"Fold_{i}_best_model.tabpfn_fit"
+     model_path = hf_hub_download(
+         repo_id="willieseun/Eagle-Team-TabPFN",
+         filename=file_name,
+         local_dir=target_dir
+     )
+     print(f"Downloaded: {model_path}")
+
+ # Download full snapshot from wayne-chi/Eagle_Team
+ print("\nDownloading snapshot from wayne-chi/Eagle_Team...")
+ snapshot_download(
+     repo_id="wayne-chi/Eagle_Team",
+     revision="main",  # Optional, default is "main"
+     local_dir=target_dir,
+     local_dir_use_symlinks=False
+ )
+
+ print("\n✅ All models downloaded successfully to:", target_dir)
inference.py ADDED
@@ -0,0 +1,242 @@
+ import pandas as pd
+ import numpy as np
+ import torch
+ import joblib
+ import argparse
+ import os
+ import glob
+ # Model classes referenced by the pickled ensembles
+ from sklearn.multioutput import MultiOutputRegressor
+ from tabpfn_extensions.post_hoc_ensembles.sklearn_interface import AutoTabPFNRegressor
+ from tabpfn import TabPFNRegressor
+
+
+ os.environ["TABPFN_ALLOW_CPU_LARGE_DATASET"] = "true"
+
+ def joblib_load_cpu(path):
+     # Patch torch.load globally inside joblib to always load on CPU
+     original_load = torch.load
+
+     def cpu_loader(*args, **kwargs):
+         kwargs['map_location'] = torch.device('cpu')
+         return original_load(*args, **kwargs)
+
+     torch.load = cpu_loader
+     try:
+         model = joblib.load(path)
+     finally:
+         torch.load = original_load  # Restore original torch.load
+     return model
+
+ class TabPFNEnsemblePredictor:
+     """
+     A class to load an ensemble of TabPFN models and generate averaged predictions.
+
+     This class is designed to find and load all k-fold models from a specified
+     directory, handle the necessary feature engineering, and produce a single,
+     ensembled prediction from various input types (DataFrame, numpy array, or CSV file path).
+
+     Attributes:
+         model_paths (list): A list of file paths for the loaded models.
+         models (list): A list of the loaded model objects.
+         target_cols (list): The names of the target columns for the output DataFrame.
+     """
+
+     def __init__(self, model_dir: str, model_pattern: str = "Fold_*_best_model.tabpfn_fit*"):
+         """
+         Initializes the predictor by finding and loading the ensemble of models.
+
+         Args:
+             model_dir (str): The directory containing the saved .tabpfn_fit model files.
+             model_pattern (str, optional): The glob pattern to find model files.
+                 Defaults to "Fold_*_best_model.tabpfn_fit*".
+
+         Raises:
+             FileNotFoundError: If no models matching the pattern are found in the directory.
+         """
+         print("Initializing the TabPFN Ensemble Predictor...")
+         self.model_paths = sorted(glob.glob(os.path.join(model_dir, model_pattern)))
+         if not self.model_paths:
+             raise FileNotFoundError(
+                 f"Error: No models found in '{model_dir}' matching the pattern '{model_pattern}'"
+             )
+
+         print(f"Found {len(self.model_paths)} models to form the ensemble.")
+         self.models = self._load_models()
+         self.target_cols = [f"BlendProperty{i}" for i in range(1, 11)]
+
+     def _load_models(self) -> list:
+         """
+         Loads the TabPFN models from the specified paths, targeting CPU or GPU
+         depending on availability.
+
+         This is a private method called during initialization.
+         """
+         loaded_models = []
+         for model_path in self.model_paths:
+             print(f"Loading model: {os.path.basename(model_path)}...")
+             try:
+                 if not torch.cuda.is_available():
+                     # Move model components to CPU for inference to avoid potential CUDA errors
+                     # and ensure compatibility on machines without a GPU.
+                     model = joblib_load_cpu(model_path)
+                     for estimator in model.estimators_:
+                         estimator.device = "cpu"
+                         estimator.max_time = 40
+                     print("CUDA not available; using CPU.")
+                 else:
+                     print("CUDA is available.")
+                     model = joblib.load(model_path)
+                     for estimator in model.estimators_:
+                         if hasattr(estimator, "predictor_") and hasattr(estimator.predictor_, "predictors"):
+                             for p in estimator.predictor_.predictors:
+                                 p.to("cuda")
+
+                 loaded_models.append(model)
+                 print(f"Successfully loaded {os.path.basename(model_path)}")
+             except Exception as e:
+                 print(f"Warning: Could not load model from {model_path}. Skipping. Error: {e}")
+         return loaded_models
+
+     @staticmethod
+     def _feature_engineering(df: pd.DataFrame) -> pd.DataFrame:
+         """
+         Applies feature engineering to the input dataframe. This is a static method
+         as it does not depend on the state of the class instance.
+
+         Args:
+             df (pd.DataFrame): The input dataframe.
+
+         Returns:
+             pd.DataFrame: The dataframe with new engineered features.
+         """
+         components = ['Component1', 'Component2', 'Component3', 'Component4', 'Component5']
+         properties = [f'Property{i}' for i in range(1, 11)]
+         df_featured = df.copy()
+
+         for prop in properties:
+             df_featured[f'Weighted_{prop}'] = sum(
+                 df_featured[f'{comp}_fraction'] * df_featured[f'{comp}_{prop}'] for comp in components
+             )
+             cols = [f'{comp}_{prop}' for comp in components]
+             df_featured[f'{prop}_variance'] = df_featured[cols].var(axis=1)
+             df_featured[f'{prop}_range'] = df_featured[cols].max(axis=1) - df_featured[cols].min(axis=1)
+
+         return df_featured
+
+     def custom_predict(self, input_data: pd.DataFrame | np.ndarray | str) -> tuple[np.ndarray, pd.DataFrame]:
+         """
+         Generates ensembled predictions for the given input data.
+
+         This method takes input data, preprocesses it if necessary, generates a
+         prediction from each model in the ensemble, and returns the averaged result.
+
+         Args:
+             input_data (pd.DataFrame | np.ndarray | str): The input data for prediction.
+                 Can be a pandas DataFrame, a numpy array (must be pre-processed),
+                 or a string path to a CSV file.
+
+         Returns:
+             tuple: A tuple containing:
+                 - np.ndarray: The averaged predictions as a numpy array.
+                 - pd.DataFrame: The averaged predictions as a pandas DataFrame.
+         """
+         if not self.models:
+             print("Error: No models were loaded. Cannot make predictions.")
+             return None, None
+
+         # --- Data Preparation ---
+         if isinstance(input_data, str) and os.path.isfile(input_data):
+             print(f"Loading and processing data from CSV: {input_data}")
+             test_df = pd.read_csv(input_data)
+             processed_df = self._feature_engineering(test_df)
+         elif isinstance(input_data, pd.DataFrame):
+             print("Processing input DataFrame...")
+             processed_df = self._feature_engineering(input_data)
+         elif isinstance(input_data, np.ndarray):
+             print("Using input numpy array directly (assuming it's pre-processed).")
+             sub = input_data
+         else:
+             raise TypeError("Input data must be a pandas DataFrame, a numpy array, or a path to a CSV file.")
+
+         if isinstance(input_data, (str, pd.DataFrame)):
+             if "ID" in processed_df.columns:
+                 sub = processed_df.drop(columns=["ID"]).values
+             else:
+                 sub = processed_df.values
+
+         # --- Prediction Loop ---
+         all_fold_predictions = []
+         print("\nGenerating predictions from the model ensemble...")
+         for i, model in enumerate(self.models):
+             try:
+                 y_sub = model.predict(sub)
+                 all_fold_predictions.append(y_sub)
+                 print(f" - Prediction from model {i+1} completed.")
+             except Exception as e:
+                 print(f" - Warning: Could not predict with model {i+1}. Skipping. Error: {e}")
+
+         if not all_fold_predictions:
+             print("\nError: No predictions were generated from any model.")
+             return None, None
+
+         # --- Averaging ---
+         print("\nAveraging predictions from all models...")
+         averaged_preds_array = np.mean(all_fold_predictions, axis=0)
+         averaged_preds_df = pd.DataFrame(averaged_preds_array, columns=self.target_cols)
+         print("Ensemble prediction complete.")
+
+         return averaged_preds_array, averaged_preds_df
+
+ # This block allows the script to be run directly from the command line
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(
+         description="""
+         Command-line interface for the TabPFNEnsemblePredictor.
+
+         Example Usage:
+         python inference.py --model_dir ./saved_models/ --input_path ./test_data.csv --output_path ./final_preds.csv
+         """,
+         formatter_class=argparse.RawTextHelpFormatter
+     )
+
+     parser.add_argument("--model_dir", type=str, required=True,
+                         help="Directory containing the saved .tabpfn_fit model files.")
+     parser.add_argument("--input_path", type=str, required=True,
+                         help="Path to the input CSV file for prediction.")
+     parser.add_argument("--output_path", type=str, default="predictions_ensembled.csv",
+                         help="Path to save the final ensembled predictions CSV file.")
+
+     args = parser.parse_args()
+
+     if not os.path.isdir(args.model_dir):
+         print(f"Error: Model directory not found at {args.model_dir}")
+     elif not os.path.exists(args.input_path):
+         print(f"Error: Input file not found at {args.input_path}")
+     else:
+         try:
+             # 1. Instantiate the predictor class
+             predictor = TabPFNEnsemblePredictor(model_dir=args.model_dir)
+
+             # 2. Call the custom_predict method (the class defines no plain predict)
+             preds_array, preds_df = predictor.custom_predict(args.input_path)
+
+             # 3. Save the results
+             if preds_df is not None:
+                 preds_df.to_csv(args.output_path, index=False)
+                 print(f"\nEnsembled predictions successfully saved to {args.output_path}")
+                 print("\n--- Sample of Final Averaged Predictions ---")
+                 print(preds_df.head())
+                 print("------------------------------------------")
+
+         except Exception as e:
+             print(f"\nAn error occurred during the process: {e}")
predictor.py ADDED
@@ -0,0 +1,640 @@
+ import pandas as pd
+ import numpy as np
+ import torch
+ import random
+ import itertools
+ import os
+ import joblib
+
+ import matplotlib.pyplot as plt
+
+ from sklearn.linear_model import LinearRegression
+ from sklearn.tree import DecisionTreeRegressor
+ from sklearn.ensemble import RandomForestRegressor
+ from sklearn.svm import SVR
+ from sklearn.model_selection import train_test_split, KFold
+ from sklearn.multioutput import MultiOutputRegressor
+ from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
+ from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
+
+ from tabpfn import TabPFNRegressor
+
+ from itertools import combinations
+ from scipy.special import comb
+
+ from inference import TabPFNEnsemblePredictor  # ensemble predictor for properties 3, 4, 8 and 9
+
+
+ class EagleBlendPredictor:
+     def __init__(self, model_sources='./Models'):
+         """
+         model_sources: str
+             Path to the directory containing the saved per-property models and
+             feature transformers (as downloaded by download_models.py).
+         """
+         self.home = model_sources
+         self.saved_files_map = {
+             1: {
+                 "model": 'linear_model_poly_target_1.joblib',
+                 "transform": 'poly1_features.joblib'
+             },
+             2: {
+                 "model": 'linear_model_poly_target_2.joblib',
+                 "transform": 'poly2_features.joblib'
+             },
+             5: {
+                 "model": 'tabpfn_model_target_5.joblib',  # alternative: 'tabpfn_model_target_5_cpu.tabpfn_fit'
+                 "transform": 'poly5_features.joblib'
+             },
+             6: {
+                 "model": 'linear_model_poly_target_6.joblib',
+                 "transform": 'poly6_features.joblib'
+             },
+             7: {
+                 "model": 'tabpfn_model_target_7.joblib',
+                 # For Property 7, the transformation is the mixture feature generation,
+                 # which is not a saved object like PolynomialFeatures; the
+                 # generate_mixture_features method is applied instead.
+                 "transform_function": "generate_mixture_features"
+             },
+             8: {
+                 # For Property 8, the "model" is the initial prediction model (not explicitly
+                 # saved in this workflow) and the correction is the piecewise function
+                 # defined by the parameters and threshold below.
+                 "params": 'piecewise_params_prop8.joblib',
+                 "threshold": 'piecewise_threshold_prop8.joblib',
+                 "correction_function": "piecewise_model"  # Reference the function name
+             },
+             10: {
+                 "model": 'linear_model_poly_target_10.joblib',
+                 "transform": 'poly10_features.joblib'
+             }
+         }
+
+         # Load models and transformers manually
+         self.model_1 = joblib.load(os.path.join(self.home, self.saved_files_map[1]["model"]))
+         self.poly_1 = joblib.load(os.path.join(self.home, self.saved_files_map[1]["transform"]))
+
+         self.model_2 = joblib.load(os.path.join(self.home, self.saved_files_map[2]["model"]))
+         self.poly_2 = joblib.load(os.path.join(self.home, self.saved_files_map[2]["transform"]))
+
+         self.model_5 = joblib.load(os.path.join(self.home, self.saved_files_map[5]["model"]))
+         self.poly_5 = joblib.load(os.path.join(self.home, self.saved_files_map[5]["transform"]))
+
+         self.model_6 = joblib.load(os.path.join(self.home, self.saved_files_map[6]["model"]))
+         self.poly_6 = joblib.load(os.path.join(self.home, self.saved_files_map[6]["transform"]))
+
+         self.model_7 = joblib.load(os.path.join(self.home, self.saved_files_map[7]["model"]))
+         # No saved transform for model_7; generate_mixture_features is applied at prediction time
+
+         self.piecewise_params_8 = joblib.load(os.path.join(self.home, self.saved_files_map[8]["params"]))
+         self.piecewise_threshold_8 = joblib.load(os.path.join(self.home, self.saved_files_map[8]["threshold"]))
+         # The piecewise_model method applies this correction later
+
+         self.model_10 = joblib.load(os.path.join(self.home, self.saved_files_map[10]["model"]))
+         self.poly_10 = joblib.load(os.path.join(self.home, self.saved_files_map[10]["transform"]))
+
+         # TabPFN ensemble covering BlendProperty3, 4, 8 and 9, loaded from the same directory
+         self.model_3489 = TabPFNEnsemblePredictor(model_dir=self.home)
+
+     def piecewise_model(self, x, boundaries=np.linspace(-0.2, 0.2, 10 + 1)[1:-1]):
+         """
+         Piecewise-linear correction for BlendProperty8.
+
+         x: a single float value
+         boundaries: 9 values that divide x into 10 regions; the 20 fitted
+             parameters [A1, B1, A2, B2, ..., A10, B10] come from self.piecewise_params_8.
+         """
+         params = self.piecewise_params_8
+         # Unpack parameters into (slope, intercept) pairs
+         segments = [(params[i], params[i + 1]) for i in range(0, 20, 2)]
+
+         # Piecewise logic using boundaries
+         for i, bound in enumerate(boundaries):
+             if x < bound:
+                 A, B = segments[i]
+                 return A * x + B
+         # If x is greater than all boundaries, use the last segment
+         A, B = segments[-1]
+         return A * x + B
+
+     def predict_BlendProperty1(self, data, full=True):
+         if full:
+             features = self._transform1(data)
+             features = self.poly_1.transform(features)
+         else:
+             features = self.poly_1.transform(data)
+         res_df = self.model_1.predict(features)
+         return pd.DataFrame(res_df, columns=['BlendProperty1'])
+
+     def predict_BlendProperty2(self, data, full=True):
+         if full:
+             features = self._transform2(data)
+             features = self.poly_2.transform(features)
+         else:
+             features = self.poly_2.transform(data)
+         res_df = self.model_2.predict(features)
+         return pd.DataFrame(res_df, columns=['BlendProperty2'])
+
+     def predict_BlendProperty3489(self, df):
+         _array, result_df = self.model_3489.custom_predict(df)
+         ans_df = result_df[['BlendProperty3', 'BlendProperty4', 'BlendProperty8', 'BlendProperty9']].copy()  # Explicitly create a copy
+
+         # Apply the fitted piecewise correction to small BlendProperty8 values
+         # and zero out near-zero BlendProperty9 predictions.
+         mask8 = ans_df['BlendProperty8'].abs() < 0.2
+         ans_df.loc[mask8, 'BlendProperty8'] = ans_df.loc[mask8, 'BlendProperty8'].apply(self.piecewise_model)
+         ans_df.loc[ans_df['BlendProperty9'].abs() < 0.1, 'BlendProperty9'] = 0
+
+         return ans_df
+
+     def predict_BlendProperty5(self, data, full=True):
+         if full:
+             features = self._transform5(data)
+             features = self.poly_5.transform(features)
+         else:
+             features = self.poly_5.transform(data)
+         res_df = self.model_5.predict(features)
+         return pd.DataFrame(res_df, columns=['BlendProperty5'])
+
+     def predict_BlendProperty6(self, data, full=True):
+         if full:
+             features = self._transform6(data)
+             features = self.poly_6.transform(features)
+         else:
+             features = self.poly_6.transform(data)
+         res_df = self.model_6.predict(features)
+         return pd.DataFrame(res_df, columns=['BlendProperty6'])
+
+     def predict_BlendProperty7(self, data, full=True) -> pd.DataFrame:
+         if full:
+             features = self._transform7(data)
+         else:
+             raise ValueError("BlendProperty7 prediction requires full data.")
+         res_df = self.model_7.predict(features)
+         return pd.DataFrame(res_df, columns=['BlendProperty7'])
+
+     def predict_BlendProperty10(self, data, full=False) -> pd.DataFrame:
+         if full:
+             features = self._transform10(data)
+             features = self.poly_10.transform(features)
+         else:
+             features = self.poly_10.transform(data)
+         res_df = self.model_10.predict(features)
+         return pd.DataFrame(res_df, columns=['BlendProperty10'])
+
+     def predict_all(self, df: pd.DataFrame) -> pd.DataFrame:
+         """
+         Generates predictions for all blend properties using the individual prediction methods.
+
+         Args:
+             df: Input DataFrame containing the features.
+
+         Returns:
+             DataFrame with predicted blend properties from 'BlendProperty1' to 'BlendProperty10'.
+         """
+         predictions_list = []
+
+         # Predict individual properties
+         predictions_list.append(self.predict_BlendProperty1(df, full=True))
+         predictions_list.append(self.predict_BlendProperty2(df, full=True))
+
+         # Predict BlendProperty3, 4, 8 and 9 together using predict_BlendProperty3489,
+         # which returns a DataFrame with one column per property.
+         predictions_3489_df = self.predict_BlendProperty3489(df)
+         predictions_list.append(predictions_3489_df[['BlendProperty3']])
+         predictions_list.append(predictions_3489_df[['BlendProperty4']])
+         predictions_list.append(predictions_3489_df[['BlendProperty8']])
+         predictions_list.append(predictions_3489_df[['BlendProperty9']])
+
+         predictions_list.append(self.predict_BlendProperty5(df, full=True))
+         predictions_list.append(self.predict_BlendProperty6(df, full=True))
+         predictions_list.append(self.predict_BlendProperty7(df, full=True))
+         predictions_list.append(self.predict_BlendProperty10(df, full=True))
+
+         # Concatenate the list of single-column DataFrames into a single DataFrame
+         predictions_df = pd.concat(predictions_list, axis=1)
+
+         # Reindex to ensure columns are in the desired order
+         ordered_cols = [f'BlendProperty{i}' for i in range(1, 11)]
+         predictions_df = predictions_df.reindex(columns=ordered_cols)
+
+         return predictions_df
+
+     def _transform1(self, data):
+         """
+         Transforms input data (DataFrame or NumPy array) to features for BlendProperty1 prediction.
+
+         If input is a DataFrame, selects 'ComponentX_fraction' (X=1-5) and 'ComponentX_Property1' (X=1-5).
+         If input is a NumPy array, assumes the columns are already in the correct order:
+         Component1-5_fraction, Component1-5_Property1, Component1-5_Property2, ..., Component1-5_Property10
+         and selects the relevant columns for Property1.
+
+         Args:
+             data: pandas DataFrame or numpy array.
+
+         Returns:
+             numpy array of transformed features.
+         """
+         fraction_cols = [f'Component{i+1}_fraction' for i in range(5)]
+         property_cols = [f'Component{i+1}_Property1' for i in range(5)]
+         required_cols = fraction_cols + property_cols
+
+         if isinstance(data, pd.DataFrame):
+             # Select the required columns from the DataFrame
+             try:
+                 features = data[required_cols]
+             except KeyError as e:
+                 missing_col = str(e).split("'")[1]
+                 raise ValueError(f"Input DataFrame is missing required column: {missing_col}") from e
+
+         elif isinstance(data, np.ndarray):
+             # Assume the NumPy array has columns in the specified order:
+             # fractions (0-4) followed by Property1 (5-9)
+             if data.shape[1] < 10:  # Need at least 5 fractions and 5 properties
+                 raise ValueError("Input NumPy array must have at least 10 columns for this transformation.")
+
+             features = data[:, :10]  # Select first 10 columns: 5 fractions + 5 Property1
+
+         else:
+             raise TypeError("Input data must be a pandas DataFrame or a numpy array.")
+
+         # Return as a numpy array, as expected by PolynomialFeatures.transform
+         return features.values if isinstance(features, pd.DataFrame) else features
+
+     def _transform2(self, data):
+         """
+         Transforms input data (DataFrame or NumPy array) to features for BlendProperty2 prediction.
+         """
+         fraction_cols = [f'Component{i+1}_fraction' for i in range(5)]
+         property_cols = [f'Component{i+1}_Property2' for i in range(5)]
+         required_cols = fraction_cols + property_cols
+
+         if isinstance(data, pd.DataFrame):
+             try:
+                 features = data[required_cols]
+             except KeyError as e:
+                 missing_col = str(e).split("'")[1]
+                 raise ValueError(f"Input DataFrame is missing required column: {missing_col}") from e
+
+         elif isinstance(data, np.ndarray):
+             # Assume the NumPy array has columns in the specified order:
+             # fractions (0-4), Property1 (5-9), Property2 (10-14)
+             if data.shape[1] < 15:  # Need at least 5 fractions, 5 Property1, and 5 Property2
+                 raise ValueError("Input NumPy array must have at least 15 columns for this transformation.")
+
+             features = np.concatenate([data[:, :5], data[:, 10:15]], axis=1)
+
+         else:
+             raise TypeError("Input data must be a pandas DataFrame or a numpy array.")
+
+         return features.values if isinstance(features, pd.DataFrame) else features
+
+     # Properties 3, 4, 8 and 9 are predicted by the TabPFN ensemble, so they
+     # have no standalone transforms.
+     def _transform3(self, data): return None
+
+     def _transform4(self, data): return None
+
+     def _transform5(self, data):
+         """
+         Transforms input data (DataFrame or NumPy array) to features for BlendProperty5 prediction.
+
+         Args:
+             data: pandas DataFrame or numpy array.
+
+         Returns:
+             numpy array of transformed features.
+         """
+         fraction_cols = [f'Component{i+1}_fraction' for i in range(5)]
+         property_cols = [f'Component{i+1}_Property5' for i in range(5)]
+         required_cols = fraction_cols + property_cols
+
+         if isinstance(data, pd.DataFrame):
+             try:
+                 features = data[required_cols]
+             except KeyError as e:
+                 missing_col = str(e).split("'")[1]
+                 raise ValueError(f"Input DataFrame is missing required column: {missing_col}") from e
+
+         elif isinstance(data, np.ndarray):
+             # Assume the NumPy array has columns in the specified order:
+             # fractions (0-4), then Property1 (5-9), ..., Property5 (25-29)
+             if data.shape[1] < 30:  # Need at least 5 fractions and 5 values for each of Property1-5
+                 raise ValueError("Input NumPy array must have at least 30 columns for this transformation.")
+
+             features = np.concatenate([data[:, :5], data[:, 25:30]], axis=1)
+
+         else:
+             raise TypeError("Input data must be a pandas DataFrame or a numpy array.")
+
+         return features.values if isinstance(features, pd.DataFrame) else features
+
+     def _transform6(self, data):
+         """
+         Transforms input data (DataFrame or NumPy array) to features for BlendProperty6 prediction.
+
+         Args:
+             data: pandas DataFrame or numpy array.
+
+         Returns:
+             numpy array of transformed features.
+         """
+         fraction_cols = [f'Component{i+1}_fraction' for i in range(5)]
+         property_cols = [f'Component{i+1}_Property6' for i in range(5)]
+         required_cols = fraction_cols + property_cols
+
+         if isinstance(data, pd.DataFrame):
+             try:
+                 features = data[required_cols]
+             except KeyError as e:
+                 missing_col = str(e).split("'")[1]
+                 raise ValueError(f"Input DataFrame is missing required column: {missing_col}") from e
+
+         elif isinstance(data, np.ndarray):
+             # Assume the NumPy array has columns in the specified order:
+             # fractions (0-4), then Property1 (5-9), ..., Property6 (30-34)
+             if data.shape[1] < 35:  # Need at least 5 fractions and 5 values for each of Property1-6
+                 raise ValueError("Input NumPy array must have at least 35 columns for this transformation.")
+
+             features = np.concatenate([data[:, :5], data[:, 30:35]], axis=1)
+
+         else:
+             raise TypeError("Input data must be a pandas DataFrame or a numpy array.")
+
+         return features.values if isinstance(features, pd.DataFrame) else features
+
+     def _transform7(self, df: pd.DataFrame) -> pd.DataFrame:
+         """
+         Transformation function for BlendProperty7 prediction.
+
+         Args:
+             df: Input DataFrame containing the features.
+
+         Returns:
+             DataFrame with generated features for BlendProperty7 prediction.
+         """
+         tn = 7
+         fn = tn
+
+         property_tn = [f'Component{i+1}_Property{fn}' for i in range(5)]
+         fraction_cols = [f'Component{i+1}_fraction' for i in range(5)]
+
+         # Generate mixture features (reset the index so concatenation aligns)
+         df_prop7 = df[fraction_cols + property_tn].reset_index(drop=True)
+         mixture_features = self.generate_mixture_features(df_prop7)
+
+         # Columns to concatenate: all ComponentX_PropertyY where Y != 7
+         other_property_cols = [f"Component{i}_Property{j}" for j in range(1, 11) for i in range(1, 6) if j != 7]
+
+         try:
+             other_features_df = df.loc[:, other_property_cols].reset_index(drop=True)
+         except KeyError as e:
+             missing_col = str(e).split("'")[1]
+             raise ValueError(f"Input DataFrame for _transform7 is missing required column: {missing_col}") from e
+
+         # Concatenate along columns (axis=1); indices are aligned after resetting.
+         combined_features = pd.concat([mixture_features, other_features_df], axis=1)
+
+         return combined_features
+
+     def _transform8(self, row): return None
+     def _transform9(self, row): return None
+
+     def _transform10(self, data):
+         """
+         Transforms input data (DataFrame or NumPy array) to features for BlendProperty10 prediction.
+
+         If input is a DataFrame, selects 'ComponentX_fraction' (X=1-5) and 'ComponentX_Property10' (X=1-5).
+         If input is a NumPy array, assumes the columns are already in the correct order:
+         Component1-5_fraction, Component1-5_Property1, Component1-5_Property2, ..., Component1-5_Property10
+         and selects the relevant columns for Property10.
+
+         Args:
+             data: pandas DataFrame or numpy array.
+
+         Returns:
+             numpy array of transformed features.
+         """
+         fraction_cols = [f'Component{i+1}_fraction' for i in range(5)]
+         property_cols = [f'Component{i+1}_Property10' for i in range(5)]
+         required_cols = fraction_cols + property_cols
+
+         if isinstance(data, pd.DataFrame):
+             try:
+                 features = data[required_cols]
+             except KeyError as e:
+                 missing_col = str(e).split("'")[1]
+                 raise ValueError(f"Input DataFrame is missing required column: {missing_col}") from e
+
+         elif isinstance(data, np.ndarray):
+             # Assume the NumPy array has columns in the specified order:
+             # fractions (0-4), then Property1 (5-9), ..., Property10 (50-54)
+             if data.shape[1] < 55:  # Need at least 5 fractions and 5 values for each of Property1-10
+                 raise ValueError("Input NumPy array must have at least 55 columns for this transformation.")
+
+             features = np.concatenate([data[:, :5], data[:, 50:55]], axis=1)
+
+         else:
+             raise TypeError("Input data must be a pandas DataFrame or a numpy array.")
+
+         return features.values if isinstance(features, pd.DataFrame) else features
+
+     def generate_mixture_features(self, data):
+         """
+         Generate symmetric and weighted nonlinear interactions between fuel weights and properties.
+         The input 'data' should contain weights in the first 5 columns/elements and properties in the next 5.
+
+         :param data: np.ndarray, pd.DataFrame, or list of shape (n_samples, 10) or (10,)
+         :return: pd.DataFrame with generated features.
+         """
+         # Convert input to numpy array and handle single row/list input
+         if isinstance(data, pd.DataFrame):
+             data_array = data.values
+         elif isinstance(data, list):
+             data_array = np.array(data)
+         elif isinstance(data, np.ndarray):
+             data_array = data
+         else:
+             raise TypeError("Input data must be a pandas DataFrame, numpy array, or list.")
+
+         # Reshape single row/list input to 2D array
+         if data_array.ndim == 1:
+             data_array = data_array.reshape(1, -1)
+
+         # Ensure the input has 10 columns (5 weights + 5 properties)
+         if data_array.shape[1] != 10:
+             raise ValueError("Input data must have 10 columns/elements (5 weights and 5 properties).")
+
+         # Separate weights and properties
+         W = data_array[:, :5]
+         P = data_array[:, 5:]
+
+         n_samples, n_fuels = W.shape
+         features = {}
+
+         # Original weights and properties
+         for i in range(n_fuels):
+             features[f'w{i+1}'] = W[:, i]
+             features[f'p{i+1}'] = P[:, i]
+             features[f'w{i+1}_p{i+1}'] = W[:, i] * P[:, i]  # weighted property
+
+         # --- 1. Weighted sum of properties ---
+         features['weighted_sum'] = np.sum(W * P, axis=1)
+
+         # --- 2. Weighted square of properties ---
+         features['weighted_sum_sq'] = np.sum(W * P**2, axis=1)
+
+         # --- 3. Weighted tanh of properties ---
+         features['weighted_tanh'] = np.sum(W * np.tanh(P), axis=1)
+
+         # --- 4. Weighted exponential (clip P to avoid overflow) ---
+         safe_exp = np.exp(np.clip(P, a_min=None, a_max=50))  # 50 is a safe upper bound
+         features['weighted_exp'] = np.sum(W * safe_exp, axis=1)
+
+         # --- 5. Weighted logarithm (clip to avoid -inf) ---
+         features['weighted_log'] = np.sum(W * np.log(np.clip(P, 1e-6, None)), axis=1)
+
+         # --- 6. Pairwise interactions (symmetric, weighted) ---
+         for i, j in combinations(range(n_fuels), 2):
+             pij = P[:, i] * P[:, j]
+             wij = W[:, i] * W[:, j]
+             features[f'pair_p{i+1}p{j+1}'] = pij
+             features[f'weighted_pair_p{i+1}p{j+1}'] = pij * wij
+
+         # --- 7. Triple interactions (weighted & symmetric) ---
+         for i, j, k in combinations(range(n_fuels), 3):
+             pijk = P[:, i] * P[:, j] * P[:, k]
+             wijk = W[:, i] * W[:, j] * W[:, k]
+             features[f'triplet_p{i+1}{j+1}{k+1}'] = pijk
+             features[f'weighted_triplet_p{i+1}{j+1}{k+1}'] = pijk * wijk
+
+         # --- 8. Power series, weight modulated ---
+         for power in [2, 3, 4]:
+             features[f'power_sum_{power}'] = np.sum(W * P**power, axis=1)
+
+         # --- 9. Log-weighted property (prevent log(0)) ---
+         logW = np.log(np.clip(W, 1e-6, None))
+         features['log_weighted_p'] = np.sum(logW * P, axis=1)
+
+         # --- 10. Elementary symmetric polynomial combinations, up to degree 5 (one per fuel) ---
+         for r in range(1, 6):
+             key = f'e_sym_poly_r{r}'
+             val = np.zeros(n_samples)
+             for idx in combinations(range(n_fuels), r):
+                 val += np.prod(P[:, idx], axis=1)
+             features[key] = val
+
+         # --- 11. Weighted interaction difference (symmetry in differences) ---
+         for i, j in combinations(range(n_fuels), 2):
+             diff = P[:, i] - P[:, j]
+             wdiff = W[:, i] * W[:, j]
+             features[f'weighted_diff_p{i+1}{j+1}'] = diff * wdiff
+
+         # --- 12. Mean, max, min (weighted) ---
+         total_weight = np.sum(W, axis=1, keepdims=True)
+         weighted_mean = np.sum(W * P, axis=1) / np.clip(total_weight.squeeze(), 1e-6, None)
+         features['weighted_mean'] = weighted_mean
+         features['max_prop'] = np.max(P, axis=1)
+         features['min_prop'] = np.min(P, axis=1)
+
+         # --- 13. Weighted cross-log terms ---
+         for i, j in combinations(range(n_fuels), 2):
+             log_mix = np.log(np.clip(P[:, i] + P[:, j], 1e-6, None))
+             wij = W[:, i] * W[:, j]
+             features[f'logsum_p{i+1}{j+1}'] = log_mix * wij
+
+         # --- 14. Weighted inverse ---
+         features['inv_prop_sum'] = np.sum(W / np.clip(P, 1e-6, None), axis=1)
+
+         # --- 15. Weighted relu (max(p, 0)) ---
+         relu = np.maximum(P, 0)
+         features['weighted_relu'] = np.sum(W * relu, axis=1)
+
+         # --- 16. Weighted sin/cos transforms ---
+         features['weighted_sin'] = np.sum(W * np.sin(P), axis=1)
+         features['weighted_cos'] = np.sum(W * np.cos(P), axis=1)
+
+         # --- 17. Normalized properties ---
+         prop_sum = np.sum(P, axis=1, keepdims=True)
+         normalized_P = P / np.clip(prop_sum, 1e-6, None)
+         for i in range(n_fuels):
+             features[f'norm_p{i+1}'] = normalized_P[:, i]
+
+         # --- 18. Product of all p's and all w's ---
+         features['total_product_p'] = np.prod(P, axis=1)
+         features['total_product_w'] = np.prod(W, axis=1)
+
+         # --- 19. Mixed entropic form (disabled) ---
+         # entropy_like = -np.sum(W * np.log(np.clip(W, 1e-6, None)), axis=1)
+         # features['entropy_weights'] = entropy_like
+
+         # Convert to DataFrame
+         df = pd.DataFrame(features)
+
+         return df
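Taken together, EagleBlendPredictor is the high-level entry point; a minimal sketch, assuming Models/ was populated by download_models.py and test.csv is a placeholder CSV with the Component{i}_fraction and Component{i}_Property{j} columns the transforms expect:

    import pandas as pd
    from predictor import EagleBlendPredictor

    df = pd.read_csv("test.csv")  # placeholder input file
    blender = EagleBlendPredictor(model_sources="./Models")
    preds = blender.predict_all(df)  # columns BlendProperty1..BlendProperty10
    preds.to_csv("blend_predictions.csv", index=False)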
requirements.txt CHANGED
@@ -1,3 +1,13 @@
- altair
- pandas
- streamlit
+ tabpfn-extensions @ git+https://github.com/PriorLabs/tabpfn-extensions.git@16e0e4f4305a3546eab5be6ebf163ff41bd3843d
+ scikit-learn==1.5.1
+ huggingface_hub
+ autogluon
+ tabpfn==2.0.9
+ streamlit==1.43.0
+ numpy==1.26.4
+ pandas==2.2.3
+ matplotlib==3.10.0
+ matplotlib-inline==0.1.7
+ seaborn==0.13.2
+ torch @ https://download.pytorch.org/whl/cu124/torch-2.6.0%2Bcu124-cp311-cp311-linux_x86_64.whl
+ setuptools
setup.sh ADDED
@@ -0,0 +1,2 @@
+ #!/bin/bash
+ python download_models.py
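If invoked manually rather than by the hosting platform, the script may first need execute permission:

    chmod +x setup.sh
    ./setup.sh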
streamlit_app.py ADDED
@@ -0,0 +1,15 @@
+ # streamlit_app.py
+ import streamlit as st
+ import pkg_resources
+
+ st.title("📦 Installed Python Modules")
+
+ # Get all installed packages
+ packages = sorted(
+     [(d.project_name, d.version) for d in pkg_resources.working_set],
+     key=lambda x: x[0].lower()
+ )
+
+ # Display them
+ for name, version in packages:
+     st.write(f"{name} — {version}")