In [1]:
import autogluon
import autogluon as ag
from autogluon import TabularPrediction as task
import pandas as pd
from sklearn.model_selection import train_test_split
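Note: before running the cells below, please download the dataset legendu/avg_score_after_round3_features
from Kaggle
and place it in a directory named avg_score_after_round3_features/ inside the working directory (the next cell reads it from that relative path).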
In [3]:
# Read the parquet dataset stored at the given (relative) directory path.
df = pd.read_parquet("avg_score_after_round3_features/")
# Positional slice: skip the first four columns and keep the remaining ones.
modeling_frame = df.iloc[:, 4:]
# Hold out 40% of the rows as the test split; the fixed seed makes the split repeatable.
train, test = train_test_split(
    modeling_frame,
    test_size=0.4,
    random_state=119,
)
In [4]:
# (rows, columns) of the training split.
train.shape
Out[4]:
(553608, 35)
In [5]:
# (rows, columns) of the held-out test split.
test.shape
Out[5]:
(369072, 35)
In [6]:
# Wrap both splits in AutoGluon's tabular Dataset type (training split first, then test).
train_data, test_data = (task.Dataset(df=split) for split in (train, test))
In [ ]:
# Fit on the training split to predict the "avg_score_after_round3" column;
# the trained models are written under the "auto_gluon" directory.
model = task.fit(
    train_data=train_data,
    label="avg_score_after_round3",
    output_directory="auto_gluon",
)
Beginning AutoGluon training ... AutoGluon will save models to auto_gluon/ Train Data Rows: 553608 Train Data Columns: 35 Preprocessing data ... Here are the first 10 unique label values in your data: [ 5.41576689 0.59051321 3.5933087 0.22835347 2.58887605 0.03626061 15.30469829 4.38147371 4.57907499 9.85850969] AutoGluon infers your prediction problem is: regression (because dtype of label-column == float and many unique label-values observed) If this is wrong, please specify `problem_type` argument in fit() instead (You may specify problem_type as one of: ['binary', 'multiclass', 'regression']) Feature Generator processed 553608 data points with 34 features Original Features: int features: 11 float features: 23 Generated Features: int features: 0 All Features: int features: 11 float features: 23 Data preprocessing and feature engineering runtime = 1.72s ... AutoGluon will gauge predictive performance using evaluation metric: root_mean_squared_error To change this, specify the eval_metric argument of fit() AutoGluon will early stop models using evaluation metric: root_mean_squared_error /usr/lib/python3.7/imp.py:342: DeprecationWarning: Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated, and in 3.8 it will stop working return _load(spec) Fitting model: RandomForestRegressorMSE ... -1.111 = Validation root_mean_squared_error score 153.25s = Training runtime 0.55s = Validation runtime
In [13]:
# Print the summary of fit(); the value returned by the call is also echoed as the cell output.
model.fit_summary()
*** Summary of fit() *** Number of models trained: 9 Types of models trained: {'LGBModel', 'CatboostModel', 'KNNModel', 'WeightedEnsembleModel', 'TabularNeuralNetModel', 'RFModel'} Validation performance of individual models: {'RandomForestRegressorMSE': -1.0657452978721853, 'ExtraTreesRegressorMSE': -0.9910711261197889, 'KNeighborsRegressorUnif': -1.7777866717734974, 'KNeighborsRegressorDist': -1.5954465432398592, 'LightGBMRegressor': -1.06333423182554, 'CatboostRegressor': -1.0040615551029053, 'NeuralNetRegressor': -0.9160444564621223, 'LightGBMRegressorCustom': -1.038906098211013, 'weighted_ensemble_k0_l1': -0.8812506880621533} Best model (based on validation performance): weighted_ensemble_k0_l1 Hyperparameter-tuning used: False Bagging used: False Stack-ensembling used: False User-specified hyperparameters: {'NN': {'num_epochs': 500}, 'GBM': {'num_boost_round': 10000}, 'CAT': {'iterations': 10000}, 'RF': {'n_estimators': 300}, 'XT': {'n_estimators': 300}, 'KNN': {}, 'custom': ['GBM']} Plot summary of models saved to file: SummaryOfModels.html *** End of fit() summary ***
/usr/lib/python3.7/imp.py:342: DeprecationWarning: Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated, and in 3.8 it will stop working return _load(spec) /usr/local/lib/python3.7/dist-packages/autogluon/utils/plots.py:133: UserWarning: AutoGluon summary plots cannot be created because bokeh is not installed. Please do: "pip install bokeh" warnings.warn('AutoGluon summary plots cannot be created because bokeh is not installed. Please do: "pip install bokeh"')
Out[13]:
{'model_types': {'RandomForestRegressorMSE': 'RFModel', 'ExtraTreesRegressorMSE': 'RFModel', 'KNeighborsRegressorUnif': 'KNNModel', 'KNeighborsRegressorDist': 'KNNModel', 'LightGBMRegressor': 'LGBModel', 'CatboostRegressor': 'CatboostModel', 'NeuralNetRegressor': 'TabularNeuralNetModel', 'LightGBMRegressorCustom': 'LGBModel', 'weighted_ensemble_k0_l1': 'WeightedEnsembleModel'}, 'model_performance': {'RandomForestRegressorMSE': -1.0657452978721853, 'ExtraTreesRegressorMSE': -0.9910711261197889, 'KNeighborsRegressorUnif': -1.7777866717734974, 'KNeighborsRegressorDist': -1.5954465432398592, 'LightGBMRegressor': -1.06333423182554, 'CatboostRegressor': -1.0040615551029053, 'NeuralNetRegressor': -0.9160444564621223, 'LightGBMRegressorCustom': -1.038906098211013, 'weighted_ensemble_k0_l1': -0.8812506880621533}, 'model_best': 'weighted_ensemble_k0_l1', 'model_paths': {'RandomForestRegressorMSE': 'auto_gluon/models/RandomForestRegressorMSE/', 'ExtraTreesRegressorMSE': 'auto_gluon/models/ExtraTreesRegressorMSE/', 'KNeighborsRegressorUnif': 'auto_gluon/models/KNeighborsRegressorUnif/', 'KNeighborsRegressorDist': 'auto_gluon/models/KNeighborsRegressorDist/', 'LightGBMRegressor': 'auto_gluon/models/LightGBMRegressor/', 'CatboostRegressor': 'auto_gluon/models/CatboostRegressor/', 'NeuralNetRegressor': 'auto_gluon/models/NeuralNetRegressor/', 'LightGBMRegressorCustom': 'auto_gluon/models/LightGBMRegressorCustom/', 'weighted_ensemble_k0_l1': 'auto_gluon/models/weighted_ensemble_k0_l1/'}, 'model_fit_times': {'RandomForestRegressorMSE': 152.92721271514893, 'ExtraTreesRegressorMSE': 100.98368000984192, 'KNeighborsRegressorUnif': 19.484992265701294, 'KNeighborsRegressorDist': 19.212828636169434, 'LightGBMRegressor': 13.707395076751709, 'CatboostRegressor': 799.2617483139038, 'NeuralNetRegressor': 4635.101431131363, 'LightGBMRegressorCustom': 21.870562076568604, 'weighted_ensemble_k0_l1': 0.795548677444458}, 'model_pred_times': {'RandomForestRegressorMSE': 0.5544726848602295, 
'ExtraTreesRegressorMSE': 0.738731861114502, 'KNeighborsRegressorUnif': 0.15466761589050293, 'KNeighborsRegressorDist': 0.13497662544250488, 'LightGBMRegressor': 0.02784132957458496, 'CatboostRegressor': 0.04940915107727051, 'NeuralNetRegressor': 4.679487466812134, 'LightGBMRegressorCustom': 0.03708219528198242, 'weighted_ensemble_k0_l1': 0.0013790130615234375}, 'num_bagging_folds': 0, 'stack_ensemble_levels': 0, 'feature_prune': False, 'hyperparameter_tune': False, 'hyperparameters_userspecified': {'NN': {'num_epochs': 500}, 'GBM': {'num_boost_round': 10000}, 'CAT': {'iterations': 10000}, 'RF': {'n_estimators': 300}, 'XT': {'n_estimators': 300}, 'KNN': {}, 'custom': ['GBM']}, 'model_hyperparams': {'RandomForestRegressorMSE': {'model_type': 'rf', 'n_estimators': 300, 'n_jobs': -1, 'criterion': 'mse'}, 'ExtraTreesRegressorMSE': {'model_type': 'xt', 'n_estimators': 300, 'n_jobs': -1, 'criterion': 'mse'}, 'KNeighborsRegressorUnif': {'weights': 'uniform', 'n_jobs': -1}, 'KNeighborsRegressorDist': {'weights': 'distance', 'n_jobs': -1}, 'LightGBMRegressor': {'num_boost_round': 10000, 'num_threads': -1, 'objective': 'regression', 'metric': 'regression', 'verbose': -1, 'boosting_type': 'gbdt', 'two_round': True}, 'CatboostRegressor': {'iterations': 10000, 'learning_rate': 0.1, 'random_seed': 0, 'eval_metric': <autogluon.utils.tabular.ml.models.catboost.catboost_utils.RegressionCustomMetric at 0x7f3500c8eb00>}, 'NeuralNetRegressor': {'num_epochs': 500, 'seed_value': None, 'proc.embed_min_categories': 4, 'proc.impute_strategy': 'median', 'proc.max_category_levels': 100, 'proc.skew_threshold': 0.99, 'network_type': 'widedeep', 'layers': [256, 128], 'numeric_embed_dim': 420, 'activation': 'relu', 'max_layer_width': 2056, 'embedding_size_factor': 1.0, 'embed_exponent': 0.56, 'max_embedding_dim': 100, 'y_range': (0, 57.325798296928404), 'y_range_extend': 0.05, 'use_batchnorm': True, 'dropout_prob': 0.1, 'batch_size': 512, 'loss_function': L1Loss(batch_axis=0, w=None), 
'optimizer': 'adam', 'learning_rate': 0.0003, 'weight_decay': 1e-06, 'clip_gradient': 100.0, 'momentum': 0.9, 'epochs_wo_improve': 20, 'num_dataloading_workers': 20, 'ctx': cpu(0)}, 'LightGBMRegressorCustom': {'num_boost_round': 10000, 'num_threads': -1, 'objective': 'regression', 'metric': 'regression', 'verbose': -1, 'boosting_type': 'gbdt', 'two_round': True, 'learning_rate': 0.03, 'num_leaves': 128, 'feature_fraction': 0.9, 'min_data_in_leaf': 5, 'seed_value': 0}, 'weighted_ensemble_k0_l1': {'max_models': 25, 'max_models_per_type': 5}}}
In [15]:
# Per-model leaderboard: validation score, fit time, validation prediction time and stack level.
model.leaderboard()
model score_val fit_time pred_time_val stack_level 8 weighted_ensemble_k0_l1 -0.881251 0.795549 0.001379 1 6 NeuralNetRegressor -0.916044 4635.101431 4.679487 0 1 ExtraTreesRegressorMSE -0.991071 100.983680 0.738732 0 5 CatboostRegressor -1.004062 799.261748 0.049409 0 7 LightGBMRegressorCustom -1.038906 21.870562 0.037082 0 4 LightGBMRegressor -1.063334 13.707395 0.027841 0 0 RandomForestRegressorMSE -1.065745 152.927213 0.554473 0 3 KNeighborsRegressorDist -1.595447 19.212829 0.134977 0 2 KNeighborsRegressorUnif -1.777787 19.484992 0.154668 0
Out[15]:
model | score_val | fit_time | pred_time_val | stack_level | |
---|---|---|---|---|---|
8 | weighted_ensemble_k0_l1 | -0.881251 | 0.795549 | 0.001379 | 1 |
6 | NeuralNetRegressor | -0.916044 | 4635.101431 | 4.679487 | 0 |
1 | ExtraTreesRegressorMSE | -0.991071 | 100.983680 | 0.738732 | 0 |
5 | CatboostRegressor | -1.004062 | 799.261748 | 0.049409 | 0 |
7 | LightGBMRegressorCustom | -1.038906 | 21.870562 | 0.037082 | 0 |
4 | LightGBMRegressor | -1.063334 | 13.707395 | 0.027841 | 0 |
0 | RandomForestRegressorMSE | -1.065745 | 152.927213 | 0.554473 | 0 |
3 | KNeighborsRegressorDist | -1.595447 | 19.212829 | 0.134977 | 0 |
2 | KNeighborsRegressorUnif | -1.777787 | 19.484992 | 0.154668 | 0 |
In [16]:
# Introspection: list the attribute and method names available on the object returned by task.fit.
dir(model)
Out[16]:
['__abstractmethods__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__slots__', '__str__', '__subclasshook__', '__weakref__', '_abc_impl', '_createResults', '_format_results', '_learner', '_save_model', '_save_results', '_summarize', '_trainer', 'class_labels', 'eval_metric', 'evaluate', 'evaluate_predictions', 'feature_types', 'fit_summary', 'label_column', 'leaderboard', 'load', 'model_names', 'model_performance', 'output_directory', 'predict', 'predict_proba', 'problem_type', 'save']
Load Saved Models¶
The trained models are automatically saved to disk and can be loaded back into memory.
In [10]:
# Reload from "auto_gluon", the same directory that was passed as output_directory to task.fit.
model2 = task.load("auto_gluon")
In [11]:
# Leaderboard of the reloaded object, for comparison with the one shown right after training.
model2.leaderboard()
model score_val fit_time pred_time_val stack_level 8 weighted_ensemble_k0_l1 -0.881251 0.795549 0.001379 1 6 NeuralNetRegressor -0.916044 4635.101431 4.679487 0 1 ExtraTreesRegressorMSE -0.991071 100.983680 0.738732 0 5 CatboostRegressor -1.004062 799.261748 0.049409 0 7 LightGBMRegressorCustom -1.038906 21.870562 0.037082 0 4 LightGBMRegressor -1.063334 13.707395 0.027841 0 0 RandomForestRegressorMSE -1.065745 152.927213 0.554473 0 3 KNeighborsRegressorDist -1.595447 19.212829 0.134977 0 2 KNeighborsRegressorUnif -1.777787 19.484992 0.154668 0
Out[11]:
model | score_val | fit_time | pred_time_val | stack_level | |
---|---|---|---|---|---|
8 | weighted_ensemble_k0_l1 | -0.881251 | 0.795549 | 0.001379 | 1 |
6 | NeuralNetRegressor | -0.916044 | 4635.101431 | 4.679487 | 0 |
1 | ExtraTreesRegressorMSE | -0.991071 | 100.983680 | 0.738732 | 0 |
5 | CatboostRegressor | -1.004062 | 799.261748 | 0.049409 | 0 |
7 | LightGBMRegressorCustom | -1.038906 | 21.870562 | 0.037082 | 0 |
4 | LightGBMRegressor | -1.063334 | 13.707395 | 0.027841 | 0 |
0 | RandomForestRegressorMSE | -1.065745 | 152.927213 | 0.554473 | 0 |
3 | KNeighborsRegressorDist | -1.595447 | 19.212829 | 0.134977 | 0 |
2 | KNeighborsRegressorUnif | -1.777787 | 19.484992 | 0.154668 | 0 |
Further Research¶
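It is strange that ExtraTreesRegressorMSE and RandomForestRegressorMSE generate such huge models on disk (about 20G and 13G respectively, out of 32G in total, according to the `du` output below). TODO: check to see what happened.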
In [20]:
# Disk usage of each model sub-directory (-h: human-readable sizes, -d 1: one level deep,
# -l: count hard-linked files each time they appear).
!du -lhd 1 auto_gluon/models
1.3M auto_gluon/models/LightGBMRegressor 100K auto_gluon/models/weighted_ensemble_k0_l1 20G auto_gluon/models/ExtraTreesRegressorMSE 311M auto_gluon/models/KNeighborsRegressorDist 311M auto_gluon/models/KNeighborsRegressorUnif 13G auto_gluon/models/RandomForestRegressorMSE 4.3M auto_gluon/models/LightGBMRegressorCustom 3.9M auto_gluon/models/NeuralNetRegressor 1.8M auto_gluon/models/CatboostRegressor 32G auto_gluon/models
In [ ]: