import pandas as pd
# Render floats with two decimals in every DataFrame/Series display.
pd.options.display.float_format = '{:.2f}'.format

# Load the used-car listings; `posting_date` is parsed as a datetime column.
df_uc = pd.read_csv(
    'data/used_car.csv',
    parse_dates=['posting_date'],
)

# (rows, columns) of the raw data set.
df_uc.shape

(426812, 17)

# Peek at three randomly chosen rows (no seed is set, so the rows differ between runs).
df_uc.sample(3)

# Column dtypes of the loaded frame.
df_uc.dtypes

id                       int64
region                  object
price                    int64
year                   float64
manufacturer            object
model                   object
condition               object
cylinders               object
fuel                    object
odometer               float64
transmission            object
VIN                     object
drive                   object
type                    object
paint_color             object
state                   object
posting_date    datetime64[ns]
dtype: object

# Number of rows that are exact duplicates of an earlier row once the `id`
# column is ignored, shown as a string.
str(df_uc.drop(columns=['id']).duplicated().sum())

'4222'

# Discard the `id` column, then keep only the first occurrence of each
# otherwise-identical row.
df_uc = df_uc.drop(columns=['id'])
df_uc = df_uc.drop_duplicates(keep='first')

# Summary statistics for the non-string columns (integers, floats, datetime).
df_uc.describe(exclude=['object'])

# Plausibility filters, applied together:
#   1. price strictly below 100000
#   2. year 2000 or later
#   3. odometer at most 200000
# NOTE: each comparison evaluates to False for NaN, so rows with a missing
# `year` or `odometer` are removed here as well.
plausible = (
    df_uc['price'].lt(100000)
    & df_uc['year'].ge(2000)
    & df_uc['odometer'].le(200000)
)
df_uc = df_uc.loc[plausible]

# Summary statistics for the string columns only.
df_uc.describe(include=['object'])

from util import plot_univariate

# Columns to plot, in the order they are handed to the plotting helper.
univariate_columns = [
    'manufacturer', 'state', 'price', 'year', 'odometer',
    'condition', 'fuel', 'type', 'drive', 'transmission',
    'cylinders', 'paint_color',
]

# One univariate plot per column on a 4-column grid. `price` and `odometer`
# are flagged in `log_dict` (presumably to request a log scale - confirm in
# util.plot_univariate).
plot_univariate(
    df_uc,
    columns=univariate_columns,
    log_dict={'price': True, 'odometer': True},
    ncols=4,
    size=(20, 15),
    wspace=.3,
)

# Keep only listings priced above 1000.
df_uc = df_uc[df_uc['price'].gt(1000)]

# Split the frame into the feature matrix (every column except `price`)
# and the regression target (`price`).
X, y = df_uc.drop(columns=['price']), df_uc['price']

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible.
split = train_test_split(X, y, test_size=0.2, random_state=333)
X_train, X_test, y_train, y_test = split

# Missing values per feature in the training set.
X_train.isna().sum()

region               0
year                 0
manufacturer      7169
model             2011
condition       101347
cylinders       111302
fuel              1601
odometer             0
transmission      1087
VIN              90247
drive            79740
type             51882
paint_color      73975
state                0
posting_date         0
dtype: int64

from util import vin_to_year

# Fill missing model years from the VIN: the 10th VIN character (index 9) is
# looked up in `vin_to_year`.
# Rows without a VIN are excluded from the mask (the same guard the
# manufacturer fill uses), and `.str[9]` yields NaN for VINs shorter than ten
# characters, so neither case raises; such rows simply keep a missing year.
year_from_vin = X_train['year'].isnull() & X_train['VIN'].notnull()
X_train.loc[year_from_vin, 'year'] = (
    X_train.loc[year_from_vin, 'VIN'].str[9].map(vin_to_year))

from util import vin_to_manufacturer

# Fill missing manufacturers from the VIN: the first three VIN characters are
# looked up in `vin_to_manufacturer`. Only rows that actually have a VIN qualify.
needs_manufacturer = (
    X_train['manufacturer'].isnull() & X_train['VIN'].notnull())
vin_prefix = X_train.loc[needs_manufacturer, 'VIN'].map(lambda vin: vin[:3])
X_train.loc[needs_manufacturer, 'manufacturer'] = vin_prefix.map(
    vin_to_manufacturer)

from sklearn.impute import SimpleImputer

# `model`: replace missing values with the constant label 'unknown'.
model_imputer = SimpleImputer(strategy='constant', fill_value='unknown')
X_train[['model']] = model_imputer.fit_transform(X_train[['model']])

# `paint_color`: replace missing values with the most frequent colour found
# in the training data.
paint_color_imputer = SimpleImputer(strategy='most_frequent')
X_train[['paint_color']] = paint_color_imputer.fit_transform(
    X_train[['paint_color']])

from util import ConditionalImputer
from tqdm import tqdm

columns_to_impute = ['cylinders', 'fuel', 'transmission', 'drive']

# Every target column is imputed conditionally on the same two columns.
condition_cols = ['manufacturer', 'type']

for column in tqdm(columns_to_impute):
    # The imputer only sees the conditioning columns plus the target itself.
    relevant_cols = condition_cols + [column]

    # Presumably fills each gap with the most frequent value among rows that
    # share the same manufacturer/type - see util.ConditionalImputer.
    imputer = ConditionalImputer(
        target_col=column,
        condition_cols=list(condition_cols),
        strategy='most_frequent')
    imputed = imputer.fit_transform(X_train[relevant_cols])

    # Write back only the imputed target column.
    X_train[column] = imputed[column]

100%|██████████| 4/4 [01:10<00:00, 17.56s/it]

# Vehicle age: calendar year of the posting minus the model year.
X_train['age'] = X_train['posting_date'].dt.year - X_train['year']

from sklearn.impute import KNNImputer

# Fill any gaps in `age` and `odometer` from the 10 nearest neighbours,
# with distances measured on these two (unscaled) columns only.
numeric_cols = ['age', 'odometer']
knn_imputer = KNNImputer(n_neighbors=10)
X_train[numeric_cols] = knn_imputer.fit_transform(X_train[numeric_cols])

# Drop every row that still has a missing value in any column other than
# `VIN` and `condition` ...
required_cols = X_train.columns.difference(['VIN', 'condition'])
X_train = X_train.dropna(subset=required_cols)
# ... and keep the target aligned with the surviving rows.
y_train = y_train.loc[X_train.index]

# Remaining missing values per column.
X_train.isna().sum()

region              0
year                0
manufacturer        0
model               0
condition       99550
cylinders           0
fuel                0
odometer            0
transmission        0
VIN             86985
drive               0
type                0
paint_color         0
state               0
posting_date        0
age                 0
dtype: int64

# (rows, columns) of the training features after imputation and row removal.
X_train.shape

(259935, 16)

# Feature selection: remove the columns that are not used as model inputs.
# `year` and `posting_date` have already been combined into `age`.
X_train = X_train.drop(columns=[
    'condition', 'region', 'state', 'VIN', 'year', 'posting_date'])

# Binary flag: 1 when the model name contains "sport" (case-insensitive), else 0.
mentions_sport = X_train['model'].map(lambda name: 'sport' in name.lower())
X_train['sport'] = mentions_sport.astype(int)

# The free-text model name itself is not kept as a feature.
X_train = X_train.drop(columns=['model'])

from sklearn.preprocessing import OneHotEncoder

# Categorical features that are replaced by one-hot indicator columns.
categorical_cols = ['manufacturer', 'fuel', 'cylinders',
                    'paint_color', 'type', 'drive', 'transmission']

# Fit the encoder on the training data and encode it in one go.
# Categories seen fewer than 5 times are grouped as "infrequent"; with
# `handle_unknown='infrequent_if_exist'`, categories unseen during fit are
# mapped to that group (when it exists) at transform time.
encoder = OneHotEncoder(
    sparse_output=False,
    min_frequency=5,
    handle_unknown='infrequent_if_exist')
encoded_data = encoder.fit_transform(X_train[categorical_cols])

# Wrap the dense array in a DataFrame labelled with the generated names.
encoded_df = pd.DataFrame(
    encoded_data, columns=encoder.get_feature_names_out())

# Swap the raw categorical columns for their encoded counterparts.
# NOTE: the index of `X_train` is reset to 0..n-1 here, so from now on it
# matches `y_train` by row position only, not by index label.
remaining_features = X_train.drop(columns=categorical_cols)
X_train = pd.concat(
    [remaining_features.reset_index(drop=True), encoded_df],
    axis=1)

X_train.head(3)

from sklearn import tree

# A deliberately shallow regression tree: depth 2 and at least 1000 samples
# in every leaf.
model = tree.DecisionTreeRegressor(max_depth=2, min_samples_leaf=1000)
model.fit(X_train, y_train)

# Rendering options for the Graphviz (DOT) export of the fitted tree.
export_options = {
    # Label split nodes with the training column names.
    'feature_names': X_train.columns,
    # Colour the nodes; for a regressor the shade reflects the node's value.
    'filled': True,
    # Draw node boxes with rounded corners.
    'rounded': True,
    # Do not restrict the output to PostScript-compatible characters.
    'special_characters': True,
    # Show sample shares (percentages) instead of absolute counts.
    'proportion': True,
    # Leave the impurity out of each node.
    'impurity': False,
    # Print thresholds and values without decimal places.
    'precision': 0,
}
tree_plot = tree.export_graphviz(model, **export_options)

import graphviz

graphviz.Source(tree_plot)

from util import plot_predictions

# In-sample check of the shallow decision tree. Predictions are made on the
# training data, so the plot shows fit quality rather than generalisation.
y_pred = model.predict(X_train)
plot_predictions(y_train, y_pred)

from sklearn.ensemble import RandomForestRegressor

# Random forest: 100 bootstrapped trees of depth <= 20, each split choosing
# from sqrt(n_features) candidate features.
model = RandomForestRegressor(
    n_estimators=100,
    max_depth=20,
    bootstrap=True,
    max_features='sqrt',
    random_state=333)
model.fit(X_train, y_train)

# Again evaluated on the training data.
y_pred = model.predict(X_train)
plot_predictions(y_train, y_pred)

from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting: 100 stages of depth <= 20 trees with learning rate 0.1.
model = GradientBoostingRegressor(
    n_estimators=100,
    max_depth=20,
    learning_rate=0.1,
    max_features='sqrt',
    random_state=333)
model.fit(X_train, y_train)

# Again evaluated on the training data.
y_pred = model.predict(X_train)
plot_predictions(y_train, y_pred)

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Search space per model for the randomised hyper-parameter search.
# `randint(low, high)` samples integers from low up to, but not including,
# high - e.g. `randint(1, 20)` draws from 1..19.
param_grid = {
    'decision_tree': {
        'max_depth': randint(1, 20),
        'min_samples_leaf': randint(1, 20),
    },
    'random_forest': {
        'n_estimators': randint(100, 300),
        'max_depth': randint(1, 20),
        'min_samples_leaf': randint(1, 20),
        'max_features': ['sqrt', 'log2'],
        'bootstrap': [True, False],
    },
    'gradient_boosting': {
        'n_estimators': randint(100, 300),
        'max_depth': randint(1, 20),
        'min_samples_leaf': randint(1, 20),
        'max_features': ['sqrt', 'log2'],
        'learning_rate': [0.01, 0.1, 0.2, 0.3],
    }
}

# Base estimators, keyed like `param_grid`. All three are seeded: a decision
# tree also permutes features randomly at each split, so without a seed its
# search results would not be reproducible like those of the two ensembles.
models = {
    'decision_tree': tree.DecisionTreeRegressor(random_state=333),
    'random_forest': RandomForestRegressor(random_state=333),
    'gradient_boosting': GradientBoostingRegressor(random_state=333),
}

# Cross-validation results of the randomised search, keyed by model name.
results = {}

# A dedicated loop variable is used so the fitted estimator bound to `model`
# is not overwritten by a string key.
for model_name, estimator in models.items():

    # Randomised search: 20 sampled configurations x 3 folds = 60 fits per
    # model. Scores are negated RMSE, so higher (closer to 0) is better.
    # (The variable keeps its historical name although this is a random
    # search, not a grid search.)
    grid_search = RandomizedSearchCV(
        estimator=estimator,
        param_distributions=param_grid[model_name],
        n_iter=20,
        cv=3,
        scoring='neg_root_mean_squared_error',
        return_train_score=True,
        random_state=333,
        n_jobs=-1)

    # Run the search on the training data.
    grid_search.fit(X_train, y_train)

    # Keep the full CV table; the best configuration is extracted from it
    # when the results are reported.
    results[model_name] = grid_search.cv_results_

from util import print_best_models

# Report the best configuration found for each model.
for model_name in models:
    print_best_models(results, model_name)

Model: decision_tree 
Best score: 5709.03 RMSE 
Best parameters: {'max_depth': 17, 'min_samples_leaf': 2} 

Model: random_forest 
Best score: 6423.77 RMSE 
Best parameters: {'bootstrap': False, 'max_depth': 18, 'max_features': 'sqrt', 'min_samples_leaf': 16, 'n_estimators': 140} 

Model: gradient_boosting 
Best score: 4973.33 RMSE 
Best parameters: {'learning_rate': 0.2, 'max_depth': 15, 'max_features': 'sqrt', 'min_samples_leaf': 8, 'n_estimators': 251}

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from util import VINReplacer, ConditionalImputer, AgeCalculator, SportColumn, ColumnDropper, DataFrameSimpleImputer

def _passthrough_transformer(name, transformer, columns):
    """Apply `transformer` to `columns` only and pass all other columns through."""
    return ColumnTransformer(
        transformers=[(name, transformer, columns)],
        remainder='passthrough',
        verbose_feature_names_out=False)


def _conditional_imputer_step(target):
    """Build the named step that imputes `target` given manufacturer and type."""
    imputer = ConditionalImputer(
        target_col=target, condition_cols=['manufacturer', 'type'])
    return (f'{target}_imputer', imputer)


# Data Preprocessing: VIN-based fills, then the column-wise imputers, in the
# same order as the manual preprocessing.
preprocessing_steps = [
    ('vin_replacer', VINReplacer(
        vin_to_year=vin_to_year,
        vin_to_manufacturer=vin_to_manufacturer)),
    ('column_transformer_model', _passthrough_transformer(
        'model_imputer',
        SimpleImputer(strategy='constant', fill_value='unknown'),
        ['model'])),
    ('column_transformer_paint_color', _passthrough_transformer(
        'paint_color_imputer',
        SimpleImputer(strategy='most_frequent'),
        ['paint_color'])),
]
preprocessing_steps += [
    _conditional_imputer_step(target)
    for target in ('cylinders', 'fuel', 'transmission', 'drive')
]

# Feature Engineering: derived features, encoding, and final clean-up.
categorical_features = ['manufacturer', 'fuel', 'cylinders',
                        'paint_color', 'type', 'drive', 'transmission']
feature_engineering_steps = [
    ('age_calculator', AgeCalculator()),
    ('column_transformer_knn', _passthrough_transformer(
        'knn_imputer',
        KNNImputer(n_neighbors=10),
        ['age', 'odometer'])),
    ('sport_column', SportColumn()),
    ('column_transformer_encoder', _passthrough_transformer(
        'one_hot_encoder',
        OneHotEncoder(
            sparse_output=False,
            min_frequency=5,
            handle_unknown='infrequent_if_exist'),
        categorical_features)),
    ('column_dropper', ColumnDropper(
        columns=['VIN', 'year', 'posting_date', 'model'])),
    ('final_simple_imputer', DataFrameSimpleImputer(
        strategy='most_frequent')),
]

# Full ordered step list and the pipeline assembled from it.
pipeline_steps = preprocessing_steps + feature_engineering_steps
pipeline = Pipeline(steps=pipeline_steps)

pipeline

Pipeline(steps=[('vin_replacer',
                 VINReplacer(vin_to_manufacturer={'AA9': 'tr-tec',
                                                  'AAA': 'audi', 'AAK': 'faw',
                                                  'AAM': 'man', 'AAP': '',
                                                  'AAV': 'volkswagen',
                                                  'AAW': 'challenger-trailer',
                                                  'ABJ': 'mitsubishi',
                                                  'ABM': 'bmw',
                                                  'AC5': 'hyundai',
                                                  'ACV': 'isuzu',
                                                  'ADB': 'mercedes-benz',
                                                  'ADD': '',
                                                  'ADM': 'general-motors',
                                                  'ADN': 'nissan',
                                                  'ADR': 'renault',
                                                  'ADX': 'tata', 'AFA': '',
                                                  'AFB': 'maz...
                                                  OneHotEncoder(handle_unknown='infrequent_if_exist',
                                                                min_frequency=5,
                                                                sparse_output=False),
                                                  ['manufacturer', 'fuel',
                                                   'cylinders', 'paint_color',
                                                   'type', 'drive',
                                                   'transmission'])],
                                   verbose_feature_names_out=False)),
                ('column_dropper',
                 ColumnDropper(columns=['VIN', 'year', 'posting_date',
                                        'model'])),
                ('final_simple_imputer',
                 DataFrameSimpleImputer(strategy='most_frequent'))])

Pipeline(steps=[('vin_replacer',
                 VINReplacer(vin_to_manufacturer={'AA9': 'tr-tec',
                                                  'AAA': 'audi', 'AAK': 'faw',
                                                  'AAM': 'man', 'AAP': '',
                                                  'AAV': 'volkswagen',
                                                  'AAW': 'challenger-trailer',
                                                  'ABJ': 'mitsubishi',
                                                  'ABM': 'bmw',
                                                  'AC5': 'hyundai',
                                                  'ACV': 'isuzu',
                                                  'ADB': 'mercedes-benz',
                                                  'ADD': '',
                                                  'ADM': 'general-motors',
                                                  'ADN': 'nissan',
                                                  'ADR': 'renault',
                                                  'ADX': 'tata', 'AFA': '',
                                                  'AFB': 'maz...
                                                  OneHotEncoder(handle_unknown='infrequent_if_exist',
                                                                min_frequency=5,
                                                                sparse_output=False),
                                                  ['manufacturer', 'fuel',
                                                   'cylinders', 'paint_color',
                                                   'type', 'drive',
                                                   'transmission'])],
                                   verbose_feature_names_out=False)),
                ('column_dropper',
                 ColumnDropper(columns=['VIN', 'year', 'posting_date',
                                        'model'])),
                ('final_simple_imputer',
                 DataFrameSimpleImputer(strategy='most_frequent'))])

VINReplacer(vin_to_manufacturer={'AA9': 'tr-tec', 'AAA': 'audi', 'AAK': 'faw',
                                 'AAM': 'man', 'AAP': '', 'AAV': 'volkswagen',
                                 'AAW': 'challenger-trailer',
                                 'ABJ': 'mitsubishi', 'ABM': 'bmw',
                                 'AC5': 'hyundai', 'ACV': 'isuzu',
                                 'ADB': 'mercedes-benz', 'ADD': '',
                                 'ADM': 'general-motors', 'ADN': 'nissan',
                                 'ADR': 'renault', 'ADX': 'tata', 'AFA': '',
                                 'AFB': 'mazda', 'AFD': 'baic', 'AHH': 'hino',
                                 'A...
                                 'CN1': 'tr-tec', 'DF9/': 'laraki',
                                 'EBZ': 'nizhekotrans',
                                 'H0D': 'taizhou-qianxin-vehicle-co-ltd', ...},
            vin_to_year={'1': 2001, '2': 2002, '3': 2003, '4': 2004, '5': 2005,
                         '6': 2006, '7': 2007, '8': 2008, '9': 2009, 'A': 2010,
                         'B': 2011, 'C': 2012, 'D': 2013, 'E': 2014, 'F': 2015,
                         'G': 2016, 'H': 2017, 'J': 2018, 'K': 2019, 'L': 2020,
                         'M': 2021, 'N': 2022, 'P': 2023, 'R': 2024, 'S': 2025,
                         'T': 1996, 'V': 1997, 'W': 1998, 'X': 1999,
                         'Y': 2000})

ColumnTransformer(remainder='passthrough',
                  transformers=[('model_imputer',
                                 SimpleImputer(fill_value='unknown',
                                               strategy='constant'),
                                 ['model'])],
                  verbose_feature_names_out=False)

['model']

SimpleImputer(fill_value='unknown', strategy='constant')

passthrough

ColumnTransformer(remainder='passthrough',
                  transformers=[('paint_color_imputer',
                                 SimpleImputer(strategy='most_frequent'),
                                 ['paint_color'])],
                  verbose_feature_names_out=False)

['paint_color']

# The pipeline must be fitted on the RAW feature columns (`VIN`, `model`,
# `year`, `posting_date`, the categorical columns, ...). By this point
# `X_train` holds the manually preprocessed, one-hot-encoded matrix and
# `y_train` only its surviving rows, so both are rebuilt here with the same
# seeded split that was used originally.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=333)

# `condition`, `region` and `state` were excluded during the manual feature
# selection, but the pipeline's ColumnDropper does not list them. Drop them up
# front so no raw string column is passed through to the regressors.
unused_columns = ['condition', 'region', 'state']
X_train = X_train.drop(columns=unused_columns)
X_test = X_test.drop(columns=unused_columns)

# Learn every imputation/encoding parameter from the training data only ...
pipeline_fitted = pipeline.fit(X_train)

# ... then apply the fitted pipeline to both splits.
X_train_transformed = pipeline_fitted.transform(X_train)
X_test_transformed = pipeline_fitted.transform(X_test)

from sklearn.linear_model import LinearRegression

# Benchmark: ordinary least squares on the pipeline output, evaluated on the
# held-out test set.
model = LinearRegression()
model.fit(X_train_transformed, y_train)

y_pred_benchmark = model.predict(X_test_transformed)
plot_predictions(y_test, y_pred_benchmark)

# Final model: gradient boosting with the best hyper-parameters reported by
# the randomised search.
best_params = {
    'n_estimators': 251,
    'max_depth': 15,
    'learning_rate': 0.2,
    'max_features': 'sqrt',
    'min_samples_leaf': 8,
}
model = GradientBoostingRegressor(random_state=333, **best_params)
model.fit(X_train_transformed, y_train)

# Evaluated on the held-out test set as well.
y_pred = model.predict(X_test_transformed)
plot_predictions(y_test, y_pred)

# IPython shell escape: run `ml_fastapi.py` with the FastAPI CLI in development mode.
!fastapi dev ml_fastapi.py

# IPython shell escape: run the Streamlit app defined in `ml_streamlit.py`.
!streamlit run ml_streamlit.py

	id	region	price	year	manufacturer	model	condition	cylinders	fuel	odometer	transmission	VIN	drive	type	paint_color	state	posting_date
248122	7314143256	las vegas	33999	2015.00	ford	f150 supercrew cab	NaN	6 cylinders	gas	66812.00	automatic	1FTEW1CPXFKD59968	NaN	pickup	white	nv	2021-04-29
200197	7316290358	grand rapids	6900	2000.00	ford	f350	good	8 cylinders	diesel	293000.00	automatic	NaN	4wd	NaN	white	mi	2021-05-03
161659	7311899881	omaha / council bluffs	29592	NaN	NaN	n Frontier	NaN	6 cylinders	gas	4142.00	automatic	1N6DD0EV0KN871266	4wd	pickup	silver	ia	2021-04-24

	price	year	odometer	posting_date
count	422590.00	421475.00	418324.00	422590
mean	75774.14	2011.23	97993.36	2021-04-23 08:18:04.557609216
min	0.00	1900.00	0.00	2021-04-04 00:00:00
25%	5950.00	2008.00	37531.00	2021-04-17 00:00:00
50%	13988.00	2013.00	85377.00	2021-04-25 00:00:00
75%	26500.00	2017.00	133600.00	2021-05-01 00:00:00
max	3736928711.00	2022.00	10000000.00	2021-05-05 00:00:00
std	12243930.69	9.47	213704.62	NaN

	region	manufacturer	model	condition	cylinders	fuel	transmission	VIN	drive	type	paint_color	state
count	372280	361468	369065	218465	214072	370244	370764	247417	259796	297770	261213	372280
unique	404	41	23395	6	8	5	3	108744	3	13	12	51
top	columbus	ford	f-150	good	6 cylinders	gas	automatic	1FMJU1JT1HEA52352	4wd	sedan	white	ca
freq	3194	60453	7019	106911	83390	310918	293591	261	117001	80398	69925	43164

	odometer	age	...	type_sedan	type_van	drive_4wd	drive_fwd	transmission_automatic	transmission_manual
0	86000.00	8.00	...	1.00	0.00	1.00	0.00	0.00	1.00
1	52159.00	2.00	...	0.00	0.00	1.00	0.00	1.00	0.00
2	107862.00	6.00	...	0.00	1.00	0.00	1.00	1.00	0.00

Hyperparameters	Decision Tree	Random Forest	Gradient Boosting
`max_depth`	Maximum depth of the tree	Maximum depth of each tree	Maximum depth of each tree
`min_samples_split`	Minimum number of samples required to split an internal node	Minimum number of samples required to split an internal node	Minimum number of samples required to split an internal node
`min_samples_leaf`	Minimum number of samples required to be at a leaf node	Minimum number of samples required to be at a leaf node	Minimum number of samples required to be at a leaf node
`max_features`	Number of features to consider when looking for the best split	Number of features to consider when looking for the best split	Number of features to consider when looking for the best split
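`criterion`	Function to measure the quality of a split (e.g., `squared_error`, `absolute_error`)	Function to measure the quality of a split (e.g., `squared_error`, `absolute_error`)	Function to measure the quality of a split (e.g., `friedman_mse`, `squared_error`); the loss to be optimized is set separately via `loss`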
`splitter`	Strategy used to choose the split at each node (e.g., `best`, `random`)	-	-
`n_estimators`	-	Number of trees in the forest	Number of boosting stages to be run
`bootstrap`	-	Whether bootstrap samples are used when building trees	-
`learning_rate`	-	-	Shrinks the contribution of each tree by `learning_rate`

Machine Learning in Risk and Finance

Developing a Residual Value Risk Model

About this Course¶

What you will learn¶

What is Machine Learning?¶

Application of Machine Learning in Risk Management¶

Why Use Machine Learning?¶

A real-world use case: Residual Value Risk Modelling¶

Developing a Machine Learning Model¶

Understanding the Problem¶

Data Preprocessing¶

Data Cleaning¶

Train-Test Split¶

Handling Missing Values¶

year¶

manufacturer¶

model¶

paint_color¶

cylinders, fuel, transmission, drive, type¶

odometer¶

Feature Engineering¶

Feature Selection¶

Encoding Categorical Variables¶

Model Training¶

Decision Tree¶

Random Forest¶

Gradient Boosting¶

Model Fine Tuning¶

Model Evaluation¶

Model Deployment¶

FastAPI for deployment¶

Streamlit for deployment¶

Hyperparameter	Optimal value
`learning_rate`	0.2
`max_depth`	15
`max_features`	'sqrt'
`min_samples_leaf`	8
`n_estimators`	251