In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import pandas as pd
import numpy as np

import plottingtools as pt
from importlib import reload  
pt = reload(pt)
pt.texon()

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

import ipywidgets as widgets
from ipywidgets import interact

In [2]:
# Create a random dataset
np.random.seed(0)

X = np.random.uniform(0,10,100)
y = np.sin(X)
y += np.random.normal(0,0.3,100)
X = [[x] for x in X]


# Regression Tree

## Train on complete dataset

In [3]:
def plottingfunction(k):
    regr = DecisionTreeRegressor(max_depth=k)
    regr.fit(X, y)
    
    X_pred = np.arange(0.0, 10, 0.01)[:, np.newaxis]
    y_pred = regr.predict(X_pred)

    fig, ax = pt.singleplot()
    pt.oligoscatter(ax, X, y,c="C0", label="Data")
    pt.majorline(ax, X_pred, y_pred, color="cornflowerblue", label="Prediction")
    pt.labels(ax, "$x$", "$y$")
    pt.legend(ax)
    pt.ticklabelsize(ax)
    pt.despine(ax)
    
k_slider = widgets.FloatSlider(value=1, min=1, max=15, step=1)

interact(plottingfunction, k = k_slider);

interactive(children=(FloatSlider(value=1.0, description='k', max=15.0, min=1.0, step=1.0), Output()), _dom_cl…

## Train on a subset of the data

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

def plottingfunction2(k):
    regr = DecisionTreeRegressor(max_depth=k)
    regr.fit(X_train, y_train)
    

    fig, ax = pt.singleplot()
    
    
    # Training data
    pt.oligoscatter(ax, X_train, y_train, c="C0", label="Train")

    # Learned model
    X_pred = np.arange(0.0, 10, 0.01)[:, np.newaxis]
    y_pred = regr.predict(X_pred)
    pt.majorline(ax, X_pred, y_pred, color="cornflowerblue", label="Prediction")

    # Test data
    pt.oligoscatter(ax, X_test, y_test, c="C1", label="Test")
    
    # Residuals
    y_train_pred = regr.predict(X_train)
    resids = y_train - y_train_pred
    print("MSE train: ", np.mean(resids**2))
    y_test_pred = regr.predict(X_test)
    resids = y_test - y_test_pred
    print("MSE test: ", np.mean(resids**2))

    
    pt.labels(ax, "$x$", "$y$")
    pt.legend(ax)
    pt.ticklabelsize(ax)
    pt.despine(ax)    


interact(plottingfunction2, k = k_slider);


interactive(children=(FloatSlider(value=1.0, description='k', max=15.0, min=1.0, step=1.0), Output()), _dom_cl…

## Model Performance for different k

In [25]:
ks = np.arange(1,16,1)
mse_train = []
mse_test = []

for k in ks:
    regr = DecisionTreeRegressor(max_depth=k)
    regr.fit(X_train, y_train)
    
    # Residuals
    y_train_pred = regr.predict(X_train)
    resids = y_train - y_train_pred
    mse_train.append(np.mean(resids**2))
    y_test_pred = regr.predict(X_test)
    resids = y_test - y_test_pred
    mse_test.append(np.mean(resids**2))
    
def plottingfunction3(k):
    fig, ax = pt.singleplot()    
    pt.majorline(ax,ks, mse_train, label="MSE Train")
    #ax2 = ax.twinx()
    pt.majorline(ax,ks, mse_test, label="MSE Test")
    pt.labels(ax, "$\mathrm{Max. \, Depth \, k}$",\
              "$\mathrm{Mean \, Squared \, Error}$")
    pt.legend(ax)
    pt.ticklabelsize(ax)
    pt.despine(ax)
    pt.limits(ax, (0,15), (0,0.4))
    pt.lines(ax, "x", [k], alpha=.5, linestyle=":")


In [26]:

interact(plottingfunction3, k = k_slider);



interactive(children=(FloatSlider(value=1.0, description='k', max=15.0, min=1.0, step=1.0), Output()), _dom_cl…