{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Simple regression model\n",
    "\n",
    "Does money make people happier? Simple version without data splitting."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Import data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 172,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Load the data from GitHub\n",
    "LINK = \"https://raw.githubusercontent.com/kirenz/datasets/master/oecd_gdp.csv\"\n",
    "df = pd.read_csv(LINK)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Data structure"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 173,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Country</th>\n",
       "      <th>GDP per capita</th>\n",
       "      <th>Life satisfaction</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Russia</td>\n",
       "      <td>9054.914</td>\n",
       "      <td>6.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Turkey</td>\n",
       "      <td>9437.372</td>\n",
       "      <td>5.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Hungary</td>\n",
       "      <td>12239.894</td>\n",
       "      <td>4.9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Poland</td>\n",
       "      <td>12495.334</td>\n",
       "      <td>5.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Slovak Republic</td>\n",
       "      <td>15991.736</td>\n",
       "      <td>6.1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           Country  GDP per capita  Life satisfaction\n",
       "0           Russia        9054.914                6.0\n",
       "1           Turkey        9437.372                5.6\n",
       "2          Hungary       12239.894                4.9\n",
       "3           Poland       12495.334                5.8\n",
       "4  Slovak Republic       15991.736                6.1"
      ]
     },
     "execution_count": 173,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 174,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 29 entries, 0 to 28\n",
      "Data columns (total 3 columns):\n",
      " #   Column             Non-Null Count  Dtype  \n",
      "---  ------             --------------  -----  \n",
      " 0   Country            29 non-null     object \n",
      " 1   GDP per capita     29 non-null     float64\n",
      " 2   Life satisfaction  29 non-null     float64\n",
      "dtypes: float64(2), object(1)\n",
      "memory usage: 824.0+ bytes\n"
     ]
    }
   ],
   "source": [
    "df.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Data corrections"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 175,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>country</th>\n",
       "      <th>gdp_per_capita</th>\n",
       "      <th>life_satisfaction</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Russia</td>\n",
       "      <td>9054.914</td>\n",
       "      <td>6.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Turkey</td>\n",
       "      <td>9437.372</td>\n",
       "      <td>5.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Hungary</td>\n",
       "      <td>12239.894</td>\n",
       "      <td>4.9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Poland</td>\n",
       "      <td>12495.334</td>\n",
       "      <td>5.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Slovak Republic</td>\n",
       "      <td>15991.736</td>\n",
       "      <td>6.1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           country  gdp_per_capita  life_satisfaction\n",
       "0           Russia        9054.914                6.0\n",
       "1           Turkey        9437.372                5.6\n",
       "2          Hungary       12239.894                4.9\n",
       "3           Poland       12495.334                5.8\n",
       "4  Slovak Republic       15991.736                6.1"
      ]
     },
     "execution_count": 175,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Change column names (lower case and spaces to underscore)\n",
    "df.columns = df.columns.str.lower().str.replace(' ', '_')\n",
    "\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Variable lists\n",
    "\n",
    "Prepare the data for later use"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 176,
   "metadata": {},
   "outputs": [],
   "source": [
    "# define outcome variable as y_label\n",
    "y_label = 'life_satisfaction'\n",
    "\n",
    "# select features\n",
    "X = df[[\"gdp_per_capita\"]]\n",
    "\n",
    "# create response\n",
    "y = df[y_label]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Data splitting"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 177,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, \n",
    "                                                    test_size=0.2, \n",
    "                                                    shuffle=True,\n",
    "                                                    random_state=42)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Investigate the data:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 178,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((23, 1), (23,))"
      ]
     },
     "execution_count": 178,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_train.shape, y_train.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 179,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>gdp_per_capita</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>43724.031</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>9054.914</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    gdp_per_capita\n",
       "21       43724.031\n",
       "0         9054.914"
      ]
     },
     "execution_count": 179,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_train.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 180,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((6, 1), (6,))"
      ]
     },
     "execution_count": 180,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_test.shape, y_test.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We make a copy of the training data since we don’t want to alter our data during data exploration. We will use this data for our exploratory data analysis."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 181,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_train = pd.DataFrame(X_train.copy())\n",
    "df_train = df_train.join(pd.DataFrame(y_train))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 182,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>gdp_per_capita</th>\n",
       "      <th>life_satisfaction</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>43724.031</td>\n",
       "      <td>6.9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>9054.914</td>\n",
       "      <td>6.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    gdp_per_capita  life_satisfaction\n",
       "21       43724.031                6.9\n",
       "0         9054.914                6.0"
      ]
     },
     "execution_count": 182,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_train.head(2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Data exploration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 183,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "<div id=\"altair-viz-f968ae0700a3474d8e21c36254e91fc2\"></div>\n",
       "<script type=\"text/javascript\">\n",
       "  var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n",
       "  (function(spec, embedOpt){\n",
       "    let outputDiv = document.currentScript.previousElementSibling;\n",
       "    if (outputDiv.id !== \"altair-viz-f968ae0700a3474d8e21c36254e91fc2\") {\n",
       "      outputDiv = document.getElementById(\"altair-viz-f968ae0700a3474d8e21c36254e91fc2\");\n",
       "    }\n",
       "    const paths = {\n",
       "      \"vega\": \"https://cdn.jsdelivr.net/npm//vega@5?noext\",\n",
       "      \"vega-lib\": \"https://cdn.jsdelivr.net/npm//vega-lib?noext\",\n",
       "      \"vega-lite\": \"https://cdn.jsdelivr.net/npm//vega-lite@4.17.0?noext\",\n",
       "      \"vega-embed\": \"https://cdn.jsdelivr.net/npm//vega-embed@6?noext\",\n",
       "    };\n",
       "\n",
       "    function maybeLoadScript(lib, version) {\n",
       "      var key = `${lib.replace(\"-\", \"\")}_version`;\n",
       "      return (VEGA_DEBUG[key] == version) ?\n",
       "        Promise.resolve(paths[lib]) :\n",
       "        new Promise(function(resolve, reject) {\n",
       "          var s = document.createElement('script');\n",
       "          document.getElementsByTagName(\"head\")[0].appendChild(s);\n",
       "          s.async = true;\n",
       "          s.onload = () => {\n",
       "            VEGA_DEBUG[key] = version;\n",
       "            return resolve(paths[lib]);\n",
       "          };\n",
       "          s.onerror = () => reject(`Error loading script: ${paths[lib]}`);\n",
       "          s.src = paths[lib];\n",
       "        });\n",
       "    }\n",
       "\n",
       "    function showError(err) {\n",
       "      outputDiv.innerHTML = `<div class=\"error\" style=\"color:red;\">${err}</div>`;\n",
       "      throw err;\n",
       "    }\n",
       "\n",
       "    function displayChart(vegaEmbed) {\n",
       "      vegaEmbed(outputDiv, spec, embedOpt)\n",
       "        .catch(err => showError(`Javascript Error: ${err.message}<br>This usually means there's a typo in your chart specification. See the javascript console for the full traceback.`));\n",
       "    }\n",
       "\n",
       "    if(typeof define === \"function\" && define.amd) {\n",
       "      requirejs.config({paths});\n",
       "      require([\"vega-embed\"], displayChart, err => showError(`Error loading script: ${err.message}`));\n",
       "    } else {\n",
       "      maybeLoadScript(\"vega\", \"5\")\n",
       "        .then(() => maybeLoadScript(\"vega-lite\", \"4.17.0\"))\n",
       "        .then(() => maybeLoadScript(\"vega-embed\", \"6\"))\n",
       "        .catch(showError)\n",
       "        .then(() => displayChart(vegaEmbed));\n",
       "    }\n",
       "  })({\"config\": {\"view\": {\"continuousWidth\": 400, \"continuousHeight\": 300}}, \"data\": {\"name\": \"data-aa0134a50bd1d448e8e252797a5f1a5a\"}, \"mark\": {\"type\": \"circle\", \"size\": 100}, \"encoding\": {\"x\": {\"field\": \"gdp_per_capita\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"life_satisfaction\", \"type\": \"quantitative\"}}, \"selection\": {\"selector003\": {\"type\": \"interval\", \"bind\": \"scales\", \"encodings\": [\"x\", \"y\"]}}, \"$schema\": \"https://vega.github.io/schema/vega-lite/v4.17.0.json\", \"datasets\": {\"data-aa0134a50bd1d448e8e252797a5f1a5a\": [{\"gdp_per_capita\": 43724.031, \"life_satisfaction\": 6.9}, {\"gdp_per_capita\": 9054.914, \"life_satisfaction\": 6.0}, {\"gdp_per_capita\": 51350.744000000006, \"life_satisfaction\": 7.0}, {\"gdp_per_capita\": 35343.336, \"life_satisfaction\": 7.4}, {\"gdp_per_capita\": 37675.006, \"life_satisfaction\": 6.5}, {\"gdp_per_capita\": 29866.581, \"life_satisfaction\": 6.0}, {\"gdp_per_capita\": 40996.511, \"life_satisfaction\": 7.0}, {\"gdp_per_capita\": 9437.372, \"life_satisfaction\": 5.6}, {\"gdp_per_capita\": 15991.736, \"life_satisfaction\": 6.1}, {\"gdp_per_capita\": 17288.083, \"life_satisfaction\": 5.6}, {\"gdp_per_capita\": 12239.893999999998, \"life_satisfaction\": 4.9}, {\"gdp_per_capita\": 50854.583, \"life_satisfaction\": 7.5}, {\"gdp_per_capita\": 12495.334, \"life_satisfaction\": 5.8}, {\"gdp_per_capita\": 49866.266, \"life_satisfaction\": 7.2}, {\"gdp_per_capita\": 50961.865, \"life_satisfaction\": 7.3}, {\"gdp_per_capita\": 41973.988, \"life_satisfaction\": 7.4}, {\"gdp_per_capita\": 55805.204000000005, \"life_satisfaction\": 7.2}, {\"gdp_per_capita\": 43603.115, \"life_satisfaction\": 7.3}, {\"gdp_per_capita\": 19121.592, \"life_satisfaction\": 5.1}, {\"gdp_per_capita\": 27195.197, \"life_satisfaction\": 5.8}, {\"gdp_per_capita\": 37044.891, \"life_satisfaction\": 7.3}, {\"gdp_per_capita\": 43331.961, \"life_satisfaction\": 7.3}, {\"gdp_per_capita\": 18064.288, \"life_satisfaction\": 4.8}]}}, {\"mode\": \"vega-lite\"});\n",
       "</script>"
      ],
      "text/plain": [
       "alt.Chart(...)"
      ]
     },
     "execution_count": 183,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%matplotlib inline\n",
    "import altair as alt\n",
    "\n",
    "# Visualize the data\n",
    "alt.Chart(df_train).mark_circle(size=100).encode(\n",
    "    x='gdp_per_capita:Q',\n",
    "    y='life_satisfaction:Q',\n",
    ").interactive()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 184,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.linear_model import LinearRegression\n",
    "\n",
    "reg = LinearRegression()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 185,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.neighbors import KNeighborsRegressor\n",
    "\n",
    "reg2 = KNeighborsRegressor(n_neighbors=3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Training & validation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 186,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import cross_val_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 187,
   "metadata": {},
   "outputs": [],
   "source": [
    "# cross-validation with 5 folds\n",
    "scores = cross_val_score(reg, X_train, y_train, cv=5, scoring='neg_mean_squared_error') *-1\n",
    "\n",
    "scores2 = cross_val_score(reg2, X_train, y_train, cv=5, scoring='neg_mean_squared_error') *-1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 188,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<style type=\"text/css\">\n",
       "#T_4a955_row0_col0 {\n",
       "  background-color: #3282be;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_4a955_row0_col1 {\n",
       "  background-color: #084b93;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_4a955_row1_col0 {\n",
       "  background-color: #f5f9fe;\n",
       "  color: #000000;\n",
       "}\n",
       "#T_4a955_row1_col1, #T_4a955_row4_col0 {\n",
       "  background-color: #08306b;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_4a955_row2_col0, #T_4a955_row3_col1 {\n",
       "  background-color: #f7fbff;\n",
       "  color: #000000;\n",
       "}\n",
       "#T_4a955_row2_col1 {\n",
       "  background-color: #2b7bba;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "#T_4a955_row3_col0 {\n",
       "  background-color: #7fb9da;\n",
       "  color: #000000;\n",
       "}\n",
       "#T_4a955_row4_col1 {\n",
       "  background-color: #5aa2cf;\n",
       "  color: #f1f1f1;\n",
       "}\n",
       "</style>\n",
       "<table id=\"T_4a955\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th class=\"blank level0\" >&nbsp;</th>\n",
       "      <th id=\"T_4a955_level0_col0\" class=\"col_heading level0 col0\" >lr</th>\n",
       "      <th id=\"T_4a955_level0_col1\" class=\"col_heading level0 col1\" >knn</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th id=\"T_4a955_level0_row0\" class=\"row_heading level0 row0\" >1</th>\n",
       "      <td id=\"T_4a955_row0_col0\" class=\"data row0 col0\" >0.335187</td>\n",
       "      <td id=\"T_4a955_row0_col1\" class=\"data row0 col1\" >0.311778</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_4a955_level0_row1\" class=\"row_heading level0 row1\" >2</th>\n",
       "      <td id=\"T_4a955_row1_col0\" class=\"data row1 col0\" >0.101782</td>\n",
       "      <td id=\"T_4a955_row1_col1\" class=\"data row1 col1\" >0.338444</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_4a955_level0_row2\" class=\"row_heading level0 row2\" >3</th>\n",
       "      <td id=\"T_4a955_row2_col0\" class=\"data row2 col0\" >0.097167</td>\n",
       "      <td id=\"T_4a955_row2_col1\" class=\"data row2 col1\" >0.265333</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_4a955_level0_row3\" class=\"row_heading level0 row3\" >4</th>\n",
       "      <td id=\"T_4a955_row3_col0\" class=\"data row3 col0\" >0.252955</td>\n",
       "      <td id=\"T_4a955_row3_col1\" class=\"data row3 col1\" >0.082500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_4a955_level0_row4\" class=\"row_heading level0 row4\" >5</th>\n",
       "      <td id=\"T_4a955_row4_col0\" class=\"data row4 col0\" >0.443893</td>\n",
       "      <td id=\"T_4a955_row4_col1\" class=\"data row4 col1\" >0.223889</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n"
      ],
      "text/plain": [
       "<pandas.io.formats.style.Styler at 0x18646f430>"
      ]
     },
     "execution_count": 188,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# store cross-validation scores\n",
    "df_scores = pd.DataFrame({\"lr\": scores, \n",
    "                          \"knn\": scores2})\n",
    "\n",
    "# reset index to match the number of folds\n",
    "df_scores.index += 1\n",
    "\n",
    "# print dataframe\n",
    "df_scores.style.background_gradient(cmap='Blues')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 189,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "<div id=\"altair-viz-4dcf33aeb30f4908a09d7d5aaf7efdd7\"></div>\n",
       "<script type=\"text/javascript\">\n",
       "  var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n",
       "  (function(spec, embedOpt){\n",
       "    let outputDiv = document.currentScript.previousElementSibling;\n",
       "    if (outputDiv.id !== \"altair-viz-4dcf33aeb30f4908a09d7d5aaf7efdd7\") {\n",
       "      outputDiv = document.getElementById(\"altair-viz-4dcf33aeb30f4908a09d7d5aaf7efdd7\");\n",
       "    }\n",
       "    const paths = {\n",
       "      \"vega\": \"https://cdn.jsdelivr.net/npm//vega@5?noext\",\n",
       "      \"vega-lib\": \"https://cdn.jsdelivr.net/npm//vega-lib?noext\",\n",
       "      \"vega-lite\": \"https://cdn.jsdelivr.net/npm//vega-lite@4.17.0?noext\",\n",
       "      \"vega-embed\": \"https://cdn.jsdelivr.net/npm//vega-embed@6?noext\",\n",
       "    };\n",
       "\n",
       "    function maybeLoadScript(lib, version) {\n",
       "      var key = `${lib.replace(\"-\", \"\")}_version`;\n",
       "      return (VEGA_DEBUG[key] == version) ?\n",
       "        Promise.resolve(paths[lib]) :\n",
       "        new Promise(function(resolve, reject) {\n",
       "          var s = document.createElement('script');\n",
       "          document.getElementsByTagName(\"head\")[0].appendChild(s);\n",
       "          s.async = true;\n",
       "          s.onload = () => {\n",
       "            VEGA_DEBUG[key] = version;\n",
       "            return resolve(paths[lib]);\n",
       "          };\n",
       "          s.onerror = () => reject(`Error loading script: ${paths[lib]}`);\n",
       "          s.src = paths[lib];\n",
       "        });\n",
       "    }\n",
       "\n",
       "    function showError(err) {\n",
       "      outputDiv.innerHTML = `<div class=\"error\" style=\"color:red;\">${err}</div>`;\n",
       "      throw err;\n",
       "    }\n",
       "\n",
       "    function displayChart(vegaEmbed) {\n",
       "      vegaEmbed(outputDiv, spec, embedOpt)\n",
       "        .catch(err => showError(`Javascript Error: ${err.message}<br>This usually means there's a typo in your chart specification. See the javascript console for the full traceback.`));\n",
       "    }\n",
       "\n",
       "    if(typeof define === \"function\" && define.amd) {\n",
       "      requirejs.config({paths});\n",
       "      require([\"vega-embed\"], displayChart, err => showError(`Error loading script: ${err.message}`));\n",
       "    } else {\n",
       "      maybeLoadScript(\"vega\", \"5\")\n",
       "        .then(() => maybeLoadScript(\"vega-lite\", \"4.17.0\"))\n",
       "        .then(() => maybeLoadScript(\"vega-embed\", \"6\"))\n",
       "        .catch(showError)\n",
       "        .then(() => displayChart(vegaEmbed));\n",
       "    }\n",
       "  })({\"config\": {\"view\": {\"continuousWidth\": 400, \"continuousHeight\": 300}}, \"repeat\": {\"layer\": [\"lr\", \"knn\"]}, \"spec\": {\"data\": {\"name\": \"data-77556653c5f502f1db5f098394ed4af6\"}, \"mark\": {\"type\": \"line\", \"point\": {}}, \"encoding\": {\"color\": {\"datum\": {\"repeat\": \"layer\"}}, \"x\": {\"axis\": {\"tickCount\": 5}, \"bin\": false, \"field\": \"index\", \"title\": \"Fold\", \"type\": \"quantitative\"}, \"y\": {\"aggregate\": \"mean\", \"field\": {\"repeat\": \"layer\"}, \"title\": \"Mean squared error (MSE)\"}}}, \"$schema\": \"https://vega.github.io/schema/vega-lite/v4.17.0.json\", \"datasets\": {\"data-77556653c5f502f1db5f098394ed4af6\": [{\"index\": 1, \"lr\": 0.33518676415781357, \"knn\": 0.31177777777777793}, {\"index\": 2, \"lr\": 0.10178156733780966, \"knn\": 0.3384444444444441}, {\"index\": 3, \"lr\": 0.09716650227981585, \"knn\": 0.26533333333333275}, {\"index\": 4, \"lr\": 0.2529549399440394, \"knn\": 0.08250000000000002}, {\"index\": 5, \"lr\": 0.44389326579833466, \"knn\": 0.22388888888888864}]}}, {\"mode\": \"vega-lite\"});\n",
       "</script>"
      ],
      "text/plain": [
       "alt.RepeatChart(...)"
      ]
     },
     "execution_count": 189,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "alt.Chart(df_scores.reset_index()).mark_line(\n",
    "     point=alt.OverlayMarkDef()\n",
    ").encode(\n",
    "    x=alt.X(\"index\", bin=False, title=\"Fold\", axis=alt.Axis(tickCount=5)),\n",
    "    y=alt.Y(\n",
    "        alt.repeat(\"layer\"), aggregate=\"mean\", title=\"Mean squared error (MSE)\"\n",
    "    ),\n",
    "    color=alt.datum(alt.repeat(\"layer\")),\n",
    ").repeat(layer=[\"lr\", \"knn\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 190,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>count</th>\n",
       "      <th>mean</th>\n",
       "      <th>std</th>\n",
       "      <th>min</th>\n",
       "      <th>25%</th>\n",
       "      <th>50%</th>\n",
       "      <th>75%</th>\n",
       "      <th>max</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>lr</th>\n",
       "      <td>5.0</td>\n",
       "      <td>0.246197</td>\n",
       "      <td>0.150095</td>\n",
       "      <td>0.097167</td>\n",
       "      <td>0.101782</td>\n",
       "      <td>0.252955</td>\n",
       "      <td>0.335187</td>\n",
       "      <td>0.443893</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>knn</th>\n",
       "      <td>5.0</td>\n",
       "      <td>0.244389</td>\n",
       "      <td>0.100567</td>\n",
       "      <td>0.082500</td>\n",
       "      <td>0.223889</td>\n",
       "      <td>0.265333</td>\n",
       "      <td>0.311778</td>\n",
       "      <td>0.338444</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     count      mean       std       min       25%       50%       75%  \\\n",
       "lr     5.0  0.246197  0.150095  0.097167  0.101782  0.252955  0.335187   \n",
       "knn    5.0  0.244389  0.100567  0.082500  0.223889  0.265333  0.311778   \n",
       "\n",
       "          max  \n",
       "lr   0.443893  \n",
       "knn  0.338444  "
      ]
     },
     "execution_count": 190,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_scores.describe().T"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The performance difference between the two models is relatively small. Let's assume we are interested in the parameters of the linear regression and therefore choose the linear regression."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Tuning"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We will cover model tuning (hyperparameter tuning) in another notebook and skip this part. "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Final training"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Train your best model with the complete training data (without cross-validation)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 191,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<style>#sk-container-id-7 {color: black;background-color: white;}#sk-container-id-7 pre{padding: 0;}#sk-container-id-7 div.sk-toggleable {background-color: white;}#sk-container-id-7 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-7 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-7 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-7 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-7 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-7 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-7 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-7 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-7 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-7 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-7 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-7 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-7 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-7 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-7 div.sk-label:hover label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-7 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-7 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-7 div.sk-item {position: relative;z-index: 1;}#sk-container-id-7 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-7 div.sk-item::before, #sk-container-id-7 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-7 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-7 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-7 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-7 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-7 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-7 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-7 div.sk-label-container {text-align: center;}#sk-container-id-7 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-7 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-7\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>LinearRegression()</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-7\" type=\"checkbox\" checked><label for=\"sk-estimator-id-7\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">LinearRegression</label><div class=\"sk-toggleable__content\"><pre>LinearRegression()</pre></div></div></div></div></div>"
      ],
      "text/plain": [
       "LinearRegression()"
      ]
     },
     "execution_count": 191,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Fit the model\n",
    "reg.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 192,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " Intercept: 4.87 \n",
      " Slope: 0.00005\n"
     ]
    }
   ],
   "source": [
    "print(f' Intercept: {reg.intercept_:.3} \\n Slope: {reg.coef_[0]:.5f}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Test error"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Evaluate the final model on the test set. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 193,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.09021411430745645"
      ]
     },
     "execution_count": 193,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Prediction for our test data\n",
    "y_pred = reg.predict(X_test)\n",
    "\n",
    "# Mean squared error\n",
    "mean_squared_error(y_test, y_pred)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.9.12 ('base')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  },
  "nav_menu": {},
  "toc": {
   "navigate_menu": true,
   "number_sections": true,
   "sideBar": true,
   "threshold": 6,
   "toc_cell": false,
   "toc_section_display": "block",
   "toc_window_display": true
  },
  "toc_position": {
   "height": "616px",
   "left": "0px",
   "right": "20px",
   "top": "106px",
   "width": "213px"
  },
  "vscode": {
   "interpreter": {
    "hash": "463226f144cc21b006ce6927bfc93dd00694e52c8bc6857abb6e555b983749e9"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}