# Simple scikit-learn model

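Does money make people happier? This is a simple version without splitting the data into training and test sets, so every error metric below is computed on the same data the models were fitted on.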

## Data

### Import data

In [2]:
import pandas as pd

# Remote CSV file on GitHub that holds the data set used in this notebook
LINK = "https://raw.githubusercontent.com/kirenz/datasets/master/oecd_gdp.csv"

# Download the file and parse it into a DataFrame
df = pd.read_csv(filepath_or_buffer=LINK)

### Data structure

In [3]:
# Preview the first ten rows instead of dumping the whole frame;
# the frame's size and dtypes are reported by df.info() in the next cell.
df.head(10)

Unnamed: 0,Country,GDP per capita,Life satisfaction
0,Russia,9054.914,6.0
1,Turkey,9437.372,5.6
2,Hungary,12239.894,4.9
3,Poland,12495.334,5.8
4,Slovak Republic,15991.736,6.1
5,Estonia,17288.083,5.6
6,Greece,18064.288,4.8
7,Portugal,19121.592,5.1
8,Slovenia,20732.482,5.7
9,Spain,25864.721,6.5


In [4]:
# Print a summary of the frame: index range, column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29 entries, 0 to 28
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Country            29 non-null     object 
 1   GDP per capita     29 non-null     float64
 2   Life satisfaction  29 non-null     float64
dtypes: float64(2), object(1)
memory usage: 824.0+ bytes


### Data corrections

In [14]:
# Normalise the column names to snake_case:
# lower-case every name first, then turn each space into an underscore
df.columns = (
    df.columns
    .str.lower()
    .str.replace(' ', '_')
)

# Preview the first 5 rows to check the renamed columns
df.head()

Unnamed: 0,country,gdp_per_capita,life_satisfaction
0,Russia,9054.914,6.0
1,Turkey,9437.372,5.6
2,Hungary,12239.894,4.9
3,Poland,12495.334,5.8
4,Slovak Republic,15991.736,6.1


### Variable lists

Define the outcome label, the feature matrix `X` and the response vector `y` for later use.

In [15]:
# Name of the outcome (response) column
y_label = 'life_satisfaction'

# Response: the outcome column selected by its label
y = df[y_label]

# Features: a one-column DataFrame (selecting with a list keeps it two-dimensional)
X = df.loc[:, ["gdp_per_capita"]]

### Data exploration

In [16]:
%matplotlib inline
import altair as alt

# Interactive scatter plot: GDP per capita on the x-axis, life satisfaction on the y-axis.
# Points are coloured by country (legend hidden); hovering shows the three fields of a row.
alt.Chart(df).mark_circle(size=100).encode(
    x=alt.X('gdp_per_capita:Q'),
    y=alt.Y('life_satisfaction:Q'),
    color=alt.Color('country', legend=None),
    tooltip=['country', 'gdp_per_capita', 'life_satisfaction'],
).interactive()

## Linear regression model

In [18]:
from sklearn.linear_model import LinearRegression

In [19]:
# Select a linear regression model (created with its default settings; not fitted yet)
reg = LinearRegression()

### Training

In [20]:
# Fit the model on the full data set: X holds gdp_per_capita, y holds life_satisfaction
reg.fit(X, y)

In [22]:
# Model intercept: the predicted life satisfaction at a GDP per capita of 0
reg.intercept_

4.853052800266436

In [21]:
# Model coefficient: change in predicted life satisfaction per one-unit increase in GDP per capita
reg.coef_

array([4.91154459e-05])

### Prediction

In [23]:
# Prediction for our data (in-sample: these are the same rows the model was fitted on)
y_pred = reg.predict(X)

In [38]:
# Single-row frame for a GDP per capita of 50000; the column name
# matches the feature name the model was fitted with
X_new = pd.DataFrame([[50000]], columns=['gdp_per_capita'])

# Predicted life satisfaction for that GDP value
reg.predict(X_new)

array([7.30882509])

### Evaluation

In [30]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

In [33]:
# Mean squared error of the linear model (in-sample: y_pred was computed on the training rows)
mean_squared_error(y, y_pred)

0.18075033705835142

In [34]:
# Root mean squared error
# Take the square root of the MSE explicitly instead of passing `squared=False`:
# that argument was deprecated in scikit-learn 1.4 and removed in 1.6, so the
# previous call raises a TypeError on current releases. This form works on
# every scikit-learn version.
mean_squared_error(y, y_pred) ** 0.5

0.4251474297915388

In [32]:
# Mean absolute error of the linear model (in-sample)
mean_absolute_error(y, y_pred)

0.35530429427921734

## K-Nearest Neighbor Model

In [40]:
from sklearn.neighbors import KNeighborsRegressor

In [46]:
# Select a k-nearest-neighbours regression model that uses 2 neighbours per prediction
reg2 = KNeighborsRegressor(n_neighbors=2)

In [47]:
# Fit the KNN model on the same features and response as the linear model
reg2.fit(X, y)

In [48]:
# KNN predictions for our data (in-sample: the same rows the model was fitted on)
y_pred2 = reg2.predict(X)

In [49]:
# KNN prediction for the same new GDP value that was passed to the linear model
reg2.predict(X_new)

array([7.35])

In [50]:
# Mean squared error of the KNN model.
# NOTE(review): this is an in-sample error (y_pred2 comes from the training rows),
# so it probably flatters the KNN model relative to held-out data; confirm before comparing models.
mean_squared_error(y, y_pred2)

0.06181034482758619

In [51]:
# Mean absolute error of the KNN model (in-sample, like the MSE above)
mean_absolute_error(y, y_pred2)

0.20517241379310344