{ "cells": [ { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "# Data \n" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "## Setup" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [], "source": [ "%matplotlib inline\n", "import pandas as pd\n", "import numpy as np\n", "import seaborn as sns \n", "import matplotlib.pyplot as plt\n", "\n", "from statsmodels.stats.outliers_influence import variance_inflation_factor\n", "from statsmodels.tools.tools import add_constant \n", "\n", "sns.set_theme()" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "## Import data" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "scrolled": true, "slideshow": { "slide_type": "slide" } }, "outputs": [], "source": [ "ROOT = \"https://raw.githubusercontent.com/kirenz/modern-statistics/main/data/\"\n", "DATA = \"duke-forest.csv\"\n", "\n", "df = pd.read_csv(ROOT + DATA)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Data inspection" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | address | \n", "price | \n", "bed | \n", "bath | \n", "area | \n", "type | \n", "year_built | \n", "heating | \n", "cooling | \n", "parking | \n", "lot | \n", "hoa | \n", "url | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "1 Learned Pl, Durham, NC 27705 | \n", "1520000 | \n", "3 | \n", "4.0 | \n", "6040 | \n", "Single Family | \n", "1972 | \n", "Other, Gas | \n", "central | \n", "0 spaces | \n", "0.97 | \n", "NaN | \n", "https://www.zillow.com/homedetails/1-Learned-P... | \n", "
1 | \n", "1616 Pinecrest Rd, Durham, NC 27705 | \n", "1030000 | \n", "5 | \n", "4.0 | \n", "4475 | \n", "Single Family | \n", "1969 | \n", "Forced air, Gas | \n", "central | \n", "Carport, Covered | \n", "1.38 | \n", "NaN | \n", "https://www.zillow.com/homedetails/1616-Pinecr... | \n", "
2 | \n", "2418 Wrightwood Ave, Durham, NC 27705 | \n", "420000 | \n", "2 | \n", "3.0 | \n", "1745 | \n", "Single Family | \n", "1959 | \n", "Forced air, Gas | \n", "central | \n", "Garage - Attached, Covered | \n", "0.51 | \n", "NaN | \n", "https://www.zillow.com/homedetails/2418-Wright... | \n", "
3 | \n", "2527 Sevier St, Durham, NC 27705 | \n", "680000 | \n", "4 | \n", "3.0 | \n", "2091 | \n", "Single Family | \n", "1961 | \n", "Heat pump, Other, Electric, Gas | \n", "central | \n", "Carport, Covered | \n", "0.84 | \n", "NaN | \n", "https://www.zillow.com/homedetails/2527-Sevier... | \n", "
4 | \n", "2218 Myers St, Durham, NC 27707 | \n", "428500 | \n", "4 | \n", "3.0 | \n", "1772 | \n", "Single Family | \n", "2020 | \n", "Forced air, Gas | \n", "central | \n", "0 spaces | \n", "0.16 | \n", "NaN | \n", "https://www.zillow.com/homedetails/2218-Myers-... | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
93 | \n", "2507 Sevier St, Durham, NC 27705 | \n", "541000 | \n", "4 | \n", "4.0 | \n", "2740 | \n", "Single Family | \n", "1960 | \n", "Forced air, Heat pump, Gas | \n", "central | \n", "Carport, Covered | \n", "0.51 | \n", "NaN | \n", "https://www.zillow.com/homedetails/2507-Sevier... | \n", "
94 | \n", "1207 Woodburn Rd, Durham, NC 27705 | \n", "473000 | \n", "3 | \n", "3.0 | \n", "2171 | \n", "Single Family | \n", "1955 | \n", "Forced air, Electric, Gas | \n", "other | \n", "0 spaces | \n", "0.61 | \n", "NaN | \n", "https://www.zillow.com/homedetails/1207-Woodbu... | \n", "
95 | \n", "3008 Montgomery St, Durham, NC 27705 | \n", "490000 | \n", "4 | \n", "4.0 | \n", "2972 | \n", "Single Family | \n", "1984 | \n", "Forced air, Electric, Gas | \n", "central | \n", "Garage - Attached, Off-street, Covered | \n", "0.65 | \n", "NaN | \n", "https://www.zillow.com/homedetails/3008-Montgo... | \n", "
96 | \n", "1614 Pinecrest Rd, Durham, NC 27705 | \n", "815000 | \n", "4 | \n", "4.0 | \n", "3904 | \n", "Single Family | \n", "1970 | \n", "Forced air, Gas | \n", "other | \n", "Garage - Attached, Garage - Detached, Covered | \n", "1.47 | \n", "NaN | \n", "https://www.zillow.com/homedetails/1614-Pinecr... | \n", "
97 | \n", "2708 Circle Dr, Durham, NC 27705 | \n", "674500 | \n", "4 | \n", "4.0 | \n", "3766 | \n", "Single Family | \n", "1955 | \n", "Forced air, Electric, Gas | \n", "other | \n", "0 spaces | \n", "0.73 | \n", "NaN | \n", "https://www.zillow.com/homedetails/2708-Circle... | \n", "
98 rows × 13 columns
\n", "\n", " | count | \n", "unique | \n", "top | \n", "freq | \n", "
---|---|---|---|---|
type | \n", "97 | \n", "1 | \n", "Single Family | \n", "97 | \n", "
heating | \n", "97 | \n", "19 | \n", "Forced air, Gas | \n", "34 | \n", "
cooling | \n", "97 | \n", "2 | \n", "other | \n", "52 | \n", "
parking | \n", "97 | \n", "19 | \n", "0 spaces | \n", "42 | \n", "
\n", " | price | \n", "bed | \n", "bath | \n", "area | \n", "year_built | \n", "cooling | \n", "lot | \n", "
---|---|---|---|---|---|---|---|
0 | \n", "1520000 | \n", "3 | \n", "4.0 | \n", "6040 | \n", "1972 | \n", "central | \n", "0.97 | \n", "
1 | \n", "1030000 | \n", "5 | \n", "4.0 | \n", "4475 | \n", "1969 | \n", "central | \n", "1.38 | \n", "
2 | \n", "420000 | \n", "2 | \n", "3.0 | \n", "1745 | \n", "1959 | \n", "central | \n", "0.51 | \n", "
3 | \n", "680000 | \n", "4 | \n", "3.0 | \n", "2091 | \n", "1961 | \n", "central | \n", "0.84 | \n", "
4 | \n", "428500 | \n", "4 | \n", "3.0 | \n", "1772 | \n", "2020 | \n", "central | \n", "0.16 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
93 | \n", "541000 | \n", "4 | \n", "4.0 | \n", "2740 | \n", "1960 | \n", "central | \n", "0.51 | \n", "
94 | \n", "473000 | \n", "3 | \n", "3.0 | \n", "2171 | \n", "1955 | \n", "other | \n", "0.61 | \n", "
95 | \n", "490000 | \n", "4 | \n", "4.0 | \n", "2972 | \n", "1984 | \n", "central | \n", "0.65 | \n", "
96 | \n", "815000 | \n", "4 | \n", "4.0 | \n", "3904 | \n", "1970 | \n", "other | \n", "1.47 | \n", "
97 | \n", "674500 | \n", "4 | \n", "4.0 | \n", "3766 | \n", "1955 | \n", "other | \n", "0.73 | \n", "
97 rows × 7 columns
\n", "\n", " | price | \n", "bed | \n", "bath | \n", "area | \n", "year_built | \n", "cooling | \n", "lot | \n", "
---|---|---|---|---|---|---|---|
26 | \n", "385000 | \n", "3 | \n", "2.0 | \n", "1831 | \n", "1951 | \n", "central | \n", "0.29 | \n", "
85 | \n", "485000 | \n", "4 | \n", "3.0 | \n", "2609 | \n", "1962 | \n", "other | \n", "0.52 | \n", "
2 | \n", "420000 | \n", "2 | \n", "3.0 | \n", "1745 | \n", "1959 | \n", "central | \n", "0.51 | \n", "
55 | \n", "150000 | \n", "3 | \n", "1.0 | \n", "1734 | \n", "1945 | \n", "other | \n", "0.16 | \n", "
69 | \n", "105000 | \n", "2 | \n", "1.0 | \n", "1094 | \n", "1940 | \n", "other | \n", "0.26 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
96 | \n", "815000 | \n", "4 | \n", "4.0 | \n", "3904 | \n", "1970 | \n", "other | \n", "1.47 | \n", "
70 | \n", "520000 | \n", "4 | \n", "3.0 | \n", "2637 | \n", "1968 | \n", "other | \n", "0.65 | \n", "
20 | \n", "270000 | \n", "3 | \n", "3.0 | \n", "1416 | \n", "1990 | \n", "other | \n", "0.36 | \n", "
92 | \n", "590000 | \n", "5 | \n", "3.0 | \n", "3323 | \n", "1980 | \n", "other | \n", "0.43 | \n", "
73 | \n", "592000 | \n", "3 | \n", "2.0 | \n", "2378 | \n", "1960 | \n", "other | \n", "0.75 | \n", "
78 rows × 7 columns
\n", "\n", " | count | \n", "mean | \n", "std | \n", "min | \n", "25% | \n", "50% | \n", "75% | \n", "max | \n", "
---|---|---|---|---|---|---|---|---|
price | \n", "78.0 | \n", "560762.18 | \n", "243254.08 | \n", "95000.00 | \n", "421250.00 | \n", "537500.00 | \n", "650000.00 | \n", "1520000.00 | \n", "
bed | \n", "78.0 | \n", "3.81 | \n", "0.74 | \n", "2.00 | \n", "3.00 | \n", "4.00 | \n", "4.00 | \n", "6.00 | \n", "
bath | \n", "78.0 | \n", "3.10 | \n", "0.92 | \n", "1.00 | \n", "2.50 | \n", "3.00 | \n", "4.00 | \n", "5.00 | \n", "
area | \n", "78.0 | \n", "2831.40 | \n", "986.38 | \n", "1094.00 | \n", "2095.25 | \n", "2745.00 | \n", "3261.75 | \n", "6178.00 | \n", "
year_built | \n", "78.0 | \n", "1965.82 | \n", "16.80 | \n", "1923.00 | \n", "1956.25 | \n", "1961.50 | \n", "1971.50 | \n", "2020.00 | \n", "
lot | \n", "78.0 | \n", "0.59 | \n", "0.23 | \n", "0.15 | \n", "0.45 | \n", "0.56 | \n", "0.69 | \n", "1.47 | \n", "
\n", " | price | \n", "bed | \n", "bath | \n", "area | \n", "year_built | \n", "lot | \n", "
---|---|---|---|---|---|---|
price | \n", "1.000000 | \n", "0.446668 | \n", "0.593686 | \n", "0.680012 | \n", "0.248102 | \n", "0.537264 | \n", "
bed | \n", "0.446668 | \n", "1.000000 | \n", "0.599660 | \n", "0.560258 | \n", "0.216696 | \n", "0.248166 | \n", "
bath | \n", "0.593686 | \n", "0.599660 | \n", "1.000000 | \n", "0.659879 | \n", "0.351917 | \n", "0.335490 | \n", "
area | \n", "0.680012 | \n", "0.560258 | \n", "0.659879 | \n", "1.000000 | \n", "0.165495 | \n", "0.412836 | \n", "
year_built | \n", "0.248102 | \n", "0.216696 | \n", "0.351917 | \n", "0.165495 | \n", "1.000000 | \n", "-0.047352 | \n", "
lot | \n", "0.537264 | \n", "0.248166 | \n", "0.335490 | \n", "0.412836 | \n", "-0.047352 | \n", "1.000000 | \n", "
\n", " | VIF Factor | \n", "Feature | \n", "
---|---|---|
0 | \n", "28.52 | \n", "const | \n", "
1 | \n", "1.74 | \n", "bed | \n", "
2 | \n", "2.17 | \n", "bath | \n", "
3 | \n", "2.14 | \n", "area | \n", "
4 | \n", "1.19 | \n", "lot | \n", "