{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Essential methods\n", "\n", "This introduction to pandas is based on the [official pandas documentation](https://pandas.pydata.org/docs/user_guide/10min.html). It contains information from the first steps in pandas tutorial but also covers more advanced topics.\n", "\n", "\n", "Customarily, we import as follows:" ] }, { "cell_type": "code", "execution_count": 140, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Object creation\n", "\n", "Creating a [Series](https://pandas.pydata.org/docs/reference/api/pandas.Series.html#pandas.Series) by passing a list of values, letting pandas create a default integer index:" ] }, { "cell_type": "code", "execution_count": 141, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 1.0\n", "1 3.0\n", "2 5.0\n", "3 NaN\n", "4 6.0\n", "5 8.0\n", "dtype: float64" ] }, "execution_count": 141, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s = pd.Series([1, 3, 5, np.nan, 6, 8])\n", "\n", "s" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Creating a [DataFrame](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html#pandas.DataFrame) by passing a NumPy array, with a datetime index and labeled columns:" ] }, { "cell_type": "code", "execution_count": 142, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DatetimeIndex(['2021-01-01', '2021-01-02', '2021-01-03', '2021-01-04',\n", " '2021-01-05', '2021-01-06'],\n", " dtype='datetime64[ns]', freq='D')" ] }, "execution_count": 142, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dates = pd.date_range(\"20210101\", periods=6)\n", "\n", "dates" ] }, { "cell_type": "code", "execution_count": 143, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCD
2021-01-01-1.224629-0.479631-0.1824380.062465
2021-01-02-2.204339-0.750873-0.342183-0.309644
2021-01-03-0.6446351.1158520.7322770.075960
2021-01-04-0.313480-0.8867271.1402170.339868
2021-01-051.5917741.840986-0.125656-2.238874
2021-01-06-1.8757011.032019-0.492826-0.705127
\n", "
" ], "text/plain": [ " A B C D\n", "2021-01-01 -1.224629 -0.479631 -0.182438 0.062465\n", "2021-01-02 -2.204339 -0.750873 -0.342183 -0.309644\n", "2021-01-03 -0.644635 1.115852 0.732277 0.075960\n", "2021-01-04 -0.313480 -0.886727 1.140217 0.339868\n", "2021-01-05 1.591774 1.840986 -0.125656 -2.238874\n", "2021-01-06 -1.875701 1.032019 -0.492826 -0.705127" ] }, "execution_count": 143, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list(\"ABCD\"))\n", "\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Create a simple DataFrame:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_height = pd.DataFrame(\n", " { \n", " 'name': pd.Categorical([ \"Tom\", \"Marc\", \"Peter\"]),\n", " 'height': np.array([168, 193, 172],dtype='int32'),\n", " 'gender': 'male' })" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Creating a DataFrame by passing a dict of objects that can be converted to series-like." ] }, { "cell_type": "code", "execution_count": 144, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCDEF
01.02013-01-021.03testfoo
11.02013-01-021.03trainfoo
21.02013-01-021.03testfoo
31.02013-01-021.03trainfoo
\n", "
" ], "text/plain": [ " A B C D E F\n", "0 1.0 2013-01-02 1.0 3 test foo\n", "1 1.0 2013-01-02 1.0 3 train foo\n", "2 1.0 2013-01-02 1.0 3 test foo\n", "3 1.0 2013-01-02 1.0 3 train foo" ] }, "execution_count": 144, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df2 = pd.DataFrame(\n", " {\n", " \"A\": 1.0,\n", " \"B\": pd.Timestamp(\"20130102\"),\n", " \"C\": pd.Series(1, index=list(range(4)), dtype=\"float32\"),\n", " \"D\": np.array([3] * 4, dtype=\"int32\"),\n", " \"E\": pd.Categorical([\"test\", \"train\", \"test\", \"train\"]),\n", " \"F\": \"foo\",\n", " }\n", ")\n", "\n", "df2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The columns of the resulting DataFrame have different [dtypes](https://pandas.pydata.org/docs/user_guide/basics.html#basics-dtypes)." ] }, { "cell_type": "code", "execution_count": 145, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "A float64\n", "B datetime64[ns]\n", "C float32\n", "D int32\n", "E category\n", "F object\n", "dtype: object" ] }, "execution_count": 145, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df2.dtypes" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "If you’re using IPython, tab completion for column names (as well as public attributes) is automatically enabled. Here’s a subset of the attributes that will be completed:" ] }, { "cell_type": "code", "execution_count": 146, "metadata": {}, "outputs": [], "source": [ "# df2." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Viewing data\n", "\n", "Here is how to view the top and bottom rows of the frame:" ] }, { "cell_type": "code", "execution_count": 147, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCD
2021-01-01-1.224629-0.479631-0.1824380.062465
2021-01-02-2.204339-0.750873-0.342183-0.309644
2021-01-03-0.6446351.1158520.7322770.075960
2021-01-04-0.313480-0.8867271.1402170.339868
2021-01-051.5917741.840986-0.125656-2.238874
2021-01-06-1.8757011.032019-0.492826-0.705127
\n", "
" ], "text/plain": [ " A B C D\n", "2021-01-01 -1.224629 -0.479631 -0.182438 0.062465\n", "2021-01-02 -2.204339 -0.750873 -0.342183 -0.309644\n", "2021-01-03 -0.644635 1.115852 0.732277 0.075960\n", "2021-01-04 -0.313480 -0.886727 1.140217 0.339868\n", "2021-01-05 1.591774 1.840986 -0.125656 -2.238874\n", "2021-01-06 -1.875701 1.032019 -0.492826 -0.705127" ] }, "execution_count": 147, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 148, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCD
2021-01-01-1.224629-0.479631-0.1824380.062465
2021-01-02-2.204339-0.750873-0.342183-0.309644
2021-01-03-0.6446351.1158520.7322770.075960
2021-01-04-0.313480-0.8867271.1402170.339868
2021-01-051.5917741.840986-0.125656-2.238874
\n", "
" ], "text/plain": [ " A B C D\n", "2021-01-01 -1.224629 -0.479631 -0.182438 0.062465\n", "2021-01-02 -2.204339 -0.750873 -0.342183 -0.309644\n", "2021-01-03 -0.644635 1.115852 0.732277 0.075960\n", "2021-01-04 -0.313480 -0.886727 1.140217 0.339868\n", "2021-01-05 1.591774 1.840986 -0.125656 -2.238874" ] }, "execution_count": 148, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 149, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCD
2021-01-04-0.313480-0.8867271.1402170.339868
2021-01-051.5917741.840986-0.125656-2.238874
2021-01-06-1.8757011.032019-0.492826-0.705127
\n", "
" ], "text/plain": [ " A B C D\n", "2021-01-04 -0.313480 -0.886727 1.140217 0.339868\n", "2021-01-05 1.591774 1.840986 -0.125656 -2.238874\n", "2021-01-06 -1.875701 1.032019 -0.492826 -0.705127" ] }, "execution_count": 149, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.tail(3)" ] }, { "cell_type": "code", "execution_count": 150, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "DatetimeIndex: 6 entries, 2021-01-01 to 2021-01-06\n", "Freq: D\n", "Data columns (total 4 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 A 6 non-null float64\n", " 1 B 6 non-null float64\n", " 2 C 6 non-null float64\n", " 3 D 6 non-null float64\n", "dtypes: float64(4)\n", "memory usage: 240.0 bytes\n" ] } ], "source": [ "df.info()" ] }, { "cell_type": "code", "execution_count": 151, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DatetimeIndex(['2021-01-01', '2021-01-02', '2021-01-03', '2021-01-04',\n", " '2021-01-05', '2021-01-06'],\n", " dtype='datetime64[ns]', freq='D')" ] }, "execution_count": 151, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.index" ] }, { "cell_type": "code", "execution_count": 152, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['A', 'B', 'C', 'D'], dtype='object')" ] }, "execution_count": 152, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.columns" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "[describe()](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.describe.html#pandas.DataFrame.describe) shows a quick statistic summary of your data:" ] }, { "cell_type": "code", "execution_count": 153, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCD
count6.0000006.0000006.0000006.000000
mean-0.7785020.3119380.121565-0.462559
std1.3629571.1572150.6567640.943086
min-2.204339-0.886727-0.492826-2.238874
25%-1.712933-0.683063-0.302247-0.606256
50%-0.9346320.276194-0.154047-0.123589
75%-0.3962691.0948940.5177940.072587
max1.5917741.8409861.1402170.339868
\n", "
" ], "text/plain": [ " A B C D\n", "count 6.000000 6.000000 6.000000 6.000000\n", "mean -0.778502 0.311938 0.121565 -0.462559\n", "std 1.362957 1.157215 0.656764 0.943086\n", "min -2.204339 -0.886727 -0.492826 -2.238874\n", "25% -1.712933 -0.683063 -0.302247 -0.606256\n", "50% -0.934632 0.276194 -0.154047 -0.123589\n", "75% -0.396269 1.094894 0.517794 0.072587\n", "max 1.591774 1.840986 1.140217 0.339868" ] }, "execution_count": 153, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Transposing your data:" ] }, { "cell_type": "code", "execution_count": 154, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
2021-01-012021-01-022021-01-032021-01-042021-01-052021-01-06
A-1.224629-2.204339-0.644635-0.3134801.591774-1.875701
B-0.479631-0.7508731.115852-0.8867271.8409861.032019
C-0.182438-0.3421830.7322771.140217-0.125656-0.492826
D0.062465-0.3096440.0759600.339868-2.238874-0.705127
\n", "
" ], "text/plain": [ " 2021-01-01 2021-01-02 2021-01-03 2021-01-04 2021-01-05 2021-01-06\n", "A -1.224629 -2.204339 -0.644635 -0.313480 1.591774 -1.875701\n", "B -0.479631 -0.750873 1.115852 -0.886727 1.840986 1.032019\n", "C -0.182438 -0.342183 0.732277 1.140217 -0.125656 -0.492826\n", "D 0.062465 -0.309644 0.075960 0.339868 -2.238874 -0.705127" ] }, "execution_count": 154, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.T" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Sorting by an axis:" ] }, { "cell_type": "code", "execution_count": 155, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DCBA
2021-01-010.062465-0.182438-0.479631-1.224629
2021-01-02-0.309644-0.342183-0.750873-2.204339
2021-01-030.0759600.7322771.115852-0.644635
2021-01-040.3398681.140217-0.886727-0.313480
2021-01-05-2.238874-0.1256561.8409861.591774
2021-01-06-0.705127-0.4928261.032019-1.875701
\n", "
" ], "text/plain": [ " D C B A\n", "2021-01-01 0.062465 -0.182438 -0.479631 -1.224629\n", "2021-01-02 -0.309644 -0.342183 -0.750873 -2.204339\n", "2021-01-03 0.075960 0.732277 1.115852 -0.644635\n", "2021-01-04 0.339868 1.140217 -0.886727 -0.313480\n", "2021-01-05 -2.238874 -0.125656 1.840986 1.591774\n", "2021-01-06 -0.705127 -0.492826 1.032019 -1.875701" ] }, "execution_count": 155, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.sort_index(axis=1, ascending=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Sorting by values:" ] }, { "cell_type": "code", "execution_count": 156, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCD
2021-01-04-0.313480-0.8867271.1402170.339868
2021-01-02-2.204339-0.750873-0.342183-0.309644
2021-01-01-1.224629-0.479631-0.1824380.062465
2021-01-06-1.8757011.032019-0.492826-0.705127
2021-01-03-0.6446351.1158520.7322770.075960
2021-01-051.5917741.840986-0.125656-2.238874
\n", "
" ], "text/plain": [ " A B C D\n", "2021-01-04 -0.313480 -0.886727 1.140217 0.339868\n", "2021-01-02 -2.204339 -0.750873 -0.342183 -0.309644\n", "2021-01-01 -1.224629 -0.479631 -0.182438 0.062465\n", "2021-01-06 -1.875701 1.032019 -0.492826 -0.705127\n", "2021-01-03 -0.644635 1.115852 0.732277 0.075960\n", "2021-01-05 1.591774 1.840986 -0.125656 -2.238874" ] }, "execution_count": 156, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.sort_values(by=\"B\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Selection\n", "\n", "While standard Python / NumPy expressions for selecting and setting are intuitive and come in handy for interactive work, for production code, we recommend the optimized pandas data access methods, .at, .iat, .loc and .iloc.\n", "\n", "### Getting\n", "Selecting a single column, which yields a Series, equivalent to `df.A`:" ] }, { "cell_type": "code", "execution_count": 157, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2021-01-01 -1.224629\n", "2021-01-02 -2.204339\n", "2021-01-03 -0.644635\n", "2021-01-04 -0.313480\n", "2021-01-05 1.591774\n", "2021-01-06 -1.875701\n", "Freq: D, Name: A, dtype: float64" ] }, "execution_count": 157, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[\"A\"]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Selecting via [], which slices the rows.\n" ] }, { "cell_type": "code", "execution_count": 158, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCD
2021-01-01-1.224629-0.479631-0.1824380.062465
2021-01-02-2.204339-0.750873-0.342183-0.309644
2021-01-03-0.6446351.1158520.7322770.075960
\n", "
" ], "text/plain": [ " A B C D\n", "2021-01-01 -1.224629 -0.479631 -0.182438 0.062465\n", "2021-01-02 -2.204339 -0.750873 -0.342183 -0.309644\n", "2021-01-03 -0.644635 1.115852 0.732277 0.075960" ] }, "execution_count": 158, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[0:3]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [] }, { "cell_type": "code", "execution_count": 159, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCD
2021-01-01-1.224629-0.479631-0.1824380.062465
2021-01-02-2.204339-0.750873-0.342183-0.309644
2021-01-03-0.6446351.1158520.7322770.075960
\n", "
" ], "text/plain": [ " A B C D\n", "2021-01-01 -1.224629 -0.479631 -0.182438 0.062465\n", "2021-01-02 -2.204339 -0.750873 -0.342183 -0.309644\n", "2021-01-03 -0.644635 1.115852 0.732277 0.075960" ] }, "execution_count": 159, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[\"20210101\":\"20210103\"]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Selection by label\n", "\n", "\n", "\n", "The `.loc` attribute is the primary access method. The following are valid inputs:\n", "\n", "- A single label, e.g. 5 or 'a' (Note that 5 is interpreted as a label of the index. This use is not an integer position along the index.).\n", "\n", "- A list or array of labels ['a', 'b', 'c'].\n", "\n", "- A slice object with labels 'a':'f' (Note that contrary to usual Python slices, both the start and the stop are included, when present in the index! See Slicing with labels.\n", "\n", "- A boolean array.\n", "\n", "- A callable, see [Selection By Callable](https://pandas.pydata.org/docs/user_guide/indexing.html#indexing-callable).\n", "\n", "\n", "For getting a cross section using a label:" ] }, { "cell_type": "code", "execution_count": 160, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "A -1.224629\n", "B -0.479631\n", "C -0.182438\n", "D 0.062465\n", "Name: 2021-01-01 00:00:00, dtype: float64" ] }, "execution_count": 160, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.loc[dates[0]]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Selecting on a multi-axis by label:" ] }, { "cell_type": "code", "execution_count": 161, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AB
2021-01-01-1.224629-0.479631
2021-01-02-2.204339-0.750873
2021-01-03-0.6446351.115852
2021-01-04-0.313480-0.886727
2021-01-051.5917741.840986
2021-01-06-1.8757011.032019
\n", "
" ], "text/plain": [ " A B\n", "2021-01-01 -1.224629 -0.479631\n", "2021-01-02 -2.204339 -0.750873\n", "2021-01-03 -0.644635 1.115852\n", "2021-01-04 -0.313480 -0.886727\n", "2021-01-05 1.591774 1.840986\n", "2021-01-06 -1.875701 1.032019" ] }, "execution_count": 161, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.loc[ : , [\"A\", \"B\"]]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Showing label slicing, both endpoints are included:" ] }, { "cell_type": "code", "execution_count": 162, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AB
2021-01-02-2.204339-0.750873
2021-01-03-0.6446351.115852
2021-01-04-0.313480-0.886727
\n", "
" ], "text/plain": [ " A B\n", "2021-01-02 -2.204339 -0.750873\n", "2021-01-03 -0.644635 1.115852\n", "2021-01-04 -0.313480 -0.886727" ] }, "execution_count": 162, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.loc[\"20210102\":\"20210104\", [\"A\", \"B\"]]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Reduction in the dimensions of the returned object:" ] }, { "cell_type": "code", "execution_count": 163, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "A -2.204339\n", "B -0.750873\n", "Name: 2021-01-02 00:00:00, dtype: float64" ] }, "execution_count": 163, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.loc[\"20210102\", [\"A\", \"B\"]]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "For getting a scalar value:" ] }, { "cell_type": "code", "execution_count": 164, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "-1.2246288596124404" ] }, "execution_count": 164, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.loc[dates[0], \"A\"]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "For getting fast access to a scalar (equivalent to the prior method) use `.at`.\n", "Use at if you only need to get or set a single value in a DataFrame or Series." ] }, { "cell_type": "code", "execution_count": 165, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "-1.2246288596124404" ] }, "execution_count": 165, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.at[dates[0], \"A\"]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Set value at specified row/column pair" ] }, { "cell_type": "code", "execution_count": 166, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2021-01-01 1.000000\n", "2021-01-02 -2.204339\n", "2021-01-03 -0.644635\n", "2021-01-04 -0.313480\n", "2021-01-05 1.591774\n", "2021-01-06 -1.875701\n", "Freq: D, Name: A, dtype: float64" ] }, "execution_count": 166, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.at[dates[0], \"A\"] = 1\n", "\n", "df.A" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Selection by position\n", "\n", "\n", "pandas provides a suite of methods in order to get purely integer based indexing. \n", "\n", "The .iloc attribute is the primary access method. The following are valid inputs:\n", "\n", "- An integer e.g. 5.\n", "\n", "- A list or array of integers [4, 3, 0].\n", "\n", "- A slice object with ints 1:7.\n", "\n", "- A boolean array.\n", "\n", "- A callable, see [Selection By Callable](https://pandas.pydata.org/docs/user_guide/indexing.html#indexing-callable).\n", "\n", "See more in [Selection by Position](https://pandas.pydata.org/docs/user_guide/indexing.html#indexing-integer)\n", "\n", "\n", "Select via the position of the passed integers:" ] }, { "cell_type": "code", "execution_count": 167, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "A 1.000000\n", "B -0.479631\n", "C -0.182438\n", "D 0.062465\n", "Name: 2021-01-01 00:00:00, dtype: float64" ] }, "execution_count": 167, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.iloc[0]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "By integer slices, acting similar to NumPy/Python:" ] }, { "cell_type": "code", "execution_count": 168, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AB
2021-01-011.000000-0.479631
2021-01-02-2.204339-0.750873
2021-01-03-0.6446351.115852
\n", "
" ], "text/plain": [ " A B\n", "2021-01-01 1.000000 -0.479631\n", "2021-01-02 -2.204339 -0.750873\n", "2021-01-03 -0.644635 1.115852" ] }, "execution_count": 168, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.iloc[0:3, 0:2]" ] }, { "cell_type": "code", "execution_count": 169, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AB
2021-01-02-2.204339-0.750873
2021-01-03-0.6446351.115852
\n", "
" ], "text/plain": [ " A B\n", "2021-01-02 -2.204339 -0.750873\n", "2021-01-03 -0.644635 1.115852" ] }, "execution_count": 169, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.iloc[1:3, 0:2]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "By lists of integer position locations, similar to the NumPy/Python style:" ] }, { "cell_type": "code", "execution_count": 170, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AC
2021-01-02-2.204339-0.342183
2021-01-03-0.6446350.732277
2021-01-051.591774-0.125656
\n", "
" ], "text/plain": [ " A C\n", "2021-01-02 -2.204339 -0.342183\n", "2021-01-03 -0.644635 0.732277\n", "2021-01-05 1.591774 -0.125656" ] }, "execution_count": 170, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.iloc[[1, 2, 4], [0, 2]]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "For slicing rows explicitly:" ] }, { "cell_type": "code", "execution_count": 171, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCD
2021-01-02-2.204339-0.750873-0.342183-0.309644
2021-01-03-0.6446351.1158520.7322770.075960
\n", "
" ], "text/plain": [ " A B C D\n", "2021-01-02 -2.204339 -0.750873 -0.342183 -0.309644\n", "2021-01-03 -0.644635 1.115852 0.732277 0.075960" ] }, "execution_count": 171, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.iloc[1:3, :]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "For slicing columns explicitly:" ] }, { "cell_type": "code", "execution_count": 172, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
BC
2021-01-01-0.479631-0.182438
2021-01-02-0.750873-0.342183
2021-01-031.1158520.732277
2021-01-04-0.8867271.140217
2021-01-051.840986-0.125656
2021-01-061.032019-0.492826
\n", "
" ], "text/plain": [ " B C\n", "2021-01-01 -0.479631 -0.182438\n", "2021-01-02 -0.750873 -0.342183\n", "2021-01-03 1.115852 0.732277\n", "2021-01-04 -0.886727 1.140217\n", "2021-01-05 1.840986 -0.125656\n", "2021-01-06 1.032019 -0.492826" ] }, "execution_count": 172, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.iloc[:, 1:3]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "For getting a value explicitly:" ] }, { "cell_type": "code", "execution_count": 173, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1.0" ] }, "execution_count": 173, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.iloc[0, 0]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "For getting fast access to a scalar (equivalent to the prior method):" ] }, { "cell_type": "code", "execution_count": 174, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1.0" ] }, "execution_count": 174, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.iat[0, 0]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Boolean indexing\n", "\n", "Boolean indexing is used to filter subsets of data based on actual values inside the DataFrame. It is called [Boolean](https://en.wikipedia.org/wiki/Boolean_data_type) since the filter condition is associated with one of two possible values (usually denoted `True` and `False`)\n", "\n", "Using a single column’s values to select data." ] }, { "cell_type": "code", "execution_count": 175, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCD
2021-01-011.000000-0.479631-0.1824380.062465
2021-01-051.5917741.840986-0.125656-2.238874
\n", "
" ], "text/plain": [ " A B C D\n", "2021-01-01 1.000000 -0.479631 -0.182438 0.062465\n", "2021-01-05 1.591774 1.840986 -0.125656 -2.238874" ] }, "execution_count": 175, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df[\"A\"] > 0]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Selecting values from a DataFrame where a boolean condition is met." ] }, { "cell_type": "code", "execution_count": 176, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCD
2021-01-011.000000NaNNaN0.062465
2021-01-02NaNNaNNaNNaN
2021-01-03NaN1.1158520.7322770.075960
2021-01-04NaNNaN1.1402170.339868
2021-01-051.5917741.840986NaNNaN
2021-01-06NaN1.032019NaNNaN
\n", "
" ], "text/plain": [ " A B C D\n", "2021-01-01 1.000000 NaN NaN 0.062465\n", "2021-01-02 NaN NaN NaN NaN\n", "2021-01-03 NaN 1.115852 0.732277 0.075960\n", "2021-01-04 NaN NaN 1.140217 0.339868\n", "2021-01-05 1.591774 1.840986 NaN NaN\n", "2021-01-06 NaN 1.032019 NaN NaN" ] }, "execution_count": 176, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df > 0]\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's make a copy of the `df2` object’s indices and data. We use the default `deep=True` to create a new object with a copy of the calling object’s data and indices. Modifications to the data or indices of the copy will not be reflected in the original object. Note that a shallow copy (`deep=False`) shares data and index with original." ] }, { "cell_type": "code", "execution_count": 177, "metadata": {}, "outputs": [], "source": [ "df_copy = df.copy()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Create a new column in the DataFrame:" ] }, { "cell_type": "code", "execution_count": 178, "metadata": {}, "outputs": [], "source": [ "df_copy[\"E\"] = [\"one\", \"one\", \"two\", \"three\", \"four\", \"three\"]" ] }, { "cell_type": "code", "execution_count": 179, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCDE
2021-01-011.000000-0.479631-0.1824380.062465one
2021-01-02-2.204339-0.750873-0.342183-0.309644one
2021-01-03-0.6446351.1158520.7322770.075960two
2021-01-04-0.313480-0.8867271.1402170.339868three
2021-01-051.5917741.840986-0.125656-2.238874four
2021-01-06-1.8757011.032019-0.492826-0.705127three
\n", "
" ], "text/plain": [ " A B C D E\n", "2021-01-01 1.000000 -0.479631 -0.182438 0.062465 one\n", "2021-01-02 -2.204339 -0.750873 -0.342183 -0.309644 one\n", "2021-01-03 -0.644635 1.115852 0.732277 0.075960 two\n", "2021-01-04 -0.313480 -0.886727 1.140217 0.339868 three\n", "2021-01-05 1.591774 1.840986 -0.125656 -2.238874 four\n", "2021-01-06 -1.875701 1.032019 -0.492826 -0.705127 three" ] }, "execution_count": 179, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_copy" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Using the [isin()](https://pandas.pydata.org/docs/reference/api/pandas.Series.isin.html#pandas.Series.isin) method for filtering:" ] }, { "cell_type": "code", "execution_count": 180, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCDE
2021-01-03-0.6446351.1158520.7322770.075960two
2021-01-051.5917741.840986-0.125656-2.238874four
\n", "
" ], "text/plain": [ " A B C D E\n", "2021-01-03 -0.644635 1.115852 0.732277 0.075960 two\n", "2021-01-05 1.591774 1.840986 -0.125656 -2.238874 four" ] }, "execution_count": 180, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_copy[df_copy[\"E\"].isin([\"two\", \"four\"])]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Setting\n", "\n", "Setting a new column automatically aligns the data by the indexes." ] }, { "cell_type": "code", "execution_count": 181, "metadata": {}, "outputs": [], "source": [ "s1 = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range(\"20210102\", periods=6))" ] }, { "cell_type": "code", "execution_count": 182, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2021-01-02 1\n", "2021-01-03 2\n", "2021-01-04 3\n", "2021-01-05 4\n", "2021-01-06 5\n", "2021-01-07 6\n", "Freq: D, dtype: int64" ] }, "execution_count": 182, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s1" ] }, { "cell_type": "code", "execution_count": 183, "metadata": {}, "outputs": [], "source": [ "df[\"F\"] = s1" ] }, { "cell_type": "code", "execution_count": 184, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCDF
2021-01-011.000000-0.479631-0.1824380.062465NaN
2021-01-02-2.204339-0.750873-0.342183-0.3096441.0
2021-01-03-0.6446351.1158520.7322770.0759602.0
2021-01-04-0.313480-0.8867271.1402170.3398683.0
2021-01-051.5917741.840986-0.125656-2.2388744.0
2021-01-06-1.8757011.032019-0.492826-0.7051275.0
\n", "
" ], "text/plain": [ " A B C D F\n", "2021-01-01 1.000000 -0.479631 -0.182438 0.062465 NaN\n", "2021-01-02 -2.204339 -0.750873 -0.342183 -0.309644 1.0\n", "2021-01-03 -0.644635 1.115852 0.732277 0.075960 2.0\n", "2021-01-04 -0.313480 -0.886727 1.140217 0.339868 3.0\n", "2021-01-05 1.591774 1.840986 -0.125656 -2.238874 4.0\n", "2021-01-06 -1.875701 1.032019 -0.492826 -0.705127 5.0" ] }, "execution_count": 184, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Setting values by label:" ] }, { "cell_type": "code", "execution_count": 185, "metadata": {}, "outputs": [], "source": [ "df.at[dates[0], \"A\"] = 0" ] }, { "cell_type": "code", "execution_count": 186, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCDF
2021-01-010.000000-0.479631-0.1824380.062465NaN
2021-01-02-2.204339-0.750873-0.342183-0.3096441.0
2021-01-03-0.6446351.1158520.7322770.0759602.0
2021-01-04-0.313480-0.8867271.1402170.3398683.0
2021-01-051.5917741.840986-0.125656-2.2388744.0
2021-01-06-1.8757011.032019-0.492826-0.7051275.0
\n", "
" ], "text/plain": [ " A B C D F\n", "2021-01-01 0.000000 -0.479631 -0.182438 0.062465 NaN\n", "2021-01-02 -2.204339 -0.750873 -0.342183 -0.309644 1.0\n", "2021-01-03 -0.644635 1.115852 0.732277 0.075960 2.0\n", "2021-01-04 -0.313480 -0.886727 1.140217 0.339868 3.0\n", "2021-01-05 1.591774 1.840986 -0.125656 -2.238874 4.0\n", "2021-01-06 -1.875701 1.032019 -0.492826 -0.705127 5.0" ] }, "execution_count": 186, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Setting values by position:" ] }, { "cell_type": "code", "execution_count": 187, "metadata": {}, "outputs": [], "source": [ "df.iat[0, 1] = 0" ] }, { "cell_type": "code", "execution_count": 188, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCDF
2021-01-010.0000000.000000-0.1824380.062465NaN
2021-01-02-2.204339-0.750873-0.342183-0.3096441.0
2021-01-03-0.6446351.1158520.7322770.0759602.0
2021-01-04-0.313480-0.8867271.1402170.3398683.0
2021-01-051.5917741.840986-0.125656-2.2388744.0
2021-01-06-1.8757011.032019-0.492826-0.7051275.0
\n", "
" ], "text/plain": [ " A B C D F\n", "2021-01-01 0.000000 0.000000 -0.182438 0.062465 NaN\n", "2021-01-02 -2.204339 -0.750873 -0.342183 -0.309644 1.0\n", "2021-01-03 -0.644635 1.115852 0.732277 0.075960 2.0\n", "2021-01-04 -0.313480 -0.886727 1.140217 0.339868 3.0\n", "2021-01-05 1.591774 1.840986 -0.125656 -2.238874 4.0\n", "2021-01-06 -1.875701 1.032019 -0.492826 -0.705127 5.0" ] }, "execution_count": 188, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Setting by assigning with a NumPy array:" ] }, { "cell_type": "code", "execution_count": 189, "metadata": {}, "outputs": [], "source": [ "df.loc[:, \"D\"] = np.array([5] * len(df))" ] }, { "cell_type": "code", "execution_count": 190, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCDF
2021-01-010.0000000.000000-0.1824385NaN
2021-01-02-2.204339-0.750873-0.34218351.0
2021-01-03-0.6446351.1158520.73227752.0
2021-01-04-0.313480-0.8867271.14021753.0
2021-01-051.5917741.840986-0.12565654.0
2021-01-06-1.8757011.032019-0.49282655.0
\n", "
" ], "text/plain": [ " A B C D F\n", "2021-01-01 0.000000 0.000000 -0.182438 5 NaN\n", "2021-01-02 -2.204339 -0.750873 -0.342183 5 1.0\n", "2021-01-03 -0.644635 1.115852 0.732277 5 2.0\n", "2021-01-04 -0.313480 -0.886727 1.140217 5 3.0\n", "2021-01-05 1.591774 1.840986 -0.125656 5 4.0\n", "2021-01-06 -1.875701 1.032019 -0.492826 5 5.0" ] }, "execution_count": 190, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "A where operation with setting." ] }, { "cell_type": "code", "execution_count": 191, "metadata": {}, "outputs": [], "source": [ "df3 = df.copy()" ] }, { "cell_type": "code", "execution_count": 192, "metadata": {}, "outputs": [], "source": [ "df3[df3 > 0]= -df3" ] }, { "cell_type": "code", "execution_count": 193, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCDF
2021-01-010.0000000.000000-0.182438-5NaN
2021-01-02-2.204339-0.750873-0.342183-5-1.0
2021-01-03-0.644635-1.115852-0.732277-5-2.0
2021-01-04-0.313480-0.886727-1.140217-5-3.0
2021-01-05-1.591774-1.840986-0.125656-5-4.0
2021-01-06-1.875701-1.032019-0.492826-5-5.0
\n", "
" ], "text/plain": [ " A B C D F\n", "2021-01-01 0.000000 0.000000 -0.182438 -5 NaN\n", "2021-01-02 -2.204339 -0.750873 -0.342183 -5 -1.0\n", "2021-01-03 -0.644635 -1.115852 -0.732277 -5 -2.0\n", "2021-01-04 -0.313480 -0.886727 -1.140217 -5 -3.0\n", "2021-01-05 -1.591774 -1.840986 -0.125656 -5 -4.0\n", "2021-01-06 -1.875701 -1.032019 -0.492826 -5 -5.0" ] }, "execution_count": 193, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df3" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Missing data\n", "\n", "pandas primarily uses the value np.nan to represent missing data. It is by default not included in computations. See the [Missing Data section](https://pandas.pydata.org/docs/user_guide/missing_data.html#missing-data) for details.\n", "\n", "Reindexing allows you to change/add/delete the index on a specified axis. This returns a copy of the data." ] }, { "cell_type": "code", "execution_count": 194, "metadata": {}, "outputs": [], "source": [ "df4 = df.reindex(index=dates[0:4], columns=list(df.columns) + [\"E\"])" ] }, { "cell_type": "code", "execution_count": 195, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCDFE
2021-01-010.0000000.000000-0.1824385NaNNaN
2021-01-02-2.204339-0.750873-0.34218351.0NaN
2021-01-03-0.6446351.1158520.73227752.0NaN
2021-01-04-0.313480-0.8867271.14021753.0NaN
\n", "
" ], "text/plain": [ " A B C D F E\n", "2021-01-01 0.000000 0.000000 -0.182438 5 NaN NaN\n", "2021-01-02 -2.204339 -0.750873 -0.342183 5 1.0 NaN\n", "2021-01-03 -0.644635 1.115852 0.732277 5 2.0 NaN\n", "2021-01-04 -0.313480 -0.886727 1.140217 5 3.0 NaN" ] }, "execution_count": 195, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df4" ] }, { "cell_type": "code", "execution_count": 196, "metadata": {}, "outputs": [], "source": [ "df4.loc[dates[0] : dates[1], \"E\"] = 1" ] }, { "cell_type": "code", "execution_count": 197, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCDFE
2021-01-010.0000000.000000-0.1824385NaN1.0
2021-01-02-2.204339-0.750873-0.34218351.01.0
2021-01-03-0.6446351.1158520.73227752.0NaN
2021-01-04-0.313480-0.8867271.14021753.0NaN
\n", "
" ], "text/plain": [ " A B C D F E\n", "2021-01-01 0.000000 0.000000 -0.182438 5 NaN 1.0\n", "2021-01-02 -2.204339 -0.750873 -0.342183 5 1.0 1.0\n", "2021-01-03 -0.644635 1.115852 0.732277 5 2.0 NaN\n", "2021-01-04 -0.313480 -0.886727 1.140217 5 3.0 NaN" ] }, "execution_count": 197, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df4" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "To drop any rows that have missing data." ] }, { "cell_type": "code", "execution_count": 198, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCDFE
2021-01-02-2.204339-0.750873-0.34218351.01.0
\n", "
" ], "text/plain": [ " A B C D F E\n", "2021-01-02 -2.204339 -0.750873 -0.342183 5 1.0 1.0" ] }, "execution_count": 198, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df4.dropna(how=\"any\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Filling missing data." ] }, { "cell_type": "code", "execution_count": 199, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCDFE
2021-01-010.0000000.000000-0.18243855.01.0
2021-01-02-2.204339-0.750873-0.34218351.01.0
2021-01-03-0.6446351.1158520.73227752.05.0
2021-01-04-0.313480-0.8867271.14021753.05.0
\n", "
" ], "text/plain": [ " A B C D F E\n", "2021-01-01 0.000000 0.000000 -0.182438 5 5.0 1.0\n", "2021-01-02 -2.204339 -0.750873 -0.342183 5 1.0 1.0\n", "2021-01-03 -0.644635 1.115852 0.732277 5 2.0 5.0\n", "2021-01-04 -0.313480 -0.886727 1.140217 5 3.0 5.0" ] }, "execution_count": 199, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df4.fillna(value=5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "To get the boolean mask where values are nan." ] }, { "cell_type": "code", "execution_count": 200, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCDFE
2021-01-01FalseFalseFalseFalseTrueFalse
2021-01-02FalseFalseFalseFalseFalseFalse
2021-01-03FalseFalseFalseFalseFalseTrue
2021-01-04FalseFalseFalseFalseFalseTrue
\n", "
" ], "text/plain": [ " A B C D F E\n", "2021-01-01 False False False False True False\n", "2021-01-02 False False False False False False\n", "2021-01-03 False False False False False True\n", "2021-01-04 False False False False False True" ] }, "execution_count": 200, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.isna(df4)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Operations\n", "\n", "Note that operations in general exclude missing data.\n", "\n", "### Statistics\n", "\n", "Performing a descriptive statistic:" ] }, { "cell_type": "code", "execution_count": 201, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "A float64\n", "B float64\n", "C float64\n", "D int64\n", "F float64\n", "dtype: object" ] }, "execution_count": 201, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.dtypes" ] }, { "cell_type": "code", "execution_count": 202, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "A -0.574397\n", "B 0.391876\n", "C 0.121565\n", "D 5.000000\n", "F 3.000000\n", "dtype: float64" ] }, "execution_count": 202, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.mean()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Same operation on the other axis:" ] }, { "cell_type": "code", "execution_count": 203, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2021-01-01 1.204390\n", "2021-01-02 0.540521\n", "2021-01-03 1.640699\n", "2021-01-04 1.588002\n", "2021-01-05 2.461421\n", "2021-01-06 1.732698\n", "Freq: D, dtype: float64" ] }, "execution_count": 203, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.mean(1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Operating with objects that have different dimensionality and need alignment. In addition, pandas automatically broadcasts along the specified dimension.\n", "\n", "Let’s use the shift() function to shift the index axis by 2 periods in the positive direction (i.e., create 2021-01-05 and 2021-01-06):" ] }, { "cell_type": "code", "execution_count": 204, "metadata": {}, "outputs": [], "source": [ "s2 = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates).shift(2)" ] }, { "cell_type": "code", "execution_count": 205, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2021-01-01 NaN\n", "2021-01-02 NaN\n", "2021-01-03 1.0\n", "2021-01-04 3.0\n", "2021-01-05 5.0\n", "2021-01-06 NaN\n", "Freq: D, dtype: float64" ] }, "execution_count": 205, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can use the `sub()` function to subtract each element of a DataFrame with a corresponding element in a series. " ] }, { "cell_type": "code", "execution_count": 206, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCDF
2021-01-010.0000000.000000-0.1824385NaN
2021-01-02-2.204339-0.750873-0.34218351.0
2021-01-03-0.6446351.1158520.73227752.0
2021-01-04-0.313480-0.8867271.14021753.0
2021-01-051.5917741.840986-0.12565654.0
2021-01-06-1.8757011.032019-0.49282655.0
\n", "
" ], "text/plain": [ " A B C D F\n", "2021-01-01 0.000000 0.000000 -0.182438 5 NaN\n", "2021-01-02 -2.204339 -0.750873 -0.342183 5 1.0\n", "2021-01-03 -0.644635 1.115852 0.732277 5 2.0\n", "2021-01-04 -0.313480 -0.886727 1.140217 5 3.0\n", "2021-01-05 1.591774 1.840986 -0.125656 5 4.0\n", "2021-01-06 -1.875701 1.032019 -0.492826 5 5.0" ] }, "execution_count": 206, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 207, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCDF
2021-01-01NaNNaNNaNNaNNaN
2021-01-02NaNNaNNaNNaNNaN
2021-01-03-1.6446350.115852-0.2677234.01.0
2021-01-04-3.313480-3.886727-1.8597832.00.0
2021-01-05-3.408226-3.159014-5.1256560.0-1.0
2021-01-06NaNNaNNaNNaNNaN
\n", "
" ], "text/plain": [ " A B C D F\n", "2021-01-01 NaN NaN NaN NaN NaN\n", "2021-01-02 NaN NaN NaN NaN NaN\n", "2021-01-03 -1.644635 0.115852 -0.267723 4.0 1.0\n", "2021-01-04 -3.313480 -3.886727 -1.859783 2.0 0.0\n", "2021-01-05 -3.408226 -3.159014 -5.125656 0.0 -1.0\n", "2021-01-06 NaN NaN NaN NaN NaN" ] }, "execution_count": 207, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.sub(s2, axis=\"index\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Apply\n", "\n", "Applying functions to data:" ] }, { "cell_type": "code", "execution_count": 208, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCDF
2021-01-010.0000000.000000-0.1824385NaN
2021-01-02-2.204339-0.750873-0.524621101.0
2021-01-03-2.8489740.3649790.207656153.0
2021-01-04-3.162453-0.5217481.347873206.0
2021-01-05-1.5706801.3192371.2222172510.0
2021-01-06-3.4463812.3512560.7293913015.0
\n", "
" ], "text/plain": [ " A B C D F\n", "2021-01-01 0.000000 0.000000 -0.182438 5 NaN\n", "2021-01-02 -2.204339 -0.750873 -0.524621 10 1.0\n", "2021-01-03 -2.848974 0.364979 0.207656 15 3.0\n", "2021-01-04 -3.162453 -0.521748 1.347873 20 6.0\n", "2021-01-05 -1.570680 1.319237 1.222217 25 10.0\n", "2021-01-06 -3.446381 2.351256 0.729391 30 15.0" ] }, "execution_count": 208, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.apply(np.cumsum)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "A lambda function is a small anonymous function. It can take any number of arguments, but can only have one expression:" ] }, { "cell_type": "code", "execution_count": 209, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "A 3.796112\n", "B 2.727713\n", "C 1.633043\n", "D 0.000000\n", "F 4.000000\n", "dtype: float64" ] }, "execution_count": 209, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.apply(lambda x: x.max() - x.min())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Value counts (histogramming)" ] }, { "cell_type": "code", "execution_count": 210, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 3\n", "1 0\n", "2 1\n", "3 1\n", "4 2\n", "dtype: int64" ] }, "execution_count": 210, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s3 = pd.Series(np.random.randint(0, 4, size=5))\n", "\n", "s3" ] }, { "cell_type": "code", "execution_count": 211, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1 2\n", "3 1\n", "0 1\n", "2 1\n", "dtype: int64" ] }, "execution_count": 211, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s3.value_counts()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## String methods\n", "\n", "Series is equipped with a set of string processing methods in the str attribute that make it easy to operate on each element of the array, as in the code snippet below. Note that pattern-matching in str generally uses [regular expressions](https://docs.python.org/3/library/re.html) by default (and in some cases always uses them)." ] }, { "cell_type": "code", "execution_count": 212, "metadata": {}, "outputs": [], "source": [ "s4 = pd.Series([\"A\", \"B\", \"C\", \"Aaba\", \"Baca\", np.nan, \"CABA\", \"dog\", \"cat\"])\n" ] }, { "cell_type": "code", "execution_count": 213, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 A\n", "1 B\n", "2 C\n", "3 Aaba\n", "4 Baca\n", "5 NaN\n", "6 CABA\n", "7 dog\n", "8 cat\n", "dtype: object" ] }, "execution_count": 213, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s4" ] }, { "cell_type": "code", "execution_count": 214, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 a\n", "1 b\n", "2 c\n", "3 aaba\n", "4 baca\n", "5 NaN\n", "6 caba\n", "7 dog\n", "8 cat\n", "dtype: object" ] }, "execution_count": 214, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s4.str.lower()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Merge\n", "\n", "### Concat\n", "\n", "pandas provides various facilities for easily combining together Series and DataFrame objects with various kinds of set logic for the indexes and relational algebra functionality in the case of join / merge-type operations.\n", "\n", "Concatenating pandas objects together with [concat()](https://pandas.pydata.org/docs/reference/api/pandas.concat.html#pandas.concat):" ] }, { "cell_type": "code", "execution_count": 215, "metadata": {}, "outputs": [], "source": [ "df5 = pd.DataFrame(np.random.randn(10, 4))" ] }, { "cell_type": "code", "execution_count": 216, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123
0-0.331507-0.7128940.8641982.402081
1-1.231050-1.151274-0.612431-1.093598
2-0.3472782.007451-0.9505861.501542
30.0234531.495769-1.5524270.318594
41.0975363.403314-0.649650-0.076354
5-0.4630550.119990-0.0163591.303239
6-0.204622-0.2359220.7661440.996975
7-1.4974160.790123-0.0176150.466254
8-1.3631411.701461-1.006705-0.354086
9-0.3302990.7210741.240544-0.531562
\n", "
" ], "text/plain": [ " 0 1 2 3\n", "0 -0.331507 -0.712894 0.864198 2.402081\n", "1 -1.231050 -1.151274 -0.612431 -1.093598\n", "2 -0.347278 2.007451 -0.950586 1.501542\n", "3 0.023453 1.495769 -1.552427 0.318594\n", "4 1.097536 3.403314 -0.649650 -0.076354\n", "5 -0.463055 0.119990 -0.016359 1.303239\n", "6 -0.204622 -0.235922 0.766144 0.996975\n", "7 -1.497416 0.790123 -0.017615 0.466254\n", "8 -1.363141 1.701461 -1.006705 -0.354086\n", "9 -0.330299 0.721074 1.240544 -0.531562" ] }, "execution_count": 216, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df5" ] }, { "cell_type": "code", "execution_count": 217, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[ 0 1 2 3\n", " 0 -0.331507 -0.712894 0.864198 2.402081\n", " 1 -1.231050 -1.151274 -0.612431 -1.093598\n", " 2 -0.347278 2.007451 -0.950586 1.501542,\n", " 0 1 2 3\n", " 3 0.023453 1.495769 -1.552427 0.318594\n", " 4 1.097536 3.403314 -0.649650 -0.076354\n", " 5 -0.463055 0.119990 -0.016359 1.303239\n", " 6 -0.204622 -0.235922 0.766144 0.996975,\n", " 0 1 2 3\n", " 7 -1.497416 0.790123 -0.017615 0.466254\n", " 8 -1.363141 1.701461 -1.006705 -0.354086\n", " 9 -0.330299 0.721074 1.240544 -0.531562]" ] }, "execution_count": 217, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# break it into pieces\n", "pieces = [df5[:3], df5[3:7], df5[7:]]\n", "\n", "pieces" ] }, { "cell_type": "code", "execution_count": 218, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123
0-0.331507-0.7128940.8641982.402081
1-1.231050-1.151274-0.612431-1.093598
2-0.3472782.007451-0.9505861.501542
30.0234531.495769-1.5524270.318594
41.0975363.403314-0.649650-0.076354
5-0.4630550.119990-0.0163591.303239
6-0.204622-0.2359220.7661440.996975
7-1.4974160.790123-0.0176150.466254
8-1.3631411.701461-1.006705-0.354086
9-0.3302990.7210741.240544-0.531562
\n", "
" ], "text/plain": [ " 0 1 2 3\n", "0 -0.331507 -0.712894 0.864198 2.402081\n", "1 -1.231050 -1.151274 -0.612431 -1.093598\n", "2 -0.347278 2.007451 -0.950586 1.501542\n", "3 0.023453 1.495769 -1.552427 0.318594\n", "4 1.097536 3.403314 -0.649650 -0.076354\n", "5 -0.463055 0.119990 -0.016359 1.303239\n", "6 -0.204622 -0.235922 0.766144 0.996975\n", "7 -1.497416 0.790123 -0.017615 0.466254\n", "8 -1.363141 1.701461 -1.006705 -0.354086\n", "9 -0.330299 0.721074 1.240544 -0.531562" ] }, "execution_count": 218, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.concat(pieces)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Join" ] }, { "cell_type": "code", "execution_count": 219, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " key lval\n", "0 foo 1\n", "1 foo 2\n", "---------------\n", " key rval\n", "0 foo 4\n", "1 foo 5\n" ] } ], "source": [ "left = pd.DataFrame({\"key\": [\"foo\", \"foo\"], \"lval\": [1, 2]})\n", "\n", "right = pd.DataFrame({\"key\": [\"foo\", \"foo\"], \"rval\": [4, 5]})\n", "\n", "print(left)\n", "print(\"-\" * 15)\n", "print(right)" ] }, { "cell_type": "code", "execution_count": 220, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
keylvalrval
0foo14
1foo15
2foo24
3foo25
\n", "
" ], "text/plain": [ " key lval rval\n", "0 foo 1 4\n", "1 foo 1 5\n", "2 foo 2 4\n", "3 foo 2 5" ] }, "execution_count": 220, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.merge(left, right, on=\"key\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Another example that can be given is:" ] }, { "cell_type": "code", "execution_count": 221, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " key lval\n", "0 foo 1\n", "1 bar 2\n", "---------------\n", " key rval\n", "0 foo 4\n", "1 bar 5\n" ] } ], "source": [ "left = pd.DataFrame({\"key\": [\"foo\", \"bar\"], \"lval\": [1, 2]})\n", "\n", "right = pd.DataFrame({\"key\": [\"foo\", \"bar\"], \"rval\": [4, 5]})\n", "\n", "print(left)\n", "print(\"-\" * 15)\n", "print(right)" ] }, { "cell_type": "code", "execution_count": 222, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
keylvalrval
0foo14
1bar25
\n", "
" ], "text/plain": [ " key lval rval\n", "0 foo 1 4\n", "1 bar 2 5" ] }, "execution_count": 222, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.merge(left, right, on=\"key\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Grouping\n", "\n", "By “group by” we are referring to a process involving one or more of the following steps:\n", "\n", "- **Splitting** the data into groups based on some criteria\n", "\n", "- **Applying** a function to each group independently\n", "\n", "- **Combining** the results into a data structure" ] }, { "cell_type": "code", "execution_count": 223, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCD
0fooone1.1920880.689021
1barone0.5940930.040047
2footwo-0.8470460.329965
3barthree-0.568368-1.509250
4footwo1.548249-0.667777
5bartwo0.2514810.011822
6fooone-0.1476681.181461
7foothree0.8380161.066788
\n", "
" ], "text/plain": [ " A B C D\n", "0 foo one 1.192088 0.689021\n", "1 bar one 0.594093 0.040047\n", "2 foo two -0.847046 0.329965\n", "3 bar three -0.568368 -1.509250\n", "4 foo two 1.548249 -0.667777\n", "5 bar two 0.251481 0.011822\n", "6 foo one -0.147668 1.181461\n", "7 foo three 0.838016 1.066788" ] }, "execution_count": 223, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df6 = pd.DataFrame(\n", " {\n", " \"A\": [\"foo\", \"bar\", \"foo\", \"bar\", \"foo\", \"bar\", \"foo\", \"foo\"],\n", " \"B\": [\"one\", \"one\", \"two\", \"three\", \"two\", \"two\", \"one\", \"three\"],\n", " \"C\": np.random.randn(8),\n", " \"D\": np.random.randn(8),\n", " }\n", ")\n", "\n", "df6" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Grouping and then applying the sum() function to the resulting groups." ] }, { "cell_type": "code", "execution_count": 224, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CD
A
bar0.277206-1.457381
foo2.5836402.599458
\n", "
" ], "text/plain": [ " C D\n", "A \n", "bar 0.277206 -1.457381\n", "foo 2.583640 2.599458" ] }, "execution_count": 224, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df6.groupby(\"A\").sum()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Grouping by multiple columns forms a hierarchical index, and again we can apply the sum() function." ] }, { "cell_type": "code", "execution_count": 225, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CD
AB
barone0.5940930.040047
three-0.568368-1.509250
two0.2514810.011822
fooone1.0444201.870481
three0.8380161.066788
two0.701203-0.337812
\n", "
" ], "text/plain": [ " C D\n", "A B \n", "bar one 0.594093 0.040047\n", " three -0.568368 -1.509250\n", " two 0.251481 0.011822\n", "foo one 1.044420 1.870481\n", " three 0.838016 1.066788\n", " two 0.701203 -0.337812" ] }, "execution_count": 225, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df6.groupby([\"A\", \"B\"]).sum()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Reshaping\n", "\n", "### Stack" ] }, { "cell_type": "code", "execution_count": 226, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('bar', 'one'),\n", " ('bar', 'two'),\n", " ('baz', 'one'),\n", " ('baz', 'two'),\n", " ('foo', 'one'),\n", " ('foo', 'two'),\n", " ('qux', 'one'),\n", " ('qux', 'two')]" ] }, "execution_count": 226, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tuples = list(\n", " zip(\n", " *[\n", " [\"bar\", \"bar\", \"baz\", \"baz\", \"foo\", \"foo\", \"qux\", \"qux\"],\n", " [\"one\", \"two\", \"one\", \"two\", \"one\", \"two\", \"one\", \"two\"],\n", " ]\n", " )\n", ")\n", "\n", "tuples" ] }, { "cell_type": "code", "execution_count": 227, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "MultiIndex([('bar', 'one'),\n", " ('bar', 'two'),\n", " ('baz', 'one'),\n", " ('baz', 'two'),\n", " ('foo', 'one'),\n", " ('foo', 'two'),\n", " ('qux', 'one'),\n", " ('qux', 'two')],\n", " names=['first', 'second'])" ] }, "execution_count": 227, "metadata": {}, "output_type": "execute_result" } ], "source": [ "index = pd.MultiIndex.from_tuples(tuples, names=[\"first\", \"second\"])\n", "\n", "index" ] } ], "metadata": { "interpreter": { "hash": "463226f144cc21b006ce6927bfc93dd00694e52c8bc6857abb6e555b983749e9" }, "kernelspec": { "display_name": "Python 3.8.2 64-bit ('base': conda)", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.2" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }