{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# 处理缺失值" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
firstsecondthird
a0.4803900.4296090.836108
b0.0620580.7717860.329276
c0.0336090.9355400.969110
d0.4088930.0932990.885355
\n", "
" ], "text/plain": [ " first second third\n", "a 0.480390 0.429609 0.836108\n", "b 0.062058 0.771786 0.329276\n", "c 0.033609 0.935540 0.969110\n", "d 0.408893 0.093299 0.885355" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Build a 4x3 frame of random floats with labelled rows and columns\n", "df1 = pd.DataFrame(\n", "    data=np.random.rand(4, 3),\n", "    index=list('abcd'),\n", "    columns=['first', 'second', 'third'],\n", ")\n", "df1" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
firstsecond
a0.6179280.839813
b0.0334860.545596
c0.0491610.763841
\n", "
" ], "text/plain": [ " first second\n", "a 0.617928 0.839813\n", "b 0.033486 0.545596\n", "c 0.049161 0.763841" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Build a smaller 3x2 frame whose labels are a subset of df1's rows and columns\n", "df2 = pd.DataFrame(\n", "    data=np.random.rand(3, 2),\n", "    index=list('abc'),\n", "    columns=['first', 'second'],\n", ")\n", "df2" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
firstsecondthird
a-0.137538-0.410204NaN
b0.0285720.226190NaN
c-0.0155520.171699NaN
dNaNNaNNaN
\n", "
" ], "text/plain": [ " first second third\n", "a -0.137538 -0.410204 NaN\n", "b 0.028572 0.226190 NaN\n", "c -0.015552 0.171699 NaN\n", "d NaN NaN NaN" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Subtracting DataFrames aligns on row and column labels;\n", "# any label missing from either operand yields NaN in the result\n", "df = df1.sub(df2)\n", "df" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
firstsecondthird
aFalseFalseTrue
bFalseFalseTrue
cFalseFalseTrue
dTrueTrueTrue
\n", "
" ], "text/plain": [ " first second third\n", "a False False True\n", "b False False True\n", "c False False True\n", "d True True True" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Element-wise mask: True wherever the entry is NaN\n", "df.isna()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
firstsecond
a-0.137538-0.410204
b0.0285720.226190
c-0.0155520.171699
dNaNNaN
\n", "
" ], "text/plain": [ " first second\n", "a -0.137538 -0.410204\n", "b 0.028572 0.226190\n", "c -0.015552 0.171699\n", "d NaN NaN" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# axis=1 operates on columns; thresh=2 keeps only columns holding at least 2 non-NaN values\n", "# Rebind the result instead of using inplace=True so the step stays explicit and chainable\n", "df = df.dropna(axis=1, thresh=2)\n", "df" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
firstsecond
a-0.137538-0.410204
b0.0285720.226190
c-0.0155520.171699
d0.0000000.000000
\n", "
" ], "text/plain": [ " first second\n", "a -0.137538 -0.410204\n", "b 0.028572 0.226190\n", "c -0.015552 0.171699\n", "d 0.000000 0.000000" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Replace missing values with 0; fillna returns a new frame and leaves df untouched\n", "df.fillna(value=0)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
firstsecond
a-0.137538-0.410204
b0.0285720.226190
c-0.0155520.171699
dNaNNaN
\n", "
" ], "text/plain": [ " first second\n", "a -0.137538 -0.410204\n", "b 0.028572 0.226190\n", "c -0.015552 0.171699\n", "d NaN NaN" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
firstsecond
a-0.137538-0.410204
b0.0285720.226190
c-0.0155520.171699
d-0.0155520.171699
\n", "
" ], "text/plain": [ " first second\n", "a -0.137538 -0.410204\n", "b 0.028572 0.226190\n", "c -0.015552 0.171699\n", "d -0.015552 0.171699" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Forward fill: copy the last valid value above each NaN down the column\n", "# (fillna(method='ffill') is deprecated since pandas 2.1; DataFrame.ffill() is the equivalent)\n", "df.ffill()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.3" } }, "nbformat": 4, "nbformat_minor": 4 }