{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 处理缺失值"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" first | \n",
" second | \n",
" third | \n",
"
\n",
" \n",
" \n",
" \n",
" a | \n",
" 0.480390 | \n",
" 0.429609 | \n",
" 0.836108 | \n",
"
\n",
" \n",
" b | \n",
" 0.062058 | \n",
" 0.771786 | \n",
" 0.329276 | \n",
"
\n",
" \n",
" c | \n",
" 0.033609 | \n",
" 0.935540 | \n",
" 0.969110 | \n",
"
\n",
" \n",
" d | \n",
" 0.408893 | \n",
" 0.093299 | \n",
" 0.885355 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" first second third\n",
"a 0.480390 0.429609 0.836108\n",
"b 0.062058 0.771786 0.329276\n",
"c 0.033609 0.935540 0.969110\n",
"d 0.408893 0.093299 0.885355"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Seed NumPy's legacy global RNG so the random frames built in this notebook\n",
"# are identical on every Restart and Run All.\n",
"np.random.seed(42)\n",
"\n",
"# 4x3 frame of uniform [0, 1) samples with labelled rows and columns.\n",
"df1 = pd.DataFrame(\n",
"    np.random.rand(4, 3),\n",
"    index=['a', 'b', 'c', 'd'],\n",
"    columns=['first', 'second', 'third'],\n",
")\n",
"df1"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" first | \n",
" second | \n",
"
\n",
" \n",
" \n",
" \n",
" a | \n",
" 0.617928 | \n",
" 0.839813 | \n",
"
\n",
" \n",
" b | \n",
" 0.033486 | \n",
" 0.545596 | \n",
"
\n",
" \n",
" c | \n",
" 0.049161 | \n",
" 0.763841 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" first second\n",
"a 0.617928 0.839813\n",
"b 0.033486 0.545596\n",
"c 0.049161 0.763841"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Smaller 3x2 frame: it has no row 'd' and no column 'third', so it only\n",
"# partially overlaps df1's labels.\n",
"df2 = pd.DataFrame(\n",
"    np.random.rand(3, 2),\n",
"    index=list('abc'),\n",
"    columns=['first', 'second'],\n",
")\n",
"df2"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" first | \n",
" second | \n",
" third | \n",
"
\n",
" \n",
" \n",
" \n",
" a | \n",
" -0.137538 | \n",
" -0.410204 | \n",
" NaN | \n",
"
\n",
" \n",
" b | \n",
" 0.028572 | \n",
" 0.226190 | \n",
" NaN | \n",
"
\n",
" \n",
" c | \n",
" -0.015552 | \n",
" 0.171699 | \n",
" NaN | \n",
"
\n",
" \n",
" d | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" first second third\n",
"a -0.137538 -0.410204 NaN\n",
"b 0.028572 0.226190 NaN\n",
"c -0.015552 0.171699 NaN\n",
"d NaN NaN NaN"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Arithmetic between two DataFrames aligns on row and column labels;\n",
"# any label that exists in only one operand produces NaN in the result.\n",
"df = df1.sub(df2)\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" first | \n",
" second | \n",
" third | \n",
"
\n",
" \n",
" \n",
" \n",
" a | \n",
" False | \n",
" False | \n",
" True | \n",
"
\n",
" \n",
" b | \n",
" False | \n",
" False | \n",
" True | \n",
"
\n",
" \n",
" c | \n",
" False | \n",
" False | \n",
" True | \n",
"
\n",
" \n",
" d | \n",
" True | \n",
" True | \n",
" True | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" first second third\n",
"a False False True\n",
"b False False True\n",
"c False False True\n",
"d True True True"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Boolean mask with the same shape as df: True wherever the element is NaN.\n",
"df.isna()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" first | \n",
" second | \n",
"
\n",
" \n",
" \n",
" \n",
" a | \n",
" -0.137538 | \n",
" -0.410204 | \n",
"
\n",
" \n",
" b | \n",
" 0.028572 | \n",
" 0.226190 | \n",
"
\n",
" \n",
" c | \n",
" -0.015552 | \n",
" 0.171699 | \n",
"
\n",
" \n",
" d | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" first second\n",
"a -0.137538 -0.410204\n",
"b 0.028572 0.226190\n",
"c -0.015552 0.171699\n",
"d NaN NaN"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# axis='columns' makes dropna work column-wise; thresh=2 keeps only columns\n",
"# with at least 2 non-NaN values, so columns with fewer are dropped.\n",
"# Rebinding df instead of using inplace=True avoids mutating the frame that\n",
"# earlier cells displayed.\n",
"df = df.dropna(axis='columns', thresh=2)\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" first | \n",
" second | \n",
"
\n",
" \n",
" \n",
" \n",
" a | \n",
" -0.137538 | \n",
" -0.410204 | \n",
"
\n",
" \n",
" b | \n",
" 0.028572 | \n",
" 0.226190 | \n",
"
\n",
" \n",
" c | \n",
" -0.015552 | \n",
" 0.171699 | \n",
"
\n",
" \n",
" d | \n",
" 0.000000 | \n",
" 0.000000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" first second\n",
"a -0.137538 -0.410204\n",
"b 0.028572 0.226190\n",
"c -0.015552 0.171699\n",
"d 0.000000 0.000000"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Replace every missing value with 0. fillna returns a new DataFrame here,\n",
"# so df itself is left untouched.\n",
"df.fillna(value=0)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" first | \n",
" second | \n",
"
\n",
" \n",
" \n",
" \n",
" a | \n",
" -0.137538 | \n",
" -0.410204 | \n",
"
\n",
" \n",
" b | \n",
" 0.028572 | \n",
" 0.226190 | \n",
"
\n",
" \n",
" c | \n",
" -0.015552 | \n",
" 0.171699 | \n",
"
\n",
" \n",
" d | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" first second\n",
"a -0.137538 -0.410204\n",
"b 0.028572 0.226190\n",
"c -0.015552 0.171699\n",
"d NaN NaN"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# df still contains NaN: the fillna call in the previous cell returned a new\n",
"# DataFrame and its result was not assigned back to df.\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" first | \n",
" second | \n",
"
\n",
" \n",
" \n",
" \n",
" a | \n",
" -0.137538 | \n",
" -0.410204 | \n",
"
\n",
" \n",
" b | \n",
" 0.028572 | \n",
" 0.226190 | \n",
"
\n",
" \n",
" c | \n",
" -0.015552 | \n",
" 0.171699 | \n",
"
\n",
" \n",
" d | \n",
" -0.015552 | \n",
" 0.171699 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" first second\n",
"a -0.137538 -0.410204\n",
"b 0.028572 0.226190\n",
"c -0.015552 0.171699\n",
"d -0.015552 0.171699"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Forward fill: each NaN takes the last valid value above it in the same column.\n",
"# DataFrame.ffill() replaces fillna(method='ffill'), whose method keyword is\n",
"# deprecated since pandas 2.1.\n",
"df.ffill()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}