# 处理缺失值

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Build a 4x3 frame of uniform random floats in [0, 1),
# with rows labelled 'a'..'d' and three named columns.
df1 = pd.DataFrame(
    data=np.random.rand(4, 3),
    index=list('abcd'),
    columns=['first', 'second', 'third'],
)
df1

Unnamed: 0,first,second,third
a,0.48039,0.429609,0.836108
b,0.062058,0.771786,0.329276
c,0.033609,0.93554,0.96911
d,0.408893,0.093299,0.885355


In [3]:
# Build a smaller 3x2 frame of uniform random floats in [0, 1):
# it has no row 'd' and no column 'third'.
df2 = pd.DataFrame(
    data=np.random.rand(3, 2),
    index=list('abc'),
    columns=['first', 'second'],
)
df2

Unnamed: 0,first,second
a,0.617928,0.839813
b,0.033486,0.545596
c,0.049161,0.763841


In [4]:
# Arithmetic between two DataFrames aligns on both row and column labels;
# any position present in only one operand becomes NaN in the result.
df = df1.sub(df2)
df

Unnamed: 0,first,second,third
a,-0.137538,-0.410204,
b,0.028572,0.22619,
c,-0.015552,0.171699,
d,,,


In [5]:
# Boolean mask of the same shape as df: True where the element is NaN.
df.isna()

Unnamed: 0,first,second,third
a,False,False,True
b,False,False,True
c,False,False,True
d,True,True,True


In [6]:
# axis=1 makes dropna operate on columns; thresh=2 keeps only the columns
# that hold at least two non-NaN values, so the all-NaN 'third' column is dropped.
# The result is reassigned rather than using inplace=True: inplace mutation
# gives no performance benefit and prevents method chaining.
df = df.dropna(axis=1, thresh=2)
df

Unnamed: 0,first,second
a,-0.137538,-0.410204
b,0.028572,0.22619
c,-0.015552,0.171699
d,,


In [7]:
# Replace every missing value with 0. The result is only displayed,
# not assigned back, so `df` is not rebound by this cell.
df.fillna(value=0)

Unnamed: 0,first,second
a,-0.137538,-0.410204
b,0.028572,0.22619
c,-0.015552,0.171699
d,0.0,0.0


In [8]:
# Display df again: the preceding fillna(0) call was not assigned back,
# so row 'd' is still expected to show NaN here.
df

Unnamed: 0,first,second
a,-0.137538,-0.410204
b,0.028572,0.22619
c,-0.015552,0.171699
d,,


In [9]:
# Forward fill: propagate the last valid value down each column into the
# NaN cells that follow it (row 'd' takes its values from row 'c').
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method`
# keyword is deprecated since pandas 2.1 and emits a FutureWarning.
df.ffill()

Unnamed: 0,first,second
a,-0.137538,-0.410204
b,0.028572,0.22619
c,-0.015552,0.171699
d,-0.015552,0.171699
