# 处理缺失值

import pandas as pd
import numpy as np
# Build a 4x3 frame of random floats; rows are labelled a-d and the
# columns are named first/second/third.
df1 = pd.DataFrame(
    data=np.random.rand(4, 3),
    index=list('abcd'),
    columns=['first', 'second', 'third'],
)
df1
first second third
a 0.913249 0.970337 0.471116
b 0.710308 0.178931 0.797251
c 0.053555 0.138023 0.984439
d 0.887249 0.602075 0.364529
# Build a smaller 3x2 frame of random floats: it has no row 'd' and no
# column 'third', so it only partially overlaps df1's labels.
df2 = pd.DataFrame(
    data=np.random.rand(3, 2),
    index=list('abc'),
    columns=['first', 'second'],
)
df2
first second
a 0.250653 0.421736
b 0.574175 0.030342
c 0.812355 0.096477
# Arithmetic between two DataFrames aligns on row and column labels;
# any position without a counterpart in both frames becomes NaN.
df = df1.sub(df2)
df
first second third
a 0.662596 0.548601 NaN
b 0.136133 0.148589 NaN
c -0.758799 0.041547 NaN
d NaN NaN NaN
# Element-wise mask: True wherever the value is NaN.
df.isna()
first second third
a False False True
b False False True
c False False True
d True True True
# Drop columns in place: axis='columns' (same as axis=1) targets columns,
# and thresh=2 keeps only columns with at least 2 non-NaN values.
df.dropna(axis='columns', thresh=2, inplace=True)
df
first second
a 0.662596 0.548601
b 0.136133 0.148589
c -0.758799 0.041547
d NaN NaN
# Replace every missing value with 0. The result is a new DataFrame;
# df itself is not modified because inplace is not requested.
df.fillna(value=0)
first second
a 0.662596 0.548601
b 0.136133 0.148589
c -0.758799 0.041547
d 0.000000 0.000000
# Display df again: it still contains NaN because fillna(0) returned a
# new frame instead of modifying df.
df
first second
a 0.662596 0.548601
b 0.136133 0.148589
c -0.758799 0.041547
d NaN NaN
# Forward fill: each NaN takes the last valid value above it in the same
# column. DataFrame.ffill() is used because the `method` keyword of
# fillna() is deprecated since pandas 2.1. Returns a new frame; df is
# left unchanged.
df.ffill()
first second
a 0.662596 0.548601
b 0.136133 0.148589
c -0.758799 0.041547
d -0.758799 0.041547