# 处理缺失值
import pandas as pd
import numpy as np
# Build a 4x3 DataFrame of random values in [0, 1) with labelled rows and columns.
df1 = pd.DataFrame(
    data=np.random.rand(4, 3),
    index=list('abcd'),
    columns=['first', 'second', 'third'],
)
df1
| | first | second | third |
---|---|---|---|
a | 0.913249 | 0.970337 | 0.471116 |
b | 0.710308 | 0.178931 | 0.797251 |
c | 0.053555 | 0.138023 | 0.984439 |
d | 0.887249 | 0.602075 | 0.364529 |
# Build a smaller 3x2 DataFrame of random values; it has no row 'd' and no column 'third'.
df2 = pd.DataFrame(
    data=np.random.rand(3, 2),
    index=list('abc'),
    columns=['first', 'second'],
)
df2
| | first | second |
---|---|---|
a | 0.250653 | 0.421736 |
b | 0.574175 | 0.030342 |
c | 0.812355 | 0.096477 |
# Arithmetic between two DataFrames aligns on row/column labels;
# any position without a counterpart in the other frame becomes NaN.
df = df1.sub(df2)
df
| | first | second | third |
---|---|---|---|
a | 0.662596 | 0.548601 | NaN |
b | 0.136133 | 0.148589 | NaN |
c | -0.758799 | 0.041547 | NaN |
d | NaN | NaN | NaN |
# Boolean mask of the same shape as df: True wherever an element is NaN.
df.isna()
| | first | second | third |
---|---|---|---|
a | False | False | True |
b | False | False | True |
c | False | False | True |
d | True | True | True |
# axis='columns' makes dropna work column-wise; thresh=2 drops every column
# that has fewer than 2 non-NaN values. inplace=True mutates df directly.
df.dropna(axis='columns', thresh=2, inplace=True)
df
| | first | second |
---|---|---|
a | 0.662596 | 0.548601 |
b | 0.136133 | 0.148589 |
c | -0.758799 | 0.041547 |
d | NaN | NaN |
# Replace every missing value with 0; this returns a new DataFrame and leaves df unchanged.
df.fillna(value=0)
| | first | second |
---|---|---|
a | 0.662596 | 0.548601 |
b | 0.136133 | 0.148589 |
c | -0.758799 | 0.041547 |
d | 0.000000 | 0.000000 |
# df itself still contains NaN: the earlier fillna(0) call was not in-place.
df
| | first | second |
---|---|---|
a | 0.662596 | 0.548601 |
b | 0.136133 | 0.148589 |
c | -0.758799 | 0.041547 |
d | NaN | NaN |
# Forward fill: propagate the last valid value down each column into the NaN cells.
# DataFrame.ffill() is used instead of fillna(method='ffill') because the `method`
# argument of fillna is deprecated since pandas 2.1 and emits a FutureWarning.
df.ffill()
| | first | second |
---|---|---|
a | 0.662596 | 0.548601 |
b | 0.136133 | 0.148589 |
c | -0.758799 | 0.041547 |
d | -0.758799 | 0.041547 |