统计信息

import pandas as pd
import numpy as np

通过 describe 查看总体信息

df = pd.read_csv('sample.csv', index_col=0)
df
duration playtime up favorite comment share
a 8 66 30 34 8 40
b 59 84 47 31 50 91
c 31 18 78 64 28 57
d 19 47 20 8 12 96
e 99 64 57 19 33 63
f 73 82 48 44 57 4
g 25 65 98 9 83 97
h 96 31 66 13 13 29
df.describe()
duration playtime up favorite comment share
count 8.000000 8.000000 8.000000 8.000000 8.000000 8.000000
mean 51.250000 57.125000 55.500000 27.750000 35.500000 59.625000
std 35.507544 23.436464 25.253005 19.506409 26.202508 34.100429
min 8.000000 18.000000 20.000000 8.000000 8.000000 4.000000
25% 23.500000 43.000000 42.750000 12.000000 12.750000 37.250000
50% 45.000000 64.500000 52.500000 25.000000 30.500000 60.000000
75% 78.750000 70.000000 69.000000 36.500000 51.750000 92.250000
max 99.000000 84.000000 98.000000 64.000000 83.000000 97.000000
df['up'].describe()
count     8.000000
mean     55.500000
std      25.253005
min      20.000000
25%      42.750000
50%      52.500000
75%      69.000000
max      98.000000
Name: up, dtype: float64

统计函数

# 除 mean 外,还有 count、sum、median、min、max、std 等统计函数
df['up'].mean()
55.5
# 各列的 75% 分位数
df.quantile(q=0.75)
duration    78.75
playtime    70.00
up          69.00
favorite    36.50
comment     51.75
share       92.25
Name: 0.75, dtype: float64
# 各列两两之间的相关系数(默认为 Pearson 相关系数)
df.corr()
duration playtime up favorite comment share
duration 1.000000 0.074976 0.157567 -0.118493 0.046985 -0.404477
playtime 0.074976 1.000000 -0.263464 -0.154291 0.501668 0.024556
up 0.157567 -0.263464 1.000000 0.070472 0.617680 0.094974
favorite -0.118493 -0.154291 0.070472 1.000000 -0.033540 -0.433129
comment 0.046985 0.501668 0.617680 -0.033540 1.000000 0.226952
share -0.404477 0.024556 0.094974 -0.433129 0.226952 1.000000

cut 和 qcut

cut 等间距分箱(按取值范围等宽划分)

df
duration playtime up favorite comment share
a 8 66 30 34 8 40
b 59 84 47 31 50 91
c 31 18 78 64 28 57
d 19 47 20 8 12 96
e 99 64 57 19 33 63
f 73 82 48 44 57 4
g 25 65 98 9 83 97
h 96 31 66 13 13 29
pd.cut(df['duration'], 4, labels=None)
a    (7.909, 30.75]
b     (53.5, 76.25]
c     (30.75, 53.5]
d    (7.909, 30.75]
e     (76.25, 99.0]
f     (53.5, 76.25]
g    (7.909, 30.75]
h     (76.25, 99.0]
Name: duration, dtype: category
Categories (4, interval[float64, right]): [(7.909, 30.75] < (30.75, 53.5] < (53.5, 76.25] < (76.25, 99.0]]

qcut 按分位数分箱(各箱样本数尽量相等)

# 将 duration 按分位数分为 3 箱(labels=False 时返回箱的整数编号 0、1、2),构成新的一列
df['duration_section'] = pd.qcut(df['duration'], 3, labels=False)
df
duration playtime up favorite comment share duration_section
a 8 66 30 34 8 40 0
b 59 84 47 31 50 91 1
c 31 18 78 64 28 57 1
d 19 47 20 8 12 96 0
e 99 64 57 19 33 63 2
f 73 82 48 44 57 4 2
g 25 65 98 9 83 97 0
h 96 31 66 13 13 29 2