# 统计信息
import pandas as pd
import numpy as np
## 通过describe查看总体信息
df = pd.read_csv('sample.csv', index_col=0)
df
| | duration | playtime | up | favorite | comment | share |
---|---|---|---|---|---|---|
a | 8 | 66 | 30 | 34 | 8 | 40 |
b | 59 | 84 | 47 | 31 | 50 | 91 |
c | 31 | 18 | 78 | 64 | 28 | 57 |
d | 19 | 47 | 20 | 8 | 12 | 96 |
e | 99 | 64 | 57 | 19 | 33 | 63 |
f | 73 | 82 | 48 | 44 | 57 | 4 |
g | 25 | 65 | 98 | 9 | 83 | 97 |
h | 96 | 31 | 66 | 13 | 13 | 29 |
df.describe()
| | duration | playtime | up | favorite | comment | share |
---|---|---|---|---|---|---|
count | 8.000000 | 8.000000 | 8.000000 | 8.000000 | 8.000000 | 8.000000 |
mean | 51.250000 | 57.125000 | 55.500000 | 27.750000 | 35.500000 | 59.625000 |
std | 35.507544 | 23.436464 | 25.253005 | 19.506409 | 26.202508 | 34.100429 |
min | 8.000000 | 18.000000 | 20.000000 | 8.000000 | 8.000000 | 4.000000 |
25% | 23.500000 | 43.000000 | 42.750000 | 12.000000 | 12.750000 | 37.250000 |
50% | 45.000000 | 64.500000 | 52.500000 | 25.000000 | 30.500000 | 60.000000 |
75% | 78.750000 | 70.000000 | 69.000000 | 36.500000 | 51.750000 | 92.250000 |
max | 99.000000 | 84.000000 | 98.000000 | 64.000000 | 83.000000 | 97.000000 |
df['up'].describe()
count 8.000000
mean 55.500000
std 25.253005
min 20.000000
25% 42.750000
50% 52.500000
75% 69.000000
max 98.000000
Name: up, dtype: float64
## 统计函数
# 除mean外，还有count、sum、median、min、max、std等统计函数
df['up'].mean()
55.5
# 各列的75%分位数
df.quantile(q=0.75)
duration 78.75
playtime 70.00
up 69.00
favorite 36.50
comment 51.75
share 92.25
Name: 0.75, dtype: float64
# 各列的相关系数
df.corr()
| | duration | playtime | up | favorite | comment | share |
---|---|---|---|---|---|---|
duration | 1.000000 | 0.074976 | 0.157567 | -0.118493 | 0.046985 | -0.404477 |
playtime | 0.074976 | 1.000000 | -0.263464 | -0.154291 | 0.501668 | 0.024556 |
up | 0.157567 | -0.263464 | 1.000000 | 0.070472 | 0.617680 | 0.094974 |
favorite | -0.118493 | -0.154291 | 0.070472 | 1.000000 | -0.033540 | -0.433129 |
comment | 0.046985 | 0.501668 | 0.617680 | -0.033540 | 1.000000 | 0.226952 |
share | -0.404477 | 0.024556 | 0.094974 | -0.433129 | 0.226952 | 1.000000 |
# cut和qcut
## cut等间距分箱
df
| | duration | playtime | up | favorite | comment | share |
---|---|---|---|---|---|---|
a | 8 | 66 | 30 | 34 | 8 | 40 |
b | 59 | 84 | 47 | 31 | 50 | 91 |
c | 31 | 18 | 78 | 64 | 28 | 57 |
d | 19 | 47 | 20 | 8 | 12 | 96 |
e | 99 | 64 | 57 | 19 | 33 | 63 |
f | 73 | 82 | 48 | 44 | 57 | 4 |
g | 25 | 65 | 98 | 9 | 83 | 97 |
h | 96 | 31 | 66 | 13 | 13 | 29 |
pd.cut(df['duration'], 4, labels=None)
a (7.909, 30.75]
b (53.5, 76.25]
c (30.75, 53.5]
d (7.909, 30.75]
e (76.25, 99.0]
f (53.5, 76.25]
g (7.909, 30.75]
h (76.25, 99.0]
Name: duration, dtype: category
Categories (4, interval[float64, right]): [(7.909, 30.75] < (30.75, 53.5] < (53.5, 76.25] < (76.25, 99.0]]
## qcut按分位数分箱
# 将duration按分位数分成3箱（labels=False时返回箱的整数编号），构成新的一列
df['duration_section'] = pd.qcut(df['duration'], 3, labels=False)
df
| | duration | playtime | up | favorite | comment | share | duration_section |
---|---|---|---|---|---|---|---|
a | 8 | 66 | 30 | 34 | 8 | 40 | 0 |
b | 59 | 84 | 47 | 31 | 50 | 91 | 1 |
c | 31 | 18 | 78 | 64 | 28 | 57 | 1 |
d | 19 | 47 | 20 | 8 | 12 | 96 | 0 |
e | 99 | 64 | 57 | 19 | 33 | 63 | 2 |
f | 73 | 82 | 48 | 44 | 57 | 4 | 2 |
g | 25 | 65 | 98 | 9 | 83 | 97 | 0 |
h | 96 | 31 | 66 | 13 | 13 | 29 | 2 |