统计信息

Contents

统计信息

import pandas as pd
import numpy as np

通过describe查看总体信息

df = pd.read_csv('sample.csv', index_col=0)
df

	duration	playtime	up	favorite	comment	share
a	8	66	30	34	8	40
b	59	84	47	31	50	91
c	31	18	78	64	28	57
d	19	47	20	8	12	96
e	99	64	57	19	33	63
f	73	82	48	44	57	4
g	25	65	98	9	83	97
h	96	31	66	13	13	29

df.describe()

	duration	playtime	up	favorite	comment	share
count	8.000000	8.000000	8.000000	8.000000	8.000000	8.000000
mean	51.250000	57.125000	55.500000	27.750000	35.500000	59.625000
std	35.507544	23.436464	25.253005	19.506409	26.202508	34.100429
min	8.000000	18.000000	20.000000	8.000000	8.000000	4.000000
25%	23.500000	43.000000	42.750000	12.000000	12.750000	37.250000
50%	45.000000	64.500000	52.500000	25.000000	30.500000	60.000000
75%	78.750000	70.000000	69.000000	36.500000	51.750000	92.250000
max	99.000000	84.000000	98.000000	64.000000	83.000000	97.000000

df['up'].describe()

count     8.000000
mean     55.500000
std      25.253005
min      20.000000
25%      42.750000
50%      52.500000
75%      69.000000
max      98.000000
Name: up, dtype: float64

统计函数

# 除mean外，还有count、sum、median、min、max、std等统计函数
df['up'].mean()

55.5

# 各列的75%分位数
df.quantile(q=0.75)

duration    78.75
playtime    70.00
up          69.00
favorite    36.50
comment     51.75
share       92.25
Name: 0.75, dtype: float64

# 各列两两之间的相关系数（默认为Pearson相关系数）
df.corr()

	duration	playtime	up	favorite	comment	share
duration	1.000000	0.074976	0.157567	-0.118493	0.046985	-0.404477
playtime	0.074976	1.000000	-0.263464	-0.154291	0.501668	0.024556
up	0.157567	-0.263464	1.000000	0.070472	0.617680	0.094974
favorite	-0.118493	-0.154291	0.070472	1.000000	-0.033540	-0.433129
comment	0.046985	0.501668	0.617680	-0.033540	1.000000	0.226952
share	-0.404477	0.024556	0.094974	-0.433129	0.226952	1.000000

cut和qcut

cut等间距分箱

df

	duration	playtime	up	favorite	comment	share
a	8	66	30	34	8	40
b	59	84	47	31	50	91
c	31	18	78	64	28	57
d	19	47	20	8	12	96
e	99	64	57	19	33	63
f	73	82	48	44	57	4
g	25	65	98	9	83	97
h	96	31	66	13	13	29

pd.cut(df['duration'], 4, labels=None)

a    (7.909, 30.75]
b     (53.5, 76.25]
c     (30.75, 53.5]
d    (7.909, 30.75]
e     (76.25, 99.0]
f     (53.5, 76.25]
g    (7.909, 30.75]
h     (76.25, 99.0]
Name: duration, dtype: category
Categories (4, interval[float64, right]): [(7.909, 30.75] < (30.75, 53.5] < (53.5, 76.25] < (76.25, 99.0]]

qcut按分位数分箱

# 将duration分箱，构成新的一列
df['duration_section'] = pd.qcut(df['duration'], 3, labels=False)
df

	duration	playtime	up	favorite	comment	share	duration_section
a	8	66	30	34	8	40	0
b	59	84	47	31	50	91	1
c	31	18	78	64	28	57	1
d	19	47	20	8	12	96	0
e	99	64	57	19	33	63	2
f	73	82	48	44	57	4	2
g	25	65	98	9	83	97	0
h	96	31	66	13	13	29	2