{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 统计信息"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 通过describe查看总体信息"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" duration | \n",
" playtime | \n",
" up | \n",
" favorite | \n",
" comment | \n",
" share | \n",
"
\n",
" \n",
" \n",
" \n",
" a | \n",
" 34 | \n",
" 16 | \n",
" 75 | \n",
" 77 | \n",
" 73 | \n",
" 71 | \n",
"
\n",
" \n",
" b | \n",
" 32 | \n",
" 64 | \n",
" 46 | \n",
" 94 | \n",
" 21 | \n",
" 26 | \n",
"
\n",
" \n",
" c | \n",
" 31 | \n",
" 9 | \n",
" 40 | \n",
" 25 | \n",
" 86 | \n",
" 85 | \n",
"
\n",
" \n",
" d | \n",
" 49 | \n",
" 61 | \n",
" 77 | \n",
" 41 | \n",
" 6 | \n",
" 67 | \n",
"
\n",
" \n",
" e | \n",
" 21 | \n",
" 56 | \n",
" 69 | \n",
" 75 | \n",
" 44 | \n",
" 16 | \n",
"
\n",
" \n",
" f | \n",
" 13 | \n",
" 26 | \n",
" 17 | \n",
" 3 | \n",
" 87 | \n",
" 32 | \n",
"
\n",
" \n",
" g | \n",
" 20 | \n",
" 50 | \n",
" 99 | \n",
" 59 | \n",
" 98 | \n",
" 39 | \n",
"
\n",
" \n",
" h | \n",
" 44 | \n",
" 83 | \n",
" 7 | \n",
" 67 | \n",
" 59 | \n",
" 74 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" duration playtime up favorite comment share\n",
"a 34 16 75 77 73 71\n",
"b 32 64 46 94 21 26\n",
"c 31 9 40 25 86 85\n",
"d 49 61 77 41 6 67\n",
"e 21 56 69 75 44 16\n",
"f 13 26 17 3 87 32\n",
"g 20 50 99 59 98 39\n",
"h 44 83 7 67 59 74"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load the sample data; index_col=0 uses the CSV's first column as the row index.\n",
"df = pd.read_csv('sample.csv', index_col=0)\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" duration | \n",
" playtime | \n",
" up | \n",
" favorite | \n",
" comment | \n",
" share | \n",
"
\n",
" \n",
" \n",
" \n",
" count | \n",
" 8.000000 | \n",
" 8.000000 | \n",
" 8.00000 | \n",
" 8.000000 | \n",
" 8.000000 | \n",
" 8.000000 | \n",
"
\n",
" \n",
" mean | \n",
" 30.500000 | \n",
" 45.625000 | \n",
" 53.75000 | \n",
" 55.125000 | \n",
" 59.250000 | \n",
" 51.250000 | \n",
"
\n",
" \n",
" std | \n",
" 12.224098 | \n",
" 25.917107 | \n",
" 31.75239 | \n",
" 30.154069 | \n",
" 33.182396 | \n",
" 25.899531 | \n",
"
\n",
" \n",
" min | \n",
" 13.000000 | \n",
" 9.000000 | \n",
" 7.00000 | \n",
" 3.000000 | \n",
" 6.000000 | \n",
" 16.000000 | \n",
"
\n",
" \n",
" 25% | \n",
" 20.750000 | \n",
" 23.500000 | \n",
" 34.25000 | \n",
" 37.000000 | \n",
" 38.250000 | \n",
" 30.500000 | \n",
"
\n",
" \n",
" 50% | \n",
" 31.500000 | \n",
" 53.000000 | \n",
" 57.50000 | \n",
" 63.000000 | \n",
" 66.000000 | \n",
" 53.000000 | \n",
"
\n",
" \n",
" 75% | \n",
" 36.500000 | \n",
" 61.750000 | \n",
" 75.50000 | \n",
" 75.500000 | \n",
" 86.250000 | \n",
" 71.750000 | \n",
"
\n",
" \n",
" max | \n",
" 49.000000 | \n",
" 83.000000 | \n",
" 99.00000 | \n",
" 94.000000 | \n",
" 98.000000 | \n",
" 85.000000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" duration playtime up favorite comment share\n",
"count 8.000000 8.000000 8.00000 8.000000 8.000000 8.000000\n",
"mean 30.500000 45.625000 53.75000 55.125000 59.250000 51.250000\n",
"std 12.224098 25.917107 31.75239 30.154069 33.182396 25.899531\n",
"min 13.000000 9.000000 7.00000 3.000000 6.000000 16.000000\n",
"25% 20.750000 23.500000 34.25000 37.000000 38.250000 30.500000\n",
"50% 31.500000 53.000000 57.50000 63.000000 66.000000 53.000000\n",
"75% 36.500000 61.750000 75.50000 75.500000 86.250000 71.750000\n",
"max 49.000000 83.000000 99.00000 94.000000 98.000000 85.000000"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.\n",
"df.describe()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 8.00000\n",
"mean 53.75000\n",
"std 31.75239\n",
"min 7.00000\n",
"25% 34.25000\n",
"50% 57.50000\n",
"75% 75.50000\n",
"max 99.00000\n",
"Name: up, dtype: float64"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# describe() on a single column returns the summary statistics as a Series.\n",
"df['up'].describe()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 统计函数"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"53.75"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Other reductions such as count, sum, median, min, max and std are available too.\n",
"df['up'].mean()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"duration 36.50\n",
"playtime 61.75\n",
"up 75.50\n",
"favorite 75.50\n",
"comment 86.25\n",
"share 71.75\n",
"Name: 0.75, dtype: float64"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 75th percentile (third quartile) of every column, returned as a Series.\n",
"df.quantile(0.75)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" duration | \n",
" playtime | \n",
" up | \n",
" favorite | \n",
" comment | \n",
" share | \n",
"
\n",
" \n",
" \n",
" \n",
" duration | \n",
" 1.000000 | \n",
" 0.402896 | \n",
" -0.044902 | \n",
" 0.271874 | \n",
" -0.609994 | \n",
" 0.652923 | \n",
"
\n",
" \n",
" playtime | \n",
" 0.402896 | \n",
" 1.000000 | \n",
" -0.100816 | \n",
" 0.483019 | \n",
" -0.573303 | \n",
" -0.262680 | \n",
"
\n",
" \n",
" up | \n",
" -0.044902 | \n",
" -0.100816 | \n",
" 1.000000 | \n",
" 0.311575 | \n",
" -0.068404 | \n",
" -0.161640 | \n",
"
\n",
" \n",
" favorite | \n",
" 0.271874 | \n",
" 0.483019 | \n",
" 0.311575 | \n",
" 1.000000 | \n",
" -0.405799 | \n",
" -0.231075 | \n",
"
\n",
" \n",
" comment | \n",
" -0.609994 | \n",
" -0.573303 | \n",
" -0.068404 | \n",
" -0.405799 | \n",
" 1.000000 | \n",
" 0.137054 | \n",
"
\n",
" \n",
" share | \n",
" 0.652923 | \n",
" -0.262680 | \n",
" -0.161640 | \n",
" -0.231075 | \n",
" 0.137054 | \n",
" 1.000000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" duration playtime up favorite comment share\n",
"duration 1.000000 0.402896 -0.044902 0.271874 -0.609994 0.652923\n",
"playtime 0.402896 1.000000 -0.100816 0.483019 -0.573303 -0.262680\n",
"up -0.044902 -0.100816 1.000000 0.311575 -0.068404 -0.161640\n",
"favorite 0.271874 0.483019 0.311575 1.000000 -0.405799 -0.231075\n",
"comment -0.609994 -0.573303 -0.068404 -0.405799 1.000000 0.137054\n",
"share 0.652923 -0.262680 -0.161640 -0.231075 0.137054 1.000000"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Pairwise correlation coefficients between columns (Pearson is the default method).\n",
"df.corr(method='pearson')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## cut和qcut\n",
"\n",
"### cut等间距分箱"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" duration | \n",
" playtime | \n",
" up | \n",
" favorite | \n",
" comment | \n",
" share | \n",
"
\n",
" \n",
" \n",
" \n",
" a | \n",
" 34 | \n",
" 16 | \n",
" 75 | \n",
" 77 | \n",
" 73 | \n",
" 71 | \n",
"
\n",
" \n",
" b | \n",
" 32 | \n",
" 64 | \n",
" 46 | \n",
" 94 | \n",
" 21 | \n",
" 26 | \n",
"
\n",
" \n",
" c | \n",
" 31 | \n",
" 9 | \n",
" 40 | \n",
" 25 | \n",
" 86 | \n",
" 85 | \n",
"
\n",
" \n",
" d | \n",
" 49 | \n",
" 61 | \n",
" 77 | \n",
" 41 | \n",
" 6 | \n",
" 67 | \n",
"
\n",
" \n",
" e | \n",
" 21 | \n",
" 56 | \n",
" 69 | \n",
" 75 | \n",
" 44 | \n",
" 16 | \n",
"
\n",
" \n",
" f | \n",
" 13 | \n",
" 26 | \n",
" 17 | \n",
" 3 | \n",
" 87 | \n",
" 32 | \n",
"
\n",
" \n",
" g | \n",
" 20 | \n",
" 50 | \n",
" 99 | \n",
" 59 | \n",
" 98 | \n",
" 39 | \n",
"
\n",
" \n",
" h | \n",
" 44 | \n",
" 83 | \n",
" 7 | \n",
" 67 | \n",
" 59 | \n",
" 74 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" duration playtime up favorite comment share\n",
"a 34 16 75 77 73 71\n",
"b 32 64 46 94 21 26\n",
"c 31 9 40 25 86 85\n",
"d 49 61 77 41 6 67\n",
"e 21 56 69 75 44 16\n",
"f 13 26 17 3 87 32\n",
"g 20 50 99 59 98 39\n",
"h 44 83 7 67 59 74"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show the frame again as a reference for the binning examples.\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"a (31.0, 40.0]\n",
"b (31.0, 40.0]\n",
"c (22.0, 31.0]\n",
"d (40.0, 49.0]\n",
"e (12.964, 22.0]\n",
"f (12.964, 22.0]\n",
"g (12.964, 22.0]\n",
"h (40.0, 49.0]\n",
"Name: duration, dtype: category\n",
"Categories (4, interval[float64, right]): [(12.964, 22.0] < (22.0, 31.0] < (31.0, 40.0] < (40.0, 49.0]]"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Split duration into 4 equal-width intervals. With no labels given, each row is tagged\n",
"# with its interval; the lowest edge sits slightly below the minimum so it is included.\n",
"pd.cut(df['duration'], bins=4)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### qcut按分位数分箱"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" duration | \n",
" playtime | \n",
" up | \n",
" favorite | \n",
" comment | \n",
" share | \n",
" duration_section | \n",
"
\n",
" \n",
" \n",
" \n",
" a | \n",
" 34 | \n",
" 16 | \n",
" 75 | \n",
" 77 | \n",
" 73 | \n",
" 71 | \n",
" 2 | \n",
"
\n",
" \n",
" b | \n",
" 32 | \n",
" 64 | \n",
" 46 | \n",
" 94 | \n",
" 21 | \n",
" 26 | \n",
" 1 | \n",
"
\n",
" \n",
" c | \n",
" 31 | \n",
" 9 | \n",
" 40 | \n",
" 25 | \n",
" 86 | \n",
" 85 | \n",
" 1 | \n",
"
\n",
" \n",
" d | \n",
" 49 | \n",
" 61 | \n",
" 77 | \n",
" 41 | \n",
" 6 | \n",
" 67 | \n",
" 2 | \n",
"
\n",
" \n",
" e | \n",
" 21 | \n",
" 56 | \n",
" 69 | \n",
" 75 | \n",
" 44 | \n",
" 16 | \n",
" 0 | \n",
"
\n",
" \n",
" f | \n",
" 13 | \n",
" 26 | \n",
" 17 | \n",
" 3 | \n",
" 87 | \n",
" 32 | \n",
" 0 | \n",
"
\n",
" \n",
" g | \n",
" 20 | \n",
" 50 | \n",
" 99 | \n",
" 59 | \n",
" 98 | \n",
" 39 | \n",
" 0 | \n",
"
\n",
" \n",
" h | \n",
" 44 | \n",
" 83 | \n",
" 7 | \n",
" 67 | \n",
" 59 | \n",
" 74 | \n",
" 2 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" duration playtime up favorite comment share duration_section\n",
"a 34 16 75 77 73 71 2\n",
"b 32 64 46 94 21 26 1\n",
"c 31 9 40 25 86 85 1\n",
"d 49 61 77 41 6 67 2\n",
"e 21 56 69 75 44 16 0\n",
"f 13 26 17 3 87 32 0\n",
"g 20 50 99 59 98 39 0\n",
"h 44 83 7 67 59 74 2"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 将duration分箱,构成新的一列\n",
"df['duration_section'] = pd.qcut(df['duration'], 3, labels=False)\n",
"df"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}