{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# 统计信息" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 通过describe查看总体信息" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
durationplaytimeupfavoritecommentshare
a341675777371
b326446942126
c31940258685
d49617741667
e215669754416
f13261738732
g205099599839
h44837675974
\n", "
" ], "text/plain": [ " duration playtime up favorite comment share\n", "a 34 16 75 77 73 71\n", "b 32 64 46 94 21 26\n", "c 31 9 40 25 86 85\n", "d 49 61 77 41 6 67\n", "e 21 56 69 75 44 16\n", "f 13 26 17 3 87 32\n", "g 20 50 99 59 98 39\n", "h 44 83 7 67 59 74" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Load sample.csv, using its first column as the row index\n", "df = pd.read_csv('sample.csv', index_col=0)\n", "df" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
durationplaytimeupfavoritecommentshare
count8.0000008.0000008.000008.0000008.0000008.000000
mean30.50000045.62500053.7500055.12500059.25000051.250000
std12.22409825.91710731.7523930.15406933.18239625.899531
min13.0000009.0000007.000003.0000006.00000016.000000
25%20.75000023.50000034.2500037.00000038.25000030.500000
50%31.50000053.00000057.5000063.00000066.00000053.000000
75%36.50000061.75000075.5000075.50000086.25000071.750000
max49.00000083.00000099.0000094.00000098.00000085.000000
\n", "
" ], "text/plain": [ " duration playtime up favorite comment share\n", "count 8.000000 8.000000 8.00000 8.000000 8.000000 8.000000\n", "mean 30.500000 45.625000 53.75000 55.125000 59.250000 51.250000\n", "std 12.224098 25.917107 31.75239 30.154069 33.182396 25.899531\n", "min 13.000000 9.000000 7.00000 3.000000 6.000000 16.000000\n", "25% 20.750000 23.500000 34.25000 37.000000 38.250000 30.500000\n", "50% 31.500000 53.000000 57.50000 63.000000 66.000000 53.000000\n", "75% 36.500000 61.750000 75.50000 75.500000 86.250000 71.750000\n", "max 49.000000 83.000000 99.00000 94.000000 98.000000 85.000000" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Summary statistics (count, mean, std, min, quartiles, max) for each column\n", "df.describe()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "count 8.00000\n", "mean 53.75000\n", "std 31.75239\n", "min 7.00000\n", "25% 34.25000\n", "50% 57.50000\n", "75% 75.50000\n", "max 99.00000\n", "Name: up, dtype: float64" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# The same summary statistics for the single column 'up'\n", "df['up'].describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 统计函数" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "53.75" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Other statistical functions such as count, sum, median, min, max and std are also available\n", "df['up'].mean()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "duration 36.50\n", "playtime 61.75\n", "up 75.50\n", "favorite 75.50\n", "comment 86.25\n", "share 71.75\n", "Name: 0.75, dtype: float64" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 75th percentile of each column\n", "df.quantile(q=0.75)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
durationplaytimeupfavoritecommentshare
duration1.0000000.402896-0.0449020.271874-0.6099940.652923
playtime0.4028961.000000-0.1008160.483019-0.573303-0.262680
up-0.044902-0.1008161.0000000.311575-0.068404-0.161640
favorite0.2718740.4830190.3115751.000000-0.405799-0.231075
comment-0.609994-0.573303-0.068404-0.4057991.0000000.137054
share0.652923-0.262680-0.161640-0.2310750.1370541.000000
\n", "
" ], "text/plain": [ " duration playtime up favorite comment share\n", "duration 1.000000 0.402896 -0.044902 0.271874 -0.609994 0.652923\n", "playtime 0.402896 1.000000 -0.100816 0.483019 -0.573303 -0.262680\n", "up -0.044902 -0.100816 1.000000 0.311575 -0.068404 -0.161640\n", "favorite 0.271874 0.483019 0.311575 1.000000 -0.405799 -0.231075\n", "comment -0.609994 -0.573303 -0.068404 -0.405799 1.000000 0.137054\n", "share 0.652923 -0.262680 -0.161640 -0.231075 0.137054 1.000000" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Pairwise correlation coefficients between the columns\n", "df.corr()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## cut和qcut\n", "\n", "### cut等间距分箱" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
durationplaytimeupfavoritecommentshare
a341675777371
b326446942126
c31940258685
d49617741667
e215669754416
f13261738732
g205099599839
h44837675974
\n", "
" ], "text/plain": [ " duration playtime up favorite comment share\n", "a 34 16 75 77 73 71\n", "b 32 64 46 94 21 26\n", "c 31 9 40 25 86 85\n", "d 49 61 77 41 6 67\n", "e 21 56 69 75 44 16\n", "f 13 26 17 3 87 32\n", "g 20 50 99 59 98 39\n", "h 44 83 7 67 59 74" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "a (31.0, 40.0]\n", "b (31.0, 40.0]\n", "c (22.0, 31.0]\n", "d (40.0, 49.0]\n", "e (12.964, 22.0]\n", "f (12.964, 22.0]\n", "g (12.964, 22.0]\n", "h (40.0, 49.0]\n", "Name: duration, dtype: category\n", "Categories (4, interval[float64, right]): [(12.964, 22.0] < (22.0, 31.0] < (31.0, 40.0] < (40.0, 49.0]]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Split duration into 4 equal-width bins; each row is reported with the interval it falls in\n", "pd.cut(df['duration'], bins=4)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### qcut按分位数分箱" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
durationplaytimeupfavoritecommentshareduration_section
a3416757773712
b3264469421261
c319402586851
d496177416672
e2156697544160
f132617387320
g2050995998390
h448376759742
\n", "
" ], "text/plain": [ " duration playtime up favorite comment share duration_section\n", "a 34 16 75 77 73 71 2\n", "b 32 64 46 94 21 26 1\n", "c 31 9 40 25 86 85 1\n", "d 49 61 77 41 6 67 2\n", "e 21 56 69 75 44 16 0\n", "f 13 26 17 3 87 32 0\n", "g 20 50 99 59 98 39 0\n", "h 44 83 7 67 59 74 2" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Bin duration into 3 quantile-based groups and store the integer bin index as a new column\n", "df['duration_section'] = pd.qcut(df['duration'], q=3, labels=False)\n", "df" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.3" } }, "nbformat": 4, "nbformat_minor": 4 }