import numpy as np
import pandas as pd
df=pd.read_csv("tips.csv")
df.head()
'''
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
3 23.68 3.31 Male No Sun Dinner 2
4 24.59 3.61 Female No Sun Dinner 4
'''
举例1、
'''
select sex,count(1) from tips group by sex
'''
#count() 默认作用于每个列
df.groupby("sex").count()
'''
total_bill tip smoker day time size
sex
Female 87 87 87 87 87 87
Male 157 157 157 157 157 157
'''
#列名.count() 指定具体的列
df.groupby("sex").sex.count()
'''
sex
Female 87
Male 157
Name: sex, dtype: int64
'''
#方式二:
df.sex.value_counts()
'''
Male 157
Female 87
Name: sex, dtype: int64
'''
举例2、#对单列进行分组
'''
select avg(tip),sum(tip),count(1) from tips group by day
'''
df.groupby("day").agg({"tip":["mean","sum","count"]})
'''
tip
mean sum count
day
Fri 2.734737 51.96 19
Sat 2.993103 260.40 87
Sun 3.255132 247.39 76
Thur 2.771452 171.83 62
'''
举例3、对队列进行分组
'''
select smoker,day,avg(tip),sum(tip) from tips group by smoker,day
'''
df.groupby(["smoker","day"]).agg({"tip":["mean","sum"]})

对多列进行分组.png
举例4、多列进行分组之后,对smoker进行降序排序
df.groupby(["smoker","day"]).agg({"tip":["mean","sum"]}).sort_values(by="smoker",ascending=False)

对多列进行分组再排序.png
举例5、对列进行分组之后,再having过滤
'''
select smoker,day,avg(tip),sum(tip) from tips
group by smoker,day
having sum(tip)>100
'''
g=df.groupby(['smoker','day']).agg({'tip':['mean','sum']})
#先将分组结果赋值给g,g是一个DataFrame对象,因此可以用DataFrame对象的loc方法过滤数据

g对象.png
#先查看g对象的列名
g.columns
'''
MultiIndex([('tip','mean'),
('tip','sum')],
)
'''
#使用元组的形式可获取具体的列值
g[('tip','sum')]
'''
smoker day
No Fri 11.25
Sat 139.63
Sun 180.57
Thur 120.32
Yes Fri 40.71
Sat 120.77
Sun 66.82
Thur 51.51
Name: (tip, sum), dtype: float64
'''
#使用元组的形式可获取具体的列值并过滤
g.loc[g[('tip','sum')]>100]

g对象过滤.png
#方式二:
g.loc[:,('tip','sum')]>100
'''
smoker day
No Fri False
Sat True
Sun True
Thur True
Yes Fri False
Sat True
Sun False
Thur False
Name: (tip, sum), dtype: bool
'''
#使用元组的形式可获取具体的列值并过滤
g.loc[g.loc[:,('tip','sum')]>100]

g对象过滤.png
举例5、对列进行分组之后,再having过滤多列
'''
select smoker,day,avg(tip),sum(tip) from tips
group by smoker,day
having sum(tip)>100 and avg(tip)>2.7
'''
g=df.groupby(['smoker','day']).agg({'tip':['mean','sum']})
#先将分组结果赋值给g,g是一个DataFrame对象,因此可以用DataFrame对象的loc方法过滤数据
#先查看g对象的列名
#使用 & 代表and
g.loc[(g.loc[:,('tip','sum')]>100)
&
(g.loc[:,('tip','mean')]>2.7)]

g对象过滤多列.png
网友评论