Pandas를 이용한 데이터 탐색 - 데이터 그룹분석

2021. 7. 1. 15:33

df5 = df.math.groupby(df4)
df5.describe()

	count	mean	std	min	25%	50%	75%	max
math								
3	6.0	40.833333	12.812754	20.0	33.75	47.5	50.0	50.0
2	1.0	60.000000	NaN	60.0	60.00	60.0	60.0	60.0
1	3.0	86.666667	5.773503	80.0	85.00	90.0	90.0	90.0

집계함수 사용

# 특정 데이터에 대한 그룹분석이 필요할 때 사용한다.
# - 범주형 기준에 따라 그룹으로 데이터를 분할 (반별 그룹)
# - 각 그룹에 독립적으로 함수를 적용 (반별 평균)
# - 결과물을 하나의 데이터구조로 결합 (반별 평균 결합)

# Pandas : groupby 메서드

df = pd.read_csv('dataset/exam_sample.csv')

	student_no	class	science	english	math	sex
0	1	A	50	98	50	m
1	2	A	60	97	60	w
2	3	A	78	86	45	w
3	4	A	58	98	30	m
4	5	B	65	80	90	w
5	6	B	98	89	50	m

# 그룹 분석 시에는 class, sex 와 같은 범주형 변수들이 그룹연산의 기준이 된다.

# class 로 그룹화 하기 -> 결과 데이터가 아니라 DataFrameGroupBy 객체(레퍼런스)를 반환한다.
df1 = df.groupby(['class'])

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f01974c6970>

df1.get_group('A')
 student_no	class	science	english	math	sex
0	1	A	50	98	50	m
1	2	A	60	97	60	w
2	3	A	78	86	45	w
3	4	A	58	98	30	m

# groupby 절

** SQL문에서는

# select class, avg(nvl(math,0)) from exam where class='A' group by class
# 평균이 80점을 넘는 그룹만 뽑아낼 때
# select class, avg(nvl(math,0)) from exam group by class having avg(nvl(math,0)) > 80.0

df1.get_group('A').mean()
df.groupby('class').get_group('A').mean()

student_no     2.50
science       61.50
english       94.75
math          46.25
dtype: float64

df.groupby('class').mean()

	student_no	science	english	math
class				
A	2.5	61.50	94.75	46.25
B	6.5	58.25	84.25	77.50
C	9.5	30.00	95.50	35.00

df.groupby('class').mean().loc[:'A',:]
	student_no	science	english	math
class				
A	2.5	61.5	94.75	46.25

df.groupby('class').mean().loc['A']
student_no     2.50
science       61.50
english       94.75
math          46.25
Name: A, dtype: float64

# 연습 - 반별 , 성별 그룹 평균 
df3 = df.groupby(['class','sex']).mean()

		student_no	science	english	math
class	sex				
A	m	2.5	54.0	98.0	40.0
	w	2.5	69.0	91.5	52.5
B	m	6.5	71.5	89.5	65.0
	w	6.5	45.0	79.0	90.0
C	w	9.5	30.0	95.5	35.0

df.groupby(['컬럼명']) => DataFrame 전체를 그룹화하므로, 그룹 기준을 컬럼명(문자열 또는 컬럼명 리스트)만으로 지정할 수 있다.

df['math'].groupby(df['컬럼명']) => Series 에는 다른 컬럼이 없으므로, 그룹 기준을 df['컬럼명'] 처럼 원본 DataFrame 에서 꺼낸 Series 로 직접 넘겨야 한다.

# 멀티인덱스 : .xs

df3.xs(('w'),level=['sex'])

	student_no	science	english	math
class				
A	2.5	69.0	91.5	52.5
B	6.5	45.0	79.0	90.0
C	9.5	30.0	95.5	35.0

df3.xs(('w','A'),level=['sex','class'])

		student_no	science	english	math
class	sex				
A	w	2.5	69.0	91.5	52.5

# 반별 수학 평균 구하기
df['math'].groupby(df['class'])

# Case 1
df['math'].groupby(df['class']).mean()

# Case 2
df.groupby(['class'])['math'].mean()

class
A    46.25
B    77.50
C    35.00
Name: math, dtype: float64

# 연습문제 1 : 반별 수학점수 개수

df.groupby(['class'])['math'].count()
or
df['math'].groupby(df['class']).count()

class
A    4
B    4
C    2
Name: math, dtype: int64

# 연습문제 2 : 성별 수학평균

df_mean = df['math'].groupby(df['sex']).mean()
or
df.groupby(['sex'])['math'].mean()

sex
m    52.500000
w    59.166667
Name: math, dtype: float64

# 연습문제 3 : 남학생 수학평균

# 성별을 기준으로 남학생 데이터 추리기
df.groupby(['sex']).get_group('m')
# 남학생 데이터 중에서 수학 점수
subset = ['sex','math']
df.groupby(['sex']).get_group('m')['math']

# 남학생 수학 평균 점수
df.groupby(['sex']).get_group('m')['math'].mean()

# 연습문제 추가

import pandas as pd
import seaborn as sns

df= sns.load_dataset('tips')

	total_bill	tip	sex	smoker	day	time	size
0	16.99	1.01	Female	No	Sun	Dinner	2
1	10.34	1.66	Male	No	Sun	Dinner	3
2	21.01	3.50	Male	No	Sun	Dinner	3
3	23.68	3.31	Male	No	Sun	Dinner	2
4	24.59	3.61	Female	No	Sun	Dinner	4
...	...	...	...	...	...	...	...
239	29.03	5.92	Male	No	Sat	Dinner	3
240	27.18	2.00	Female	Yes	Sat	Dinner	2
241	22.67	2.00	Male	Yes	Sat	Dinner	2
242	17.82	1.75	Male	No	Sat	Dinner	2
243	18.78	3.00	Female	No	Thur	Dinner	2

df.groupby(['sex'])['total_bill'].mean()
sex
Male      20.744076
Female    18.056897
Name: total_bill, dtype: float64

df.groupby(['sex'])['tip'].mean()
sex
Male      3.089618
Female    2.833448
Name: tip, dtype: float64

df.groupby(['time'])['total_bill'].mean()
time
Lunch     17.168676
Dinner    20.797159
Name: total_bill, dtype: float64

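# 데이터의 구간화 : 연속형 데이터를 의미 있는 구간(경계선)으로 나누어 범주형으로 만든다.

# (주의) 위 연습문제에서 df 가 tips 데이터로 바뀌었으므로, 아래 예제는 exam_sample.csv 를 다시 df 로 읽은 상태를 전제로 한다.
# math 컬럼을 동일한 길이의 구간 3개로 나누어 범주형 데이터로 변경
# pd.cut : 값의 범위를 동일한 길이(폭)로 나눈다.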
df1 = pd.cut(df.math,3)

0    (43.333, 66.667]
1    (43.333, 66.667]
2    (43.333, 66.667]
3     (19.93, 43.333]
4      (66.667, 90.0]
5    (43.333, 66.667]
6      (66.667, 90.0]
7      (66.667, 90.0]
8     (19.93, 43.333]
9    (43.333, 66.667]
Name: math, dtype: category
Categories (3, interval[float64]): [(19.93, 43.333] < (43.333, 66.667] < (66.667, 90.0]]

df2 = df['math'].groupby(df1)

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7f019199cc70>

df2.describe()
	count	mean	std	min	25%	50%	75%	max
math								
(19.93, 43.333]	2.0	25.000000	7.071068	20.0	22.5	25.0	27.5	30.0
(43.333, 66.667]	5.0	51.000000	5.477226	45.0	50.0	50.0	50.0	60.0
(66.667, 90.0]	3.0	86.666667	5.773503	80.0	85.0	90.0	90.0	90.0

df3 = df2.agg(['count','mean','std','min','max'])

				count	mean	std	min	max
math					
(19.93, 43.333]	2	25.000000	7.071068	20	30
(43.333, 66.667]	5	51.000000	5.477226	45	60
(66.667, 90.0]	3	86.666667	5.773503	80	90

# labels=False : 구간(Interval) 대신 0, 1, 2 … 의 정수 구간 번호를 순서대로 반환한다.
# pd.cut : 동일한 길이(폭) 기준으로 나눈다.
# pd.qcut : 각 구간에 동일한 개수의 데이터가 들어가도록(분위수 기준) 나눈다.
pd.qcut(df.math,3)

0    (19.999, 50.0]
1      (50.0, 60.0]
2    (19.999, 50.0]
3    (19.999, 50.0]
4      (60.0, 90.0]
5    (19.999, 50.0]
6      (60.0, 90.0]
7      (60.0, 90.0]
8    (19.999, 50.0]
9    (19.999, 50.0]
Name: math, dtype: category
Categories (3, interval[float64]): [(19.999, 50.0] < (50.0, 60.0] < (60.0, 90.0]]

# numpy의 범위를 구하는 arange를 사용
import numpy as np
np.arange(3,0,-1)

df4 = pd.qcut(df.math,3, labels=np.arange(3,0,-1))

0    3
1    2
2    3
3    3
4    1
5    3
6    1
7    1
8    3
9    3
Name: math, dtype: category
Categories (3, int64): [3 < 2 < 1]

# [3 < 2 < 1] 순서의 레이블이 붙은 3개 분위수 그룹별 통계량을 계산한다.
# (동점 값이 많아 그룹별 개수는 6/1/3 으로 정확히 같지는 않다.)
df5 = df.math.groupby(df4)
df5.describe()

	count	mean	std	min	25%	50%	75%	max
math								
3	6.0	40.833333	12.812754	20.0	33.75	47.5	50.0	50.0
2	1.0	60.000000	NaN	60.0	60.00	60.0	60.0	60.0
1	3.0	86.666667	5.773503	80.0	85.00	90.0	90.0	90.0

df6 = df5.agg(['count','mean','std','min','max'])

	count	mean	std	min	max
math					
3	6	40.833333	12.812754	20	50
2	1	60.000000	NaN	60	60
1	3	86.666667	5.773503	80	90

# 질문내용

import numpy as np
mydata = np.random.randn(1000)
labels = ['A','B','C']
df6 = pd.qcut(mydata, 3, labels=labels)
print(df6.value_counts())

A    334
B    333
C    333
dtype: int64

[A, C, B, B, A, ..., B, B, A, A, B]
Length: 1000
Categories (3, object): [A < B < C]

df7 = pd.cut(mydata, 3, labels=labels)
print(df7.value_counts())
df7

A    139
B    742
C    119
dtype: int64

[A, B, B, B, B, ..., B, C, B, B, B]
Length: 1000
Categories (3, object): [A < B < C]

# one-hot 인코딩

df1 = pd.get_dummies(df)

	student_no	science	english	math	class_A	class_B	class_C	sex_m	sex_w
0	1	50	98	50	1	0	0	1	0
1	2	60	97	60	1	0	0	0	1
2	3	78	86	45	1	0	0	0	1
3	4	58	98	30	1	0	0	1	0
4	5	65	80	90	0	1	0	0	1
5	6	98	89	50	0	1	0	1	0
6	7	45	90	80	0	1	0	1	0
7	8	25	78	90	0	1	0	0	1
8	9	15	98	20	0	0	1	0	1
9	10	45	93	50	0	0	1	0	1

df1.T

	0	1	2	3	4	5	6	7	8	9
student_no	1	2	3	4	5	6	7	8	9	10
science	50	60	78	58	65	98	45	25	15	45
english	98	97	86	98	80	89	90	78	98	93
math	50	60	45	30	90	50	80	90	20	50
class_A	1	1	1	1	0	0	0	0	0	0
class_B	0	0	0	0	1	1	1	1	0	0
class_C	0	0	0	0	0	0	0	0	1	1
sex_m	1	0	0	1	0	1	1	0	0	0
sex_w	0	1	1	0	1	0	0	1	1	1

# 피벗테이블
# class 를 행(index), sex 를 열(columns)로 하는 피벗테이블 (값은 science 평균)

pd.pivot_table(df, index='class', columns='sex', values='science')

sex	m	w
class		
A	54.0	69.0
B	71.5	45.0
C	NaN	30.0

# sex 를 행(index), class 를 열(columns)로 하는 피벗테이블 (값은 science 평균)
pd.pivot_table(df, index='sex', columns='class', values='science')

class	A	B	C
sex			
m	54.0	71.5	NaN
w	69.0	45.0	30.0

'PYTHON' 카테고리의 다른 글

[Pandas] 데이터시각화, Matplotlib, Spring연동(Json) (0)	2021.07.05
[Pandas] 공공데이터 활용 (0)	2021.07.01
Pandas (0)	2021.07.01
0629 시험대비 (0)	2021.06.28
Numpy 자주 사용하는 함수 (0)	2021.06.28

Rusty. Travel. Life

Pandas를 이용한 데이터 탐색 - 데이터 그룹분석

'PYTHON' 카테고리의 다른 글

+ Recent posts

티스토리툴바