census.xlsx 파일 열기
df = pd.read_excel('census.xlsx')
|
age |
workclass |
fnlwgt |
education |
education_num |
marital_status |
occupation |
relationship |
race |
sex |
capital_gain |
capital_loss |
hours_per_week |
native_country |
income |
0 |
39 |
State-gov |
77516 |
Bachelors |
13 |
Never-married |
Adm-clerical |
Not-in-family |
White |
Male |
2174 |
0 |
40 |
United-States |
<=50K |
1 |
50 |
Self-emp-not-inc |
83311 |
Bachelors |
13 |
Married-civ-spouse |
Exec-managerial |
Husband |
White |
Male |
0 |
0 |
13 |
United-States |
<=50K |
2 |
38 |
Private |
215646 |
HS-grad |
9 |
Divorced |
Handlers-cleaners |
Not-in-family |
White |
Male |
0 |
0 |
40 |
United-States |
<=50K |
3 |
53 |
Private |
234721 |
11th |
7 |
Married-civ-spouse |
Handlers-cleaners |
Husband |
Black |
Male |
0 |
0 |
40 |
United-States |
<=50K |
4 |
28 |
Private |
338409 |
Bachelors |
13 |
Married-civ-spouse |
Prof-specialty |
Wife |
Black |
Female |
0 |
0 |
40 |
Cuba |
<=50K |
소득 수준에 따라 그룹짓기
g1 = df.groupby('income')
그룹 보기
{'<=50K': Int64Index([ 0, 1, 2, 3, 4, 5, 6, 12, 13,
15,
...
32548, 32549, 32550, 32551, 32552, 32553, 32555, 32556, 32558,
32559],
dtype='int64', length=24720),
'>50K': Int64Index([ 7, 8, 9, 10, 11, 14, 19, 20, 25,
27,
...
32530, 32532, 32533, 32536, 32538, 32539, 32545, 32554, 32557,
32560],
dtype='int64', length=7841)}
특정 그룹만 보기
g1.get_group('>50K').head()
|
age |
workclass |
fnlwgt |
education |
education_num |
marital_status |
occupation |
relationship |
race |
sex |
capital_gain |
capital_loss |
hours_per_week |
native_country |
income |
7 |
52 |
Self-emp-not-inc |
209642 |
HS-grad |
9 |
Married-civ-spouse |
Exec-managerial |
Husband |
White |
Male |
0 |
0 |
45 |
United-States |
>50K |
8 |
31 |
Private |
45781 |
Masters |
14 |
Never-married |
Prof-specialty |
Not-in-family |
White |
Female |
14084 |
0 |
50 |
United-States |
>50K |
9 |
42 |
Private |
159449 |
Bachelors |
13 |
Married-civ-spouse |
Exec-managerial |
Husband |
White |
Male |
5178 |
0 |
40 |
United-States |
>50K |
10 |
37 |
Private |
280464 |
Some-college |
10 |
Married-civ-spouse |
Exec-managerial |
Husband |
Black |
Male |
0 |
0 |
80 |
United-States |
>50K |
11 |
30 |
State-gov |
141297 |
Bachelors |
13 |
Married-civ-spouse |
Prof-specialty |
Husband |
Asian-Pac-Islander |
Male |
0 |
0 |
40 |
India |
>50K |
그룹별 통계
df['education_num'].mean() # 전체 평균
10.0806793403151
g1['education_num'].mean()
income
<=50K 9.595065
>50K 11.611657
Name: education_num, dtype: float64
g1['education_num'].describe()
|
count |
mean |
std |
min |
25% |
50% |
75% |
max |
income |
|
|
|
|
|
|
|
|
<=50K |
24720.0 |
9.595065 |
2.436147 |
1.0 |
9.0 |
9.0 |
10.0 |
16.0 |
>50K |
7841.0 |
11.611657 |
2.385129 |
2.0 |
10.0 |
12.0 |
13.0 |
16.0 |
nth
g1['capital_gain'].nth(0)
income
<=50K 2174
>50K 0
Name: capital_gain, dtype: int64
g1['capital_gain'].nth(-1)
income
<=50K 0
>50K 15024
Name: capital_gain, dtype: int64
agg
aggregate: 모으다(모아서 정리하다)
g1.agg({'education_num': np.mean})
|
education_num |
income |
|
<=50K |
9.595065 |
>50K |
11.611657 |
그룹별 education_num 평균과 표준편차
g1.agg({'education_num': [np.mean, np.std]})
|
education_num |
|
mean |
std |
income |
|
|
<=50K |
9.595065 |
2.436147 |
>50K |
11.611657 |
2.385129 |
그룹별 capital_gain 평균
g1.agg({'capital_gain': np.mean})
|
capital_gain |
income |
|
<=50K |
148.752468 |
>50K |
4006.142456 |
모두 적용
g1.agg(
{
'education_num': [np.mean, np.std],
'capital_gain': np.mean
}
)
|
education_num |
capital_gain |
|
mean |
std |
mean |
income |
|
|
|
<=50K |
9.595065 |
2.436147 |
148.752468 |
>50K |
11.611657 |
2.385129 |
4006.142456 |
실습
성별(sex
)에 따라 education_num
의 평균(np.mean
)과 표준편차(np.std
)를 구하세요.
SELECT AVG(eduation_num), STD(education_num) FROM df GROUP BY sex;
df.groupby('sex').agg({'education_num': [np.mean, np.std]})
|
education_num |
|
mean |
std |
sex |
|
|
Female |
10.035744 |
2.379954 |
Male |
10.102891 |
2.662630 |
race와 income으로 그룹
g2 = df.groupby(['race', 'income'])
rie = g2.agg({'education_num': np.mean})
|
|
education_num |
race |
income |
|
Amer-Indian-Eskimo |
<=50K |
9.061818 |
>50K |
11.222222 |
Asian-Pac-Islander |
<=50K |
10.440367 |
>50K |
12.398551 |
Black |
<=50K |
9.261235 |
>50K |
11.077519 |
Other |
<=50K |
8.560976 |
>50K |
11.600000 |
White |
<=50K |
9.627422 |
>50K |
11.612196 |
reset_index
|
race |
income |
education_num |
0 |
Amer-Indian-Eskimo |
<=50K |
9.061818 |
1 |
Amer-Indian-Eskimo |
>50K |
11.222222 |
2 |
Asian-Pac-Islander |
<=50K |
10.440367 |
3 |
Asian-Pac-Islander |
>50K |
12.398551 |
4 |
Black |
<=50K |
9.261235 |
5 |
Black |
>50K |
11.077519 |
6 |
Other |
<=50K |
8.560976 |
7 |
Other |
>50K |
11.600000 |
8 |
White |
<=50K |
9.627422 |
9 |
White |
>50K |
11.612196 |
unique
컬럼에 포함된 값의 종류를 알려준다.
array(['White', 'Black', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo',
'Other'], dtype=object)
category
범주형 데이터의 경우 기본적으로 object
형으로 되어 있으나 pandas
의 category
형으로 변환하면 메모리를 절약하고 속도를 높일 수 있다.
dtype('int64')
260568
df['age'].astype('int16').memory_usage()
65202
dtype('O')
df['race'].memory_usage()
260616
race = df['race'].astype('category')
CategoricalDtype(categories=['Amer-Indian-Eskimo', 'Asian-Pac-Islander', 'Black', 'Other',
'White'],
ordered=False)
32889