엑셀 파일 열기
# Read the workbook 'census.xlsx' into the DataFrame `df` used by every cell below.
df = pd.read_excel('census.xlsx')
첫 부분 보기
|
age |
workclass |
fnlwgt |
education |
education_num |
marital_status |
occupation |
relationship |
race |
sex |
capital_gain |
capital_loss |
hours_per_week |
native_country |
income |
0 |
39 |
State-gov |
77516 |
Bachelors |
13 |
Never-married |
Adm-clerical |
Not-in-family |
White |
Male |
2174 |
0 |
40 |
United-States |
<=50K |
1 |
50 |
Self-emp-not-inc |
83311 |
Bachelors |
13 |
Married-civ-spouse |
Exec-managerial |
Husband |
White |
Male |
0 |
0 |
13 |
United-States |
<=50K |
2 |
38 |
Private |
215646 |
HS-grad |
9 |
Divorced |
Handlers-cleaners |
Not-in-family |
White |
Male |
0 |
0 |
40 |
United-States |
<=50K |
3 |
53 |
Private |
234721 |
11th |
7 |
Married-civ-spouse |
Handlers-cleaners |
Husband |
Black |
Male |
0 |
0 |
40 |
United-States |
<=50K |
4 |
28 |
Private |
338409 |
Bachelors |
13 |
Married-civ-spouse |
Prof-specialty |
Wife |
Black |
Female |
0 |
0 |
40 |
Cuba |
<=50K |
csv로 저장, 엑셀로 저장
csv 열기
컬럼 이름
Index(['age', 'workclass', 'fnlwgt', 'education', 'education_num',
'marital_status', 'occupation', 'relationship', 'race', 'sex',
'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
'income'],
dtype='object')
컬럼 선택
0 39
1 50
2 38
..
32558 58
32559 22
32560 52
Name: age, Length: 32561, dtype: int64
dtypes와 dtype
age int64
workclass object
fnlwgt int64
...
hours_per_week int64
native_country object
income object
Length: 15, dtype: object
dtype('int64')
배열로 변환
df['age'].to_numpy() # note: no `import numpy` is needed up to this point
array([39, 50, 38, ..., 58, 22, 52], dtype=int64)
통계 계산
90
90
0 1980
1 1969
2 1981
...
32558 1961
32559 1997
32560 1967
Name: age, Length: 32561, dtype: int64
여러 컬럼에 계산
# Column names for the multi-column computations; the outputs that follow show
# exactly these two columns (presumably via df[cols] -- the calls are not shown).
cols = ['age', 'education_num']
|
age |
education_num |
0 |
39 |
13 |
1 |
50 |
13 |
2 |
38 |
9 |
3 |
53 |
7 |
4 |
28 |
13 |
age 90
education_num 16
dtype: int64
age 90
education_num 16
dtype: int64
정렬
# Sort rows by 'age' (smallest first, as the output below shows) and preview the top rows.
df.sort_values('age').head()
|
age |
workclass |
fnlwgt |
education |
education_num |
marital_status |
occupation |
relationship |
race |
sex |
capital_gain |
capital_loss |
hours_per_week |
native_country |
income |
12318 |
17 |
Private |
127366 |
11th |
7 |
Never-married |
Sales |
Own-child |
White |
Female |
0 |
0 |
8 |
United-States |
<=50K |
6312 |
17 |
Private |
132755 |
11th |
7 |
Never-married |
Sales |
Own-child |
White |
Male |
0 |
0 |
15 |
United-States |
<=50K |
30927 |
17 |
Private |
108470 |
11th |
7 |
Never-married |
Other-service |
Own-child |
Black |
Male |
0 |
0 |
17 |
United-States |
<=50K |
12787 |
17 |
Local-gov |
308901 |
11th |
7 |
Never-married |
Adm-clerical |
Own-child |
White |
Female |
0 |
0 |
15 |
United-States |
<=50K |
25755 |
17 |
? |
47407 |
11th |
7 |
Never-married |
? |
Own-child |
White |
Male |
0 |
0 |
10 |
United-States |
<=50K |
df.sort_values('age', ascending=False).head() # descending order
|
age |
workclass |
fnlwgt |
education |
education_num |
marital_status |
occupation |
relationship |
race |
sex |
capital_gain |
capital_loss |
hours_per_week |
native_country |
income |
5406 |
90 |
Private |
51744 |
Masters |
14 |
Never-married |
Exec-managerial |
Not-in-family |
Black |
Male |
0 |
0 |
50 |
United-States |
>50K |
6624 |
90 |
Private |
313986 |
11th |
7 |
Married-civ-spouse |
Craft-repair |
Husband |
White |
Male |
0 |
0 |
40 |
United-States |
<=50K |
20610 |
90 |
Private |
206667 |
Masters |
14 |
Married-civ-spouse |
Prof-specialty |
Wife |
White |
Female |
0 |
0 |
40 |
United-States |
>50K |
1040 |
90 |
Private |
137018 |
HS-grad |
9 |
Never-married |
Other-service |
Not-in-family |
White |
Female |
0 |
0 |
40 |
United-States |
<=50K |
1935 |
90 |
Private |
221832 |
Bachelors |
13 |
Married-civ-spouse |
Exec-managerial |
Husband |
White |
Male |
0 |
0 |
45 |
United-States |
<=50K |
쿼리
# Keep only rows where age is below 18 and capital_gain is above 1000.
df.query('age < 18 and capital_gain > 1000')
|
age |
workclass |
fnlwgt |
education |
education_num |
marital_status |
occupation |
relationship |
race |
sex |
capital_gain |
capital_loss |
hours_per_week |
native_country |
income |
106 |
17 |
? |
304873 |
10th |
6 |
Never-married |
? |
Own-child |
White |
Female |
34095 |
0 |
32 |
United-States |
<=50K |
271 |
17 |
Private |
191260 |
9th |
5 |
Never-married |
Other-service |
Own-child |
White |
Male |
1055 |
0 |
24 |
United-States |
<=50K |
421 |
17 |
Private |
175024 |
11th |
7 |
Never-married |
Handlers-cleaners |
Own-child |
White |
Male |
2176 |
0 |
18 |
United-States |
<=50K |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
1691 |
17 |
Private |
103851 |
11th |
7 |
Never-married |
Adm-clerical |
Own-child |
White |
Female |
1055 |
0 |
20 |
United-States |
<=50K |
3605 |
17 |
Private |
130125 |
10th |
6 |
Never-married |
Other-service |
Own-child |
Amer-Indian-Eskimo |
Female |
1055 |
0 |
20 |
United-States |
<=50K |
27889 |
17 |
Private |
56536 |
11th |
7 |
Never-married |
Sales |
Own-child |
White |
Female |
1055 |
0 |
18 |
India |
<=50K |
7 rows × 15 columns
그룹
# Mean education_num for each income group.
# The aggregation is named by the string 'mean' instead of passing np.mean:
# pandas maps both to the same built-in group-wise mean, but handing agg() the
# NumPy callable is deprecated (FutureWarning since pandas 2.1) and would also
# require numpy to be imported as `np`.
df.groupby('income').agg({'education_num': 'mean'})
|
education_num |
income |
|
<=50K |
9.595065 |
>50K |
11.611657 |
상자-수염 그림
# Box plot of education_num (y axis) for each income category (x axis), drawn from df.
sns.boxplot(x="income", y="education_num", data=df)
<matplotlib.axes._subplots.AxesSubplot at 0x218f61f7978>
