[python-ds] 판다스 기초 :: 마인드스케일

import pandas as pd

엑셀 파일 열기

df = pd.read_excel('census.xlsx')

첫 부분 보기

df.head()

	age	workclass	fnlwgt	education	education_num	marital_status	occupation	relationship	race	sex	capital_gain	capital_loss	hours_per_week	native_country	income
0	39	State-gov	77516	Bachelors	13	Never-married	Adm-clerical	Not-in-family	White	Male	2174	0	40	United-States	<=50K
1	50	Self-emp-not-inc	83311	Bachelors	13	Married-civ-spouse	Exec-managerial	Husband	White	Male	0	0	13	United-States	<=50K
2	38	Private	215646	HS-grad	9	Divorced	Handlers-cleaners	Not-in-family	White	Male	0	0	40	United-States	<=50K
3	53	Private	234721	11th	7	Married-civ-spouse	Handlers-cleaners	Husband	Black	Male	0	0	40	United-States	<=50K
4	28	Private	338409	Bachelors	13	Married-civ-spouse	Prof-specialty	Wife	Black	Female	0	0	40	Cuba	<=50K

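CSV로 저장, 엑셀로 저장, CSV 열기

df.to_csv('census.csv', index=False)  # CSV로 저장 (인덱스 제외)

df.to_excel('census_copy.xlsx', index=False)  # 엑셀로 저장 (인덱스 제외)

pd.read_csv('census.csv')  # CSV 열기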

컬럼 이름

df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education_num',
       'marital_status', 'occupation', 'relationship', 'race', 'sex',
       'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
       'income'],
      dtype='object')

컬럼 선택

df['age']

0        39
1        50
2        38
         ..
32558    58
32559    22
32560    52
Name: age, Length: 32561, dtype: int64

dtypes와 dtype

df.dtypes

age                int64
workclass         object
fnlwgt             int64
                   ...
hours_per_week     int64
native_country    object
income            object
Length: 15, dtype: object

df['age'].dtype

dtype('int64')

배열로 변환

df['age'].to_numpy()  # 주의: 여기까지는 import numpy 필요 없음

array([39, 50, 38, ..., 58, 22, 52], dtype=int64)

통계 계산

import numpy as np

np.max(df['age'])

df['age'].max()

2019 - df['age']

0        1980
1        1969
2        1981
         ...
32558    1961
32559    1997
32560    1967
Name: age, Length: 32561, dtype: int64

여러 컬럼에 계산

cols = ['age', 'education_num']

df[cols].head()

	age	education_num
0	39	13
1	50	13
2	38	9
3	53	7
4	28	13

df[cols].max()

age              90
education_num    16
dtype: int64

df[cols].agg(np.max)

age              90
education_num    16
dtype: int64

정렬

df.sort_values('age').head()

	age	workclass	fnlwgt	education	education_num	marital_status	occupation	relationship	race	sex	hours_per_week	native_country	income
12318	17	Private	127366	11th	7	Never-married	Sales	Own-child	White	Female	8	United-States	<=50K
6312	17	Private	132755	11th	7	Never-married	Sales	Own-child	White	Male	15	United-States	<=50K
30927	17	Private	108470	11th	7	Never-married	Other-service	Own-child	Black	Male	17	United-States	<=50K
12787	17	Local-gov	308901	11th	7	Never-married	Adm-clerical	Own-child	White	Female	15	United-States	<=50K
25755	17	?	47407	11th	7	Never-married	?	Own-child	White	Male	10	United-States	<=50K

df.sort_values('age', ascending=False).head()  # 내림차순

	age	workclass	fnlwgt	education	education_num	marital_status	occupation	relationship	race	sex	hours_per_week	native_country	income
5406	90	Private	51744	Masters	14	Never-married	Exec-managerial	Not-in-family	Black	Male	50	United-States	>50K
6624	90	Private	313986	11th	7	Married-civ-spouse	Craft-repair	Husband	White	Male	40	United-States	<=50K
20610	90	Private	206667	Masters	14	Married-civ-spouse	Prof-specialty	Wife	White	Female	40	United-States	>50K
1040	90	Private	137018	HS-grad	9	Never-married	Other-service	Not-in-family	White	Female	40	United-States	<=50K
1935	90	Private	221832	Bachelors	13	Married-civ-spouse	Exec-managerial	Husband	White	Male	45	United-States	<=50K

쿼리

df.query('age < 18 and capital_gain > 1000')

	age	workclass	fnlwgt	education	education_num	marital_status	occupation	relationship	race	sex	capital_gain	capital_loss	hours_per_week	native_country	income
106	17	?	304873	10th	6	Never-married	?	Own-child	White	Female	34095	0	32	United-States	<=50K
271	17	Private	191260	9th	5	Never-married	Other-service	Own-child	White	Male	1055	0	24	United-States	<=50K
421	17	Private	175024	11th	7	Never-married	Handlers-cleaners	Own-child	White	Male	2176	0	18	United-States	<=50K
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1691	17	Private	103851	11th	7	Never-married	Adm-clerical	Own-child	White	Female	1055	0	20	United-States	<=50K
3605	17	Private	130125	10th	6	Never-married	Other-service	Own-child	Amer-Indian-Eskimo	Female	1055	0	20	United-States	<=50K
27889	17	Private	56536	11th	7	Never-married	Sales	Own-child	White	Female	1055	0	18	India	<=50K

7 rows × 15 columns

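그룹

참고: pandas 2.1 이상에서는 agg에 np.mean 같은 NumPy 함수를 넘기면 FutureWarning이 발생한다. 같은 결과를 얻으려면 문자열 이름을 넘기는 방식, 즉 agg({'education_num': 'mean'})을 권장한다.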

df.groupby('income').agg({'education_num': np.mean})

	education_num
income
<=50K	9.595065
>50K	11.611657

상자-수염 그림

import seaborn as sns

sns.boxplot(x="income", y="education_num", data=df)

<matplotlib.axes._subplots.AxesSubplot at 0x218f61f7978>