엑셀 파일 열기
# Load the spreadsheet 'census.xlsx' into the DataFrame `df` used by the cells below.
# NOTE(review): `pd` is presumably `import pandas as pd`; the import is not visible in this excerpt.
df = pd.read_excel('census.xlsx')
첫 부분 보기
  
    
      |  | age | workclass | fnlwgt | education | education_num | marital_status | occupation | relationship | race | sex | capital_gain | capital_loss | hours_per_week | native_country | income | 
  
  
    
      | 0 | 39 | State-gov | 77516 | Bachelors | 13 | Never-married | Adm-clerical | Not-in-family | White | Male | 2174 | 0 | 40 | United-States | <=50K | 
    
      | 1 | 50 | Self-emp-not-inc | 83311 | Bachelors | 13 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0 | 0 | 13 | United-States | <=50K | 
    
      | 2 | 38 | Private | 215646 | HS-grad | 9 | Divorced | Handlers-cleaners | Not-in-family | White | Male | 0 | 0 | 40 | United-States | <=50K | 
    
      | 3 | 53 | Private | 234721 | 11th | 7 | Married-civ-spouse | Handlers-cleaners | Husband | Black | Male | 0 | 0 | 40 | United-States | <=50K | 
    
      | 4 | 28 | Private | 338409 | Bachelors | 13 | Married-civ-spouse | Prof-specialty | Wife | Black | Female | 0 | 0 | 40 | Cuba | <=50K | 
  
 csv로 저장, 엑셀로 저장
csv 열기 
컬럼 이름
Index(['age', 'workclass', 'fnlwgt', 'education', 'education_num',
       'marital_status', 'occupation', 'relationship', 'race', 'sex',
       'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
       'income'],
      dtype='object')
컬럼 선택
0        39
1        50
2        38
         ..
32558    58
32559    22
32560    52
Name: age, Length: 32561, dtype: int64
dtypes와 dtype
age                int64
workclass         object
fnlwgt             int64
                   ...  
hours_per_week     int64
native_country    object
income            object
Length: 15, dtype: object
dtype('int64')
배열로 변환
# Convert the 'age' column to a NumPy array.
df['age'].to_numpy()  # NOTE: no `import numpy` has been needed up to this point
array([39, 50, 38, ..., 58, 22, 52], dtype=int64)
통계 계산
90
90
0        1980
1        1969
2        1981
         ... 
32558    1961
32559    1997
32560    1967
Name: age, Length: 32561, dtype: int64
여러 컬럼에 계산
# Names of the two numeric columns handled together in this section.
# NOTE(review): presumably used as df[cols] in the following cells, whose code is not visible here.
cols = ['age', 'education_num']
  
    
      |  | age | education_num | 
  
  
    
      | 0 | 39 | 13 | 
    
      | 1 | 50 | 13 | 
    
      | 2 | 38 | 9 | 
    
      | 3 | 53 | 7 | 
    
      | 4 | 28 | 13 | 
  
 age              90
education_num    16
dtype: int64
age              90
education_num    16
dtype: int64
정렬
# Sort the rows by 'age' (sort_values defaults to ascending order) and show the first rows.
df.sort_values('age').head()
  
    
      |  | age | workclass | fnlwgt | education | education_num | marital_status | occupation | relationship | race | sex | capital_gain | capital_loss | hours_per_week | native_country | income | 
  
  
    
      | 12318 | 17 | Private | 127366 | 11th | 7 | Never-married | Sales | Own-child | White | Female | 0 | 0 | 8 | United-States | <=50K | 
    
      | 6312 | 17 | Private | 132755 | 11th | 7 | Never-married | Sales | Own-child | White | Male | 0 | 0 | 15 | United-States | <=50K | 
    
      | 30927 | 17 | Private | 108470 | 11th | 7 | Never-married | Other-service | Own-child | Black | Male | 0 | 0 | 17 | United-States | <=50K | 
    
      | 12787 | 17 | Local-gov | 308901 | 11th | 7 | Never-married | Adm-clerical | Own-child | White | Female | 0 | 0 | 15 | United-States | <=50K | 
    
      | 25755 | 17 | ? | 47407 | 11th | 7 | Never-married | ? | Own-child | White | Male | 0 | 0 | 10 | United-States | <=50K | 
  
 df.sort_values('age', ascending=False).head()  # descending order
  
    
      |  | age | workclass | fnlwgt | education | education_num | marital_status | occupation | relationship | race | sex | capital_gain | capital_loss | hours_per_week | native_country | income | 
  
  
    
      | 5406 | 90 | Private | 51744 | Masters | 14 | Never-married | Exec-managerial | Not-in-family | Black | Male | 0 | 0 | 50 | United-States | >50K | 
    
      | 6624 | 90 | Private | 313986 | 11th | 7 | Married-civ-spouse | Craft-repair | Husband | White | Male | 0 | 0 | 40 | United-States | <=50K | 
    
      | 20610 | 90 | Private | 206667 | Masters | 14 | Married-civ-spouse | Prof-specialty | Wife | White | Female | 0 | 0 | 40 | United-States | >50K | 
    
      | 1040 | 90 | Private | 137018 | HS-grad | 9 | Never-married | Other-service | Not-in-family | White | Female | 0 | 0 | 40 | United-States | <=50K | 
    
      | 1935 | 90 | Private | 221832 | Bachelors | 13 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0 | 0 | 45 | United-States | <=50K | 
  
 쿼리
# Filter rows with a query expression: age below 18 and capital_gain above 1000.
df.query('age < 18 and capital_gain > 1000')
  
    
      |  | age | workclass | fnlwgt | education | education_num | marital_status | occupation | relationship | race | sex | capital_gain | capital_loss | hours_per_week | native_country | income | 
  
  
    
      | 106 | 17 | ? | 304873 | 10th | 6 | Never-married | ? | Own-child | White | Female | 34095 | 0 | 32 | United-States | <=50K | 
    
      | 271 | 17 | Private | 191260 | 9th | 5 | Never-married | Other-service | Own-child | White | Male | 1055 | 0 | 24 | United-States | <=50K | 
    
      | 421 | 17 | Private | 175024 | 11th | 7 | Never-married | Handlers-cleaners | Own-child | White | Male | 2176 | 0 | 18 | United-States | <=50K | 
    
      | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | 
    
      | 1691 | 17 | Private | 103851 | 11th | 7 | Never-married | Adm-clerical | Own-child | White | Female | 1055 | 0 | 20 | United-States | <=50K | 
    
      | 3605 | 17 | Private | 130125 | 10th | 6 | Never-married | Other-service | Own-child | Amer-Indian-Eskimo | Female | 1055 | 0 | 20 | United-States | <=50K | 
    
      | 27889 | 17 | Private | 56536 | 11th | 7 | Never-married | Sales | Own-child | White | Female | 1055 | 0 | 18 | India | <=50K | 
  
7 rows × 15 columns
 그룹
# Mean of education_num within each income group.
# The aggregation is named with the string 'mean' instead of passing np.mean:
# pandas 2.1+ deprecates NumPy callables in agg (FutureWarning) and recommends
# the string alias, which yields the same result and does not require `np`
# to be in scope for this cell.
df.groupby('income').agg({'education_num': 'mean'})
  
    
      |  | education_num | 
    
      | income |  | 
  
  
    
      | <=50K | 9.595065 | 
    
      | >50K | 11.611657 | 
  
 상자-수염 그림
# Box plot of education_num (y axis) for each income category (x axis), drawn from df.
# NOTE(review): `sns` is presumably `import seaborn as sns`; the import is not visible in this excerpt.
sns.boxplot(x="income", y="education_num", data=df)
<matplotlib.axes._subplots.AxesSubplot at 0x218f61f7978>
