다스크 기초

import numpy as np

# Build a small 1-D integer array to demonstrate NumPy arithmetic.
x = np.array([1, 2, 3])

# Adding a scalar applies to every element, giving array([2, 3, 4]).
x + 1

import pandas as pd

# Load the census spreadsheet from the working directory into a pandas DataFrame.
df = pd.read_excel('census.xlsx')

# Preview the first five rows (the head() default).
df.head()

%%time
# Time the plain pandas mean of the 'age' column as a baseline.
df['age'].mean()

dask 설치

# Install dask via conda; -y answers the confirmation prompt automatically.
!conda install -y dask

from dask.distributed import Client

# Start a distributed client backed by four workers.
client = Client(n_workers=4)

# Display the cluster the client is attached to (a LocalCluster per the recorded output).
client.cluster

VBox(children=(HTML(value='<h2>LocalCluster</h2>'), HBox(children=(HTML(value='\n<div>\n  <style scoped>\n    …

from time import sleep

def add(x, y, *, delay=1):
    """Return ``x + y`` after pausing to imitate a slow computation.

    The pause makes the cost of each call visible when timing eager
    execution against lazily scheduled execution.

    Args:
        x: Left operand; anything that supports ``+`` with ``y``.
        y: Right operand.
        delay: Seconds to sleep before adding. Defaults to 1, the pause
            this function has always used; pass 0 to skip the wait.

    Returns:
        The result of ``x + y``.

    Raises:
        ValueError: If ``delay`` is negative (raised by ``time.sleep``).
    """
    sleep(delay)
    return x + y

%%time

# A single eager call blocks for the whole sleep inside add (recorded: 1 s).
add(1, 2)

Wall time: 1 s

%%time

# Three eager calls run one after another, so their pauses add up (recorded: 3 s).
x = add(1, 2)
y = add(3, 4)
add(x, y)

Wall time: 3 s

from dask.delayed import delayed

dadd = delayed(add)  # dadd is the delayed (lazy) version of add

# Calling it only builds a task; a Delayed object is returned instead of a sum.
dadd(1, 2)

Delayed('add-17f5f1ca-f823-4ee4-b58a-9a611ae14d93')

%%time

# compute() actually executes the task and returns its result (recorded: ~1 s).
dadd(1, 2).compute()

Wall time: 1.02 s

%%time

# x and y do not depend on each other; only the final call needs both.
# The recorded ~2 s (vs. 3 s eagerly) is consistent with x and y running concurrently.
x = dadd(1, 2)
y = dadd(3, 4)
dadd(x, y).compute()

Wall time: 2.03 s

import pandas as pd

# Reload the census spreadsheet into a pandas DataFrame.
df = pd.read_excel('census.xlsx')

%%time 

# Baseline: mean of the 'age' column computed by pandas.
df['age'].mean()

import pandas as pd

import dask.dataframe as dd

# (rows, columns) of the pandas frame; helps when choosing a chunk size.
df.shape

# Wrap the pandas frame in a dask DataFrame split into partitions of about 10,000 rows.
df2 = dd.from_pandas(df, chunksize=10000)

df2

# Without compute() this only builds a lazy result; no mean is evaluated here.
df2['age'].mean()

%%time

# compute() triggers the evaluation and returns the concrete mean.
df2['age'].mean().compute()

# Split the census rows by sex into two CSV files. Both file names end in
# "male.csv", so a single '*male.csv' glob can pick the pair up again.
# index=False keeps the default row index out of the files; written out, it
# would come back as a spurious "Unnamed: 0" column when the CSVs are re-read.
df.query('sex == "Male"').to_csv('male.csv', index=False)
df.query('sex == "Female"').to_csv('female.csv', index=False)

# The glob matches every file ending in "male.csv", which includes both
# male.csv and female.csv written above.
df3 = dd.read_csv('*male.csv')

df3

# Build the filter lazily, then compute() to materialise the matching rows.
df3.query('age < 18 and capital_gain > 1000').compute()

# Close the distributed client now that the work is done.
client.close()