Pandas 활용 예제집

일반적인 pandas 패턴과 이에 상응하는 DataStore 연산을 정리합니다. 대부분의 코드는 수정 없이 그대로 동작합니다.

데이터 불러오기

CSV 파일 읽기

# Pandas
import pandas as pd
df = pd.read_csv("data.csv")

# DataStore - same!
from chdb import datastore as pd
df = pd.read_csv("data.csv")

여러 파일 읽기

# Pandas
import glob
dfs = [pd.read_csv(f) for f in glob.glob("data/*.csv")]
df = pd.concat(dfs)

# DataStore - more efficient with glob pattern
df = pd.read_csv("data/*.csv")

필터링

단일 조건

# Pandas and DataStore - identical
df[df['age'] > 25]
df[df['city'] == 'NYC']
df[df['name'].str.contains('John')]

다중 조건

# AND
df[(df['age'] > 25) & (df['city'] == 'NYC')]

# OR
df[(df['age'] < 18) | (df['age'] > 65)]

# NOT
df[~(df['status'] == 'inactive')]

query() 함수 사용하기

# Pandas and DataStore - identical
df.query('age > 25 and city == "NYC"')
df.query('salary > 50000')

isin()

# Pandas and DataStore - identical
df[df['city'].isin(['NYC', 'LA', 'SF'])]

between()

# Pandas and DataStore - identical
df[df['age'].between(18, 65)]

컬럼 선택

단일 컬럼

# Pandas and DataStore - identical
df['name']
df.name  # attribute access

여러 개의 컬럼

# Pandas and DataStore - identical
df[['name', 'age', 'city']]

선택 및 필터링

# Pandas and DataStore - identical
df[df['age'] > 25][['name', 'salary']]

# DataStore also supports SQL-style
df.filter(df['age'] > 25).select('name', 'salary')

정렬

단일 컬럼

# Pandas and DataStore - identical
df.sort_values('salary')
df.sort_values('salary', ascending=False)

여러 컬럼

# Pandas and DataStore - identical
df.sort_values(['city', 'salary'], ascending=[True, False])

상위/하위 N 구하기

# Pandas and DataStore - identical
df.nlargest(10, 'salary')
df.nsmallest(5, 'age')

GroupBy와 집계

간단한 GroupBy

# Pandas and DataStore - identical
df.groupby('city')['salary'].mean()
df.groupby('city')['salary'].sum()
df.groupby('city').size()  # count

여러 집계

# Pandas and DataStore - identical
df.groupby('city')['salary'].agg(['sum', 'mean', 'count'])

df.groupby('city').agg({
    'salary': ['sum', 'mean'],
    'age': ['min', 'max']
})

이름이 지정된 집계

# Pandas and DataStore - identical
df.groupby('city').agg(
    total_salary=('salary', 'sum'),
    avg_salary=('salary', 'mean'),
    employee_count=('id', 'count')
)

여러 개의 GroupBy 키

# Pandas and DataStore - identical
df.groupby(['city', 'department'])['salary'].mean()

데이터 조인

내부 조인(Inner Join)

# Pandas
pd.merge(df1, df2, on='id')

# DataStore - same API
pd.merge(df1, df2, on='id')

# DataStore also supports
df1.join(df2, on='id')

왼쪽 조인(Left Join)

# Pandas and DataStore - identical
pd.merge(df1, df2, on='id', how='left')

서로 다른 컬럼으로 조인하기

# Pandas and DataStore - identical
pd.merge(df1, df2, left_on='emp_id', right_on='id')

데이터 연결(Concat)

# Pandas and DataStore - identical
pd.concat([df1, df2, df3])
pd.concat([df1, df2], axis=1)

문자열 처리

대소문자 변환

# Pandas and DataStore - identical
df['name'].str.upper()
df['name'].str.lower()
df['name'].str.title()

부분 문자열

# Pandas and DataStore - identical
df['name'].str[:3]        # First 3 characters
df['name'].str.slice(0, 3)

검색

# Pandas and DataStore - identical
df['name'].str.contains('John')
df['name'].str.startswith('A')
df['name'].str.endswith('son')

치환

# Pandas and DataStore - identical
df['text'].str.replace('old', 'new')
df['text'].str.replace(r'\d+', '', regex=True)  # Remove digits

분리

# Pandas and DataStore - identical
df['name'].str.split(' ')
df['name'].str.split(' ', expand=True)

길이

# Pandas and DataStore - identical
df['name'].str.len()

DateTime 연산

구성 요소 추출

# Pandas and DataStore - identical
df['date'].dt.year
df['date'].dt.month
df['date'].dt.day
df['date'].dt.dayofweek
df['date'].dt.hour

서식 지정

# Pandas and DataStore - identical
df['date'].dt.strftime('%Y-%m-%d')

결측 데이터

결측값 확인

# Pandas and DataStore - identical
df['col'].isna()
df['col'].notna()
df.isna().sum()

결측값 삭제

# Pandas and DataStore - identical
df.dropna()
df.dropna(subset=['col1', 'col2'])

결측값 채우기

# Pandas and DataStore - identical
df.fillna(0)
df.fillna({'col1': 0, 'col2': 'Unknown'})
df.fillna(method='ffill')

새 컬럼 생성

간단한 할당

# Pandas and DataStore - identical
df['total'] = df['price'] * df['quantity']
df['age_group'] = df['age'] // 10 * 10

assign() 함수 사용하기

# Pandas and DataStore - identical
df = df.assign(
    total=df['price'] * df['quantity'],
    is_adult=df['age'] >= 18
)

조건부 처리 (where/mask)

# Pandas and DataStore - identical
df['status'] = df['age'].where(df['age'] >= 18, 'minor')

apply()를 활용한 사용자 정의 로직

# Works, but triggers pandas execution
df['category'] = df['amount'].apply(lambda x: 'high' if x > 1000 else 'low')

# DataStore alternative (stays lazy)
df['category'] = (
    df.when(df['amount'] > 1000, 'high')
      .otherwise('low')
)

재구조화

피벗 테이블

# Pandas and DataStore - identical
df.pivot_table(
    values='amount',
    index='region',
    columns='product',
    aggfunc='sum'
)

Melt (언피벗)

# Pandas and DataStore - identical
df.melt(
    id_vars=['name'],
    value_vars=['score1', 'score2', 'score3'],
    var_name='test',
    value_name='score'
)

explode 함수

# Pandas and DataStore - identical
df.explode('tags')  # Expand array column

윈도우 함수

롤링 윈도우

# Pandas and DataStore - identical
df['rolling_avg'] = df['price'].rolling(window=7).mean()
df['rolling_sum'] = df['amount'].rolling(window=30).sum()

Expanding 윈도우

# Pandas and DataStore - identical
df['cumsum'] = df['amount'].expanding().sum()
df['cummax'] = df['amount'].expanding().max()

Shift

# Pandas and DataStore - identical
df['prev_value'] = df['value'].shift(1)   # Lag
df['next_value'] = df['value'].shift(-1)  # Lead

Diff(차이)

# Pandas and DataStore - identical
df['change'] = df['value'].diff()
df['pct_change'] = df['value'].pct_change()

출력

CSV로 내보내기

# Pandas and DataStore - identical
df.to_csv("output.csv", index=False)

Parquet으로 출력

# Pandas and DataStore - identical
df.to_parquet("output.parquet")

pandas DataFrame으로 변환

# DataStore specific
pandas_df = ds.to_df()
pandas_df = ds.to_pandas()

DataStore 추가 기능

SQL 보기

# DataStore only
print(ds.to_sql())

실행 계획

# DataStore only
ds.explain()

ClickHouse 함수

# DataStore only - extra accessors
df['domain'] = df['url'].url.domain()
df['json_value'] = df['data'].json.get_string('key')
df['ip_valid'] = df['ip'].ip.is_ipv4_string()

유니버설 URI

# DataStore only - read from anywhere
ds = DataStore.uri("s3://bucket/data.parquet")
ds = DataStore.uri("mysql://user:pass@host/db/table")

데이터 불러오기​

CSV 파일 읽기​

여러 파일 읽기​

필터링​

단일 조건​

다중 조건​

query() 함수 사용하기​

isin()​

between()​

컬럼 선택​

단일 컬럼​

여러 개의 컬럼​

선택 및 필터링​

정렬​

단일 컬럼​

여러 컬럼​

상위/하위 N 구하기​

GroupBy와 집계​

간단한 GroupBy​

여러 집계​

이름이 지정된 집계​

여러 개의 GroupBy 키​

데이터 조인​

내부 조인(Inner Join)​

왼쪽 조인(Left Join)​

서로 다른 컬럼으로 조인하기​

데이터 연결(Concat)

문자열 처리​

대소문자 변환​

부분 문자열​

검색​

치환​

분리​

길이​

DateTime 연산​

구성 요소 추출​

서식 지정​

결측 데이터

결측값 확인​

결측값 삭제​

결측값 채우기​

새 컬럼 생성​

간단한 할당​

assign() 함수 사용하기​

조건부 처리 (where/mask)​

apply()를 활용한 사용자 정의 로직​

재구조화​

피벗 테이블​

Melt (언피벗)​

explode 함수​

윈도우 함수​

롤링 윈도우​

Expanding 윈도우​

Shift​

Diff(차이)​

출력​

CSV로 내보내기​

Parquet으로 출력​

pandas DataFrame으로 변환​

DataStore 추가 기능​

SQL 보기

실행 계획​

ClickHouse 함수​

유니버설 URI​

데이터 불러오기

CSV 파일 읽기

여러 파일 읽기

필터링

단일 조건

다중 조건

query() 함수 사용하기

isin()

between()

컬럼 선택

단일 컬럼

여러 개의 컬럼

선택 및 필터링

정렬

단일 컬럼

여러 컬럼

상위/하위 N 구하기

GroupBy와 집계

간단한 GroupBy

여러 집계

이름이 지정된 집계

여러 개의 GroupBy 키

데이터 조인

내부 조인(Inner Join)

왼쪽 조인(Left Join)

서로 다른 컬럼으로 조인하기

데이터 연결(Concat)

문자열 처리

대소문자 변환

부분 문자열

검색

치환

분리

길이

DateTime 연산

구성 요소 추출

서식 지정

결측 데이터

결측값 확인

결측값 삭제

결측값 채우기

새 컬럼 생성

간단한 할당

assign() 함수 사용하기

조건부 처리 (where/mask)

apply()를 활용한 사용자 정의 로직

재구조화

피벗 테이블

Melt (언피벗)

explode 함수

윈도우 함수

롤링 윈도우

Expanding 윈도우

Shift

Diff(차이)

출력

CSV로 내보내기

Parquet으로 출력

pandas DataFrame으로 변환

DataStore 추가 기능

SQL 보기

실행 계획

ClickHouse 함수

유니버설 URI