bookworm-smart-assistant/skills/data-analyst-expert/references/pandas-guide.md

2.9 KiB

Pandas 数据处理指南

数据加载

# Basic loading
df = pd.read_csv('data.csv')
df = pd.read_excel('data.xlsx', sheet_name='Sheet1')

# Large-file optimization
# - Narrow dtypes and `usecols` reduce memory use. Every `parse_dates` column
#   must also be listed in `usecols`, otherwise read_csv raises ValueError.
# - With `chunksize`, read_csv returns an iterator of DataFrames instead of a
#   single DataFrame, so the chunks have to be consumed explicitly.
with pd.read_csv(
    'large.csv',
    dtype={'id': 'int32', 'name': 'category'},
    usecols=['id', 'name', 'value', 'date'],
    parse_dates=['date'],
    chunksize=100000,
) as reader:
    # Filter or aggregate each chunk inside the generator to keep peak memory
    # low. NOTE: concatenating categorical chunks whose category sets differ
    # falls back to object dtype.
    df = pd.concat((chunk for chunk in reader), ignore_index=True)

数据清洗

链式操作

# Clean the frame in a single method chain.
df_clean = (
    df
    .drop_duplicates(subset=['id'])
    .dropna(subset=['critical_col'])
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    # Standardize `value`; this runs before the positivity filter below, so
    # the mean and std still include the rows that the filter removes.
    .assign(
        value_norm=lambda frame: (
            (frame['value'] - frame['value'].mean()) / frame['value'].std()
        )
    )
    .query('value > 0')
    .reset_index(drop=True)
)

缺失值处理

# Inspect missing values
df.isnull().sum()          # missing count per column
df.isnull().mean() * 100   # missing percentage per column

# Drop columns with more than 50% missing values
df = df.drop(columns=df.columns[df.isnull().mean() > 0.5])

# Fill: median for numeric columns, most frequent value for object columns
num_cols = df.select_dtypes(include=[np.number]).columns
cat_cols = df.select_dtypes(include=['object']).columns
df[num_cols] = df[num_cols].fillna(df[num_cols].median())
# mode() yields an empty frame when there are no object columns or when every
# value is missing; `.iloc[0]` would then raise IndexError, so guard it.
cat_modes = df[cat_cols].mode()
if not cat_modes.empty:
    df[cat_cols] = df[cat_cols].fillna(cat_modes.iloc[0])

异常值处理

# IQR method
def remove_outliers_iqr(df, col, k=1.5):
    """Return the rows of ``df`` whose ``col`` value lies within the IQR fences.

    The fences are ``Q1 - k*IQR`` and ``Q3 + k*IQR``, both inclusive, where
    Q1 and Q3 are the 0.25 and 0.75 quantiles of ``df[col]``. Rows whose
    value is missing fail the comparison and are therefore excluded.
    """
    values = df[col]
    lower_q = values.quantile(0.25)
    upper_q = values.quantile(0.75)
    spread = upper_q - lower_q
    within_fences = values.between(lower_q - k * spread, upper_q + k * spread)
    return df.loc[within_fences]

# Z-score method
from scipy import stats

# The column to screen must be named explicitly; 'value' is used here as the
# example column.
col = 'value'
# nan_policy='omit' ignores missing values when computing the mean and std.
# With the default 'propagate', a single NaN turns every z-score into NaN and
# the filter would drop all rows. Rows whose own value is NaN are still
# removed because NaN < 3 is False.
z_scores = stats.zscore(df[col], nan_policy='omit')
df = df[np.abs(z_scores) < 3]

特征工程

时间特征

# Calendar features derived from the datetime column `date`.
date_parts = df['date'].dt
df['year'] = date_parts.year
df['month'] = date_parts.month
df['day_of_week'] = date_parts.dayofweek
# Day-of-week values 5 and 6 are flagged as weekend (1), everything else 0.
df['is_weekend'] = df['day_of_week'].between(5, 6).astype(int)
df['quarter'] = date_parts.quarter

分箱

# Fixed age bands. right=False makes every bin left-closed, [low, high), so
# the labels agree with the edges: 0 -> '<18', 18 -> '18-30', 60 -> '60+'.
# The open-ended last edge keeps ages above 100 in '60+' instead of NaN.
df['age_group'] = pd.cut(
    df['age'],
    bins=[0, 18, 30, 45, 60, float('inf')],
    labels=['<18', '18-30', '30-45', '45-60', '60+'],
    right=False,
)
# Quartile bins. NOTE: qcut raises ValueError when quantile edges coincide,
# which happens with heavily repeated values.
df['value_q'] = pd.qcut(df['value'], q=4, labels=['Q1', 'Q2', 'Q3', 'Q4'])

编码

# Label encoding: one integer code per category
as_categorical = df['category'].astype('category')
df['cat_code'] = as_categorical.cat.codes

# One-hot encoding of `category`, with new columns prefixed by 'cat'
df = pd.get_dummies(data=df, columns=['category'], prefix='cat')

聚合分析

# Multi-dimensional aggregation per (region, product)
grouped = df.groupby(['region', 'product'])
summary = grouped.agg({
    'revenue': ['sum', 'mean'],
    'quantity': 'sum',
    'user_id': 'nunique',
})
# Flatten the two-level column labels, e.g. ('revenue', 'sum') -> 'revenue_sum'
summary.columns = [f'{column}_{stat}' for column, stat in summary.columns]

# Pivot table: revenue summed by region (rows) and quarter (columns),
# with margins enabled
pivot = pd.pivot_table(
    df,
    values='revenue',
    index='region',
    columns='quarter',
    aggfunc='sum',
    margins=True,
)

数据合并

# Merge: left join of df1 with df2 on the `id` column
df = df1.merge(df2, on='id', how='left')

# Concat: stack the rows of both frames and renumber the index
df = pd.concat(objs=[df1, df2], ignore_index=True)

性能优化

# Memory optimization
# Downcast integer columns to the smallest integer dtype that holds the values.
for col in df.select_dtypes(include=['int']).columns:
    df[col] = pd.to_numeric(df[col], downcast='integer')

# Convert object columns to category only when fewer than half of the rows
# are distinct. The row_count guard avoids ZeroDivisionError on an empty frame.
row_count = len(df)
for col in df.select_dtypes(include=['object']).columns:
    if row_count and df[col].nunique() / row_count < 0.5:
        df[col] = df[col].astype('category')

# Vectorization: one array-wide conditional instead of a Python-level loop.
# Rows where the comparison is False (including missing values) get 'low'.
df['new'] = np.where(df['value'] > 100, 'high', 'low')