2.9 KiB
2.9 KiB
Pandas 数据处理指南
数据加载
# Basic loading
df = pd.read_csv('data.csv')
df = pd.read_excel('data.xlsx', sheet_name='Sheet1')

# Large-file optimisation: narrow dtypes, read only the needed columns and
# read the file in chunks of 100000 rows.
# NOTE: every column named in parse_dates must also be listed in usecols,
# otherwise read_csv raises ValueError for the missing column.
# NOTE: with chunksize set, read_csv returns an iterator of DataFrames rather
# than a DataFrame, so the chunks are combined explicitly below.
chunks = pd.read_csv(
    'large.csv',
    dtype={'id': 'int32', 'name': 'category'},
    usecols=['id', 'name', 'value', 'date'],
    parse_dates=['date'],
    chunksize=100000,
)
df = pd.concat(chunks, ignore_index=True)
# Chunks may carry different category sets, in which case concat falls back
# to object dtype; re-cast so the column stays categorical.
df['name'] = df['name'].astype('category')
数据清洗
链式操作
# Cleaning pipeline, applied in order:
#   1. keep the first row for each 'id'
#   2. drop rows with a missing 'critical_col'
#   3. parse 'date' and add a standardised copy of 'value'
#   4. keep rows with a strictly positive 'value'
#   5. renumber the index from 0
df_clean = (
    df
    .loc[lambda frame: ~frame.duplicated(subset=['id'])]
    .dropna(subset=['critical_col'])
    .assign(
        date=lambda frame: pd.to_datetime(frame['date']),
        # The mean and std are taken before the positive-value filter below.
        value_norm=lambda frame: (
            frame['value'].sub(frame['value'].mean()).div(frame['value'].std())
        ),
    )
    .loc[lambda frame: frame['value'] > 0]
    .reset_index(drop=True)
)
缺失值处理
# Inspect missing values: count per column, then percentage per column
df.isnull().sum()
df.isnull().mean() * 100

# Drop columns where more than 50% of the values are missing
df = df.drop(columns=df.columns[df.isnull().mean() > 0.5])

# Fill numeric columns with their median and object columns with their mode
num_cols = df.select_dtypes(include=[np.number]).columns
cat_cols = df.select_dtypes(include=['object']).columns
df[num_cols] = df[num_cols].fillna(df[num_cols].median())
# mode() has no rows when there are no object columns or no non-null values,
# so .iloc[0] would raise IndexError; skip the fill in that case.
cat_modes = df[cat_cols].mode()
if not cat_modes.empty:
    df[cat_cols] = df[cat_cols].fillna(cat_modes.iloc[0])
异常值处理
# IQR method
def remove_outliers_iqr(df, col, k=1.5):
    """Return the rows of *df* whose *col* value lies within the IQR fences.

    The fences are ``Q1 - k*IQR`` and ``Q3 + k*IQR``, where Q1 and Q3 are the
    25th and 75th percentiles of ``df[col]`` and ``IQR = Q3 - Q1``. Both
    fences are inclusive.

    Args:
        df: DataFrame to filter; it is not modified.
        col: Name of the numeric column to test.
        k: Fence multiplier; larger values keep more rows.

    Returns:
        A DataFrame with the rows inside the fences, keeping their original
        index labels. Rows where *col* is NaN are dropped, because NaN never
        compares as inside a range.
    """
    q1, q3 = df[col].quantile([0.25, 0.75])
    iqr = q3 - q1
    lower, upper = q1 - k * iqr, q3 + k * iqr
    # between() is inclusive on both ends by default.
    return df[df[col].between(lower, upper)]
# Z-score method: keep rows whose absolute z-score is below 3.
# NOTE(review): `col` must already hold the name of the column to filter.
from scipy import stats
# nan_policy='omit' leaves NaNs out of the mean/std. With the default policy
# a single NaN turns every z-score into NaN and the filter drops all rows.
# Rows whose own value is NaN are still dropped (NaN < 3 is False).
df = df[np.abs(stats.zscore(df[col], nan_policy='omit')) < 3]
特征工程
时间特征
# Calendar features derived from the datetime column 'date'
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day_of_week'] = df['date'].dt.weekday
# Day-of-week values 5 and 6 are treated as the weekend; stored as 0/1.
df['is_weekend'] = df['day_of_week'].ge(5).astype(int)
df['quarter'] = df['date'].dt.quarter
分箱
# Age buckets. right=False makes each bin closed on the left, [lo, hi), so the
# labels match the intervals: age 18 falls in '18-30', not '<18', and age 0
# is included. The open-ended last edge puts every age >= 60 in '60+'.
df['age_group'] = pd.cut(
    df['age'],
    bins=[0, 18, 30, 45, 60, float('inf')],
    labels=['<18', '18-30', '30-45', '45-60', '60+'],
    right=False,
)
# Quartile buckets of 'value'.
# NOTE: qcut raises ValueError when quantile edges coincide, which happens
# when 'value' has many repeated values.
df['value_q'] = pd.qcut(df['value'], q=4, labels=['Q1', 'Q2', 'Q3', 'Q4'])
编码
# Label encoding: integer code of each value's category (-1 for missing)
df['cat_code'] = pd.Categorical(df['category']).codes
# One-hot encoding: replace 'category' with indicator columns prefixed 'cat'
df = pd.get_dummies(data=df, prefix='cat', columns=['category'])
聚合分析
# Multi-dimensional aggregation: one row per (region, product) pair
summary = df.groupby(['region', 'product']).agg({
    'revenue': ['sum', 'mean'],
    'quantity': 'sum',
    'user_id': 'nunique',
})
# Flatten the two-level column index into '<column>_<aggregation>' names
summary.columns = summary.columns.map('_'.join)
# Pivot table: summed revenue by region (rows) and quarter (columns);
# margins=True adds the totals row and column.
pivot = pd.pivot_table(
    df,
    index='region',
    columns='quarter',
    values='revenue',
    aggfunc='sum',
    margins=True,
)
数据合并
# Merge: left join of df1 with df2 on the shared 'id' column
df = df1.merge(df2, how='left', on='id')
# Concat: stack the rows of both frames and renumber the index
df = pd.concat((df1, df2), ignore_index=True)
性能优化
# Memory optimisation
# Downcast integer columns to the smallest integer dtype that holds the values.
for col in df.select_dtypes(include=['int']).columns:
    df[col] = pd.to_numeric(df[col], downcast='integer')

# Convert object columns to 'category' when fewer than half of the rows are
# distinct values.
row_count = len(df)
for col in df.select_dtypes(include=['object']).columns:
    # An empty frame would make the ratio below divide by zero; skip it.
    if row_count and df[col].nunique() / row_count < 0.5:
        df[col] = df[col].astype('category')

# Vectorisation: label every row in one pass instead of a Python-level loop
df['new'] = np.where(df['value'] > 100, 'high', 'low')