2.8 KiB
2.8 KiB
统计分析和假设检验
描述性统计
def describe_column(series):
return {
'count': len(series),
'mean': series.mean(),
'std': series.std(),
'min': series.min(),
'q25': series.quantile(0.25),
'median': series.median(),
'q75': series.quantile(0.75),
'max': series.max(),
'skew': series.skew(),
'kurtosis': series.kurtosis()
}
T检验
from scipy import stats
# 独立样本T检验(两组均值比较)
stat, p = stats.ttest_ind(group1, group2)
# 配对T检验
stat, p = stats.ttest_rel(before, after)
# 单样本T检验
stat, p = stats.ttest_1samp(sample, popmean=100)
# 结果解读
print(f"p={p:.4f}, {'显著' if p < 0.05 else '不显著'}")
卡方检验
# 分类变量独立性检验
contingency = pd.crosstab(df['col1'], df['col2'])
chi2, p, dof, expected = stats.chi2_contingency(contingency)
A/B测试
from statsmodels.stats.proportion import proportions_ztest
def ab_test(ctrl_conv, ctrl_n, treat_conv, treat_n, alpha=0.05):
count = np.array([treat_conv, ctrl_conv])
nobs = np.array([treat_n, ctrl_n])
stat, p = proportions_ztest(count, nobs)
ctrl_rate = ctrl_conv / ctrl_n
treat_rate = treat_conv / treat_n
lift = (treat_rate - ctrl_rate) / ctrl_rate * 100
return {
'ctrl_rate': f"{ctrl_rate:.2%}",
'treat_rate': f"{treat_rate:.2%}",
'lift': f"{lift:.1f}%",
'p_value': p,
'significant': p < alpha
}
ANOVA
# 单因素方差分析
f_stat, p = stats.f_oneway(g1, g2, g3)
# 事后检验
from statsmodels.stats.multicomp import pairwise_tukeyhsd
tukey = pairwise_tukeyhsd(df['value'], df['group'])
print(tukey.summary())
相关性分析
# Pearson/Spearman相关
corr = df.select_dtypes(include=[np.number]).corr(method='pearson')
# 找高相关对
high_corr = []
for i in range(len(corr.columns)):
for j in range(i+1, len(corr.columns)):
if abs(corr.iloc[i,j]) > 0.7:
high_corr.append((corr.columns[i], corr.columns[j], corr.iloc[i,j]))
回归分析
import statsmodels.api as sm
X = df[features]
y = df[target]
X = sm.add_constant(X)
model = sm.OLS(y, X).fit()
print(model.summary())
print(f"R² = {model.rsquared:.4f}")
效应量
# Cohen's d
def cohens_d(g1, g2):
n1, n2 = len(g1), len(g2)
pooled_std = np.sqrt(((n1-1)*g1.var() + (n2-1)*g2.var()) / (n1+n2-2))
return (g1.mean() - g2.mean()) / pooled_std
# |d| < 0.2 小, 0.2-0.8 中, > 0.8 大
样本量计算
from statsmodels.stats.power import TTestIndPower
analysis = TTestIndPower()
n = analysis.solve_power(effect_size=0.5, alpha=0.05, power=0.8)
print(f"每组需要 {int(np.ceil(n))} 样本")