bookworm-smart-assistant/skills/data-analyst-expert/references/statistics.md

2.8 KiB
Raw Blame History

统计分析和假设检验

描述性统计

def describe_column(series):
    """Return summary statistics for a numeric pandas Series.

    All statistics, including ``count``, are computed over the non-missing
    values only, so ``count`` is the number of observations that actually
    contributed to ``mean``, ``std`` and the other figures.

    Args:
        series: A numeric ``pandas.Series``.

    Returns:
        A dict with the keys ``count``, ``mean``, ``std``, ``min``, ``q25``,
        ``median``, ``q75``, ``max``, ``skew`` and ``kurtosis``.
    """
    return {
        # Series.count() excludes NaN, matching the skipna behaviour of the
        # reductions below; len(series) would silently include missing rows.
        'count': int(series.count()),
        'mean': series.mean(),
        'std': series.std(),
        'min': series.min(),
        'q25': series.quantile(0.25),
        'median': series.median(),
        'q75': series.quantile(0.75),
        'max': series.max(),
        'skew': series.skew(),
        'kurtosis': series.kurtosis(),
    }

T检验

from scipy import stats

# Independent two-sample t-test: compares the means of two groups.
# NOTE: the three calls below are alternatives. Each one rebinds `stat` and
# `p`, so running them in sequence leaves only the last result in scope.
# scipy's ttest_ind assumes equal variances by default; pass equal_var=False
# for Welch's test.
stat, p = stats.ttest_ind(group1, group2)

# Paired t-test: `before` and `after` are two related samples.
stat, p = stats.ttest_rel(before, after)

# One-sample t-test against a hypothesised population mean of 100.
stat, p = stats.ttest_1samp(sample, popmean=100)

# Interpret the result: print the p-value and label it significant ("显著")
# or not significant ("不显著") at the 0.05 level.
print(f"p={p:.4f}, {'显著' if p < 0.05 else '不显著'}")

卡方检验

# Test of independence between two categorical variables.
# Cross-tabulate the two columns into a contingency table of counts.
contingency = pd.crosstab(df['col1'], df['col2'])
# Unpacks the chi-square statistic, the p-value, the degrees of freedom and
# the table of expected frequencies.
chi2, p, dof, expected = stats.chi2_contingency(contingency)

A/B测试

from statsmodels.stats.proportion import proportions_ztest

def ab_test(ctrl_conv, ctrl_n, treat_conv, treat_n, alpha=0.05):
    """Compare treatment and control conversion rates with a two-proportion z-test.

    Args:
        ctrl_conv: Number of conversions in the control group.
        ctrl_n: Number of observations in the control group.
        treat_conv: Number of conversions in the treatment group.
        treat_n: Number of observations in the treatment group.
        alpha: Significance threshold applied to the p-value.

    Returns:
        A dict with the formatted control and treatment rates (``ctrl_rate``,
        ``treat_rate``), the relative lift of treatment over control in
        percent (``lift``), the raw ``p_value`` and a ``significant`` flag
        (``p_value < alpha``).
    """
    count = np.array([treat_conv, ctrl_conv])
    nobs = np.array([treat_n, ctrl_n])
    _stat, p = proportions_ztest(count, nobs)

    ctrl_rate = ctrl_conv / ctrl_n
    treat_rate = treat_conv / treat_n
    if ctrl_rate == 0:
        # Relative lift is undefined against a zero baseline. Report inf/nan
        # instead of raising ZeroDivisionError on plain-int inputs; this is
        # what NumPy scalar inputs would already produce.
        lift = float('inf') if treat_rate > 0 else float('nan')
    else:
        lift = (treat_rate - ctrl_rate) / ctrl_rate * 100

    return {
        'ctrl_rate': f"{ctrl_rate:.2%}",
        'treat_rate': f"{treat_rate:.2%}",
        'lift': f"{lift:.1f}%",
        'p_value': p,
        'significant': p < alpha,
    }

ANOVA

# One-way ANOVA across the three groups g1, g2 and g3.
f_stat, p = stats.f_oneway(g1, g2, g3)

# Post-hoc test: Tukey's HSD pairwise comparisons, using the observations in
# df['value'] and the group labels in df['group'].
from statsmodels.stats.multicomp import pairwise_tukeyhsd
tukey = pairwise_tukeyhsd(df['value'], df['group'])
print(tukey.summary())

相关性分析

# Pairwise Pearson correlation matrix over the numeric columns only
# (pass method='spearman' for rank correlation instead).
corr = df.select_dtypes(include=[np.number]).corr(method='pearson')

# Collect every column pair above the diagonal whose absolute correlation
# exceeds 0.7, as (column_a, column_b, correlation) tuples.
names = corr.columns
high_corr = [
    (names[a], names[b], corr.iloc[a, b])
    for a in range(len(names))
    for b in range(a + 1, len(names))
    if abs(corr.iloc[a, b]) > 0.7
]

回归分析

import statsmodels.api as sm

# Ordinary least squares regression of `target` on the `features` columns.
X = df[features]
y = df[target]
# sm.OLS does not add an intercept on its own; add_constant supplies the
# constant column for it.
X = sm.add_constant(X)

# Fit the model, print the full summary table, then R-squared on its own.
model = sm.OLS(y, X).fit()
print(model.summary())
print(f"R² = {model.rsquared:.4f}")

效应量

# Cohen's d
def cohens_d(g1, g2):
    """Return Cohen's d for two independent samples using the pooled SD.

    Accepts any 1-D numeric sequence (list, NumPy array, pandas Series).
    Missing values (NaN) are dropped before computing, so the sample sizes
    always match the values that enter the means and variances.

    Args:
        g1: Observations of the first group.
        g2: Observations of the second group.

    Returns:
        ``(mean(g1) - mean(g2)) / pooled_std`` as a float. The result is
        nan or inf when the pooled standard deviation is undefined or zero
        (for example, fewer than two observations per group or constant data).
    """
    a = np.asarray(g1, dtype=float)
    b = np.asarray(g2, dtype=float)
    a = a[~np.isnan(a)]
    b = b[~np.isnan(b)]
    n1, n2 = a.size, b.size
    # ddof=1 is required: the (n - 1) weights below assume unbiased sample
    # variances, and ndarray.var() would otherwise default to ddof=0.
    pooled_var = ((n1 - 1) * a.var(ddof=1) + (n2 - 1) * b.var(ddof=1)) / (n1 + n2 - 2)
    pooled_std = np.sqrt(pooled_var)
    return (a.mean() - b.mean()) / pooled_std
# Cohen's conventional benchmarks for |d|: about 0.2 small, 0.5 medium, 0.8 large

样本量计算

from statsmodels.stats.power import TTestIndPower

# Power analysis for an independent two-sample t-test.
analysis = TTestIndPower()
# The sample size is the one argument left unspecified, so solve_power solves
# for it given the effect size (0.5), alpha (0.05) and target power (0.8).
n = analysis.solve_power(effect_size=0.5, alpha=0.05, power=0.8)
# Round up to a whole observation; the message reads "each group needs N samples".
print(f"每组需要 {int(np.ceil(n))} 样本")