130 lines
2.8 KiB
Markdown
130 lines
2.8 KiB
Markdown
# 统计分析和假设检验
|
||
|
||
## 描述性统计
|
||
|
||
```python
|
||
def describe_column(series):
    """Summarise one numeric column as a dict of descriptive statistics.

    Args:
        series: A pandas-Series-like object exposing ``mean``, ``std``,
            ``min``, ``max``, ``median``, ``quantile``, ``skew`` and
            ``kurtosis``.

    Returns:
        A dict with the keys ``count``, ``mean``, ``std``, ``min``, ``q25``,
        ``median``, ``q75``, ``max``, ``skew`` and ``kurtosis``, in that
        order.  ``count`` is ``len(series)``, i.e. the total number of
        entries in the column.
    """
    lower_quartile = series.quantile(0.25)
    upper_quartile = series.quantile(0.75)

    summary = dict(
        count=len(series),
        mean=series.mean(),
        std=series.std(),
        min=series.min(),
        q25=lower_quartile,
        median=series.median(),
        q75=upper_quartile,
        max=series.max(),
        skew=series.skew(),
        kurtosis=series.kurtosis(),
    )
    return summary
|
||
```
|
||
|
||
## T检验
|
||
|
||
```python
|
||
from scipy import stats
|
||
|
||
# Independent two-sample t-test: compares the means of two unrelated groups.
# NOTE: SciPy assumes equal variances by default; pass equal_var=False for
# Welch's t-test when that assumption is doubtful.
stat, p = stats.ttest_ind(group1, group2)

# Paired t-test: the same subjects measured twice (before vs. after).
stat, p = stats.ttest_rel(before, after)

# One-sample t-test: compares the sample mean with a hypothesised mean.
stat, p = stats.ttest_1samp(sample, popmean=100)

# Interpret the most recently computed p-value at the 5% level.
verdict = '显著' if p < 0.05 else '不显著'
print(f"p={p:.4f}, {verdict}")
|
||
```
|
||
|
||
## 卡方检验
|
||
|
||
```python
|
||
# Test of independence between two categorical variables: cross-tabulate
# the two columns, then run the chi-square test on the contingency table.
contingency = pd.crosstab(index=df['col1'], columns=df['col2'])
chi2, p, dof, expected = stats.chi2_contingency(contingency)
|
||
```
|
||
|
||
## A/B测试
|
||
|
||
```python
|
||
from statsmodels.stats.proportion import proportions_ztest
|
||
|
||
def ab_test(ctrl_conv, ctrl_n, treat_conv, treat_n, alpha=0.05):
    """Compare control and treatment conversion rates with a two-proportion z-test.

    Args:
        ctrl_conv: Number of conversions in the control group.
        ctrl_n: Number of observations in the control group.
        treat_conv: Number of conversions in the treatment group.
        treat_n: Number of observations in the treatment group.
        alpha: Threshold the p-value is compared against.

    Returns:
        A dict with:
        ``ctrl_rate`` / ``treat_rate`` - conversion rates formatted as
        percentages with two decimals;
        ``lift`` - relative change of the treatment rate over the control
        rate, in percent, formatted with one decimal (``"inf%"`` or
        ``"nan%"`` when the control rate is zero);
        ``p_value`` - p-value returned by ``proportions_ztest``;
        ``significant`` - whether ``p_value < alpha``.

    Raises:
        ValueError: If either sample size is not positive.
    """
    # Fail early with a clear message instead of a ZeroDivisionError later on.
    for label, size in (('ctrl_n', ctrl_n), ('treat_n', treat_n)):
        if size <= 0:
            raise ValueError(f"{label} must be positive, got {size!r}")

    count = np.array([treat_conv, ctrl_conv])
    nobs = np.array([treat_n, ctrl_n])
    _, p = proportions_ztest(count, nobs)

    ctrl_rate = ctrl_conv / ctrl_n
    treat_rate = treat_conv / treat_n

    if ctrl_rate == 0:
        # Relative lift has no finite value against a zero baseline.
        lift = float('inf') if treat_rate > 0 else float('nan')
    else:
        lift = (treat_rate - ctrl_rate) / ctrl_rate * 100

    return {
        'ctrl_rate': f"{ctrl_rate:.2%}",
        'treat_rate': f"{treat_rate:.2%}",
        'lift': f"{lift:.1f}%",
        'p_value': p,
        'significant': p < alpha,
    }
|
||
```
|
||
|
||
## ANOVA
|
||
|
||
```python
|
||
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# One-way ANOVA: tests whether the group means are all equal.
samples = (g1, g2, g3)
f_stat, p = stats.f_oneway(*samples)

# Post-hoc test: Tukey's HSD compares every pair of groups.
tukey = pairwise_tukeyhsd(endog=df['value'], groups=df['group'])
print(tukey.summary())
|
||
```
|
||
|
||
## 相关性分析
|
||
|
||
```python
|
||
# Pairwise Pearson correlation of the numeric columns
# (use method='spearman' for rank correlation instead).
numeric_df = df.select_dtypes(include=[np.number])
corr = numeric_df.corr(method='pearson')

# Collect every column pair above the diagonal whose absolute correlation
# exceeds 0.7, as (column_a, column_b, correlation) tuples.
names = corr.columns
n_cols = len(names)
high_corr = [
    (names[row], names[col], corr.iloc[row, col])
    for row in range(n_cols)
    for col in range(row + 1, n_cols)
    if abs(corr.iloc[row, col]) > 0.7
]
|
||
```
|
||
|
||
## 回归分析
|
||
|
||
```python
|
||
import statsmodels.api as sm
|
||
|
||
# Design matrix: the selected feature columns plus an explicit constant
# column, which sm.OLS needs in order to fit an intercept.
X = sm.add_constant(df[features])
y = df[target]

# Ordinary least squares fit, followed by the full report and R².
model = sm.OLS(endog=y, exog=X).fit()
print(model.summary())
r_squared = model.rsquared
print(f"R² = {r_squared:.4f}")
|
||
```
|
||
|
||
## 效应量
|
||
|
||
```python
|
||
# Cohen's d
|
||
def _valid_values(sample):
    """Return *sample* as a flat float array with NaN entries removed."""
    values = np.asarray(sample, dtype=float).ravel()
    return values[~np.isnan(values)]


def cohens_d(g1, g2):
    """Return Cohen's d for two independent samples.

    The effect size is the difference of the group means divided by the
    pooled sample standard deviation (n - 1 denominator).  The pooled
    variance is built from sums of squared deviations, so the result is the
    same for lists, NumPy arrays and pandas Series.  NaN entries are dropped
    before anything is counted or averaged.

    Args:
        g1: First sample (array-like of numbers).
        g2: Second sample (array-like of numbers).

    Returns:
        ``(mean(g1) - mean(g2)) / pooled_std``.  NaN when a group has no
        valid observations or the two groups together have fewer than three.
    """
    a = _valid_values(g1)
    b = _valid_values(g2)
    n1, n2 = len(a), len(b)

    dof = n1 + n2 - 2
    if n1 == 0 or n2 == 0 or dof <= 0:
        # The pooled standard deviation is undefined for such small samples.
        return float('nan')

    mean1, mean2 = a.mean(), b.mean()
    # (n - 1) * sample variance equals the sum of squared deviations.
    ss1 = ((a - mean1) ** 2).sum()
    ss2 = ((b - mean2) ** 2).sum()
    pooled_std = np.sqrt((ss1 + ss2) / dof)

    # Zero pooled spread yields inf/NaN; suppress the NumPy warning for it.
    with np.errstate(divide='ignore', invalid='ignore'):
        return (mean1 - mean2) / pooled_std
# Common rule of thumb (Cohen): |d| of about 0.2 is small, 0.5 medium, 0.8 large.
|
||
```
|
||
|
||
## 样本量计算
|
||
|
||
```python
|
||
from statsmodels.stats.power import TTestIndPower
|
||
|
||
# Solve for the sample size of an independent two-sample t-test given a
# standardised effect size of 0.5, a 5% significance level and 80% power.
analysis = TTestIndPower()
n = analysis.solve_power(effect_size=0.5, alpha=0.05, power=0.8)

# Round up: a fractional observation cannot be collected.
per_group = int(np.ceil(n))
print(f"每组需要 {per_group} 样本")
|
||
```
|