数据分析基础:统计计算
导入库并创建数据
首先,我们需要导入必要的库,并创建一个简单的列表数据,后续对该列表进行相应的数据统计分析。
import numpy as np
from scipy import stats
# Sample observations analysed by the statistics examples in this tutorial.
data = [1, 2, 3, 4, 5, 10, 4, 5, 10, 4, 5]
1. 计算基本统计量
# Basic descriptive statistics of `data`.
mean = np.mean(data)  # arithmetic mean
max_value = np.max(data)  # maximum
min_value = np.min(data)  # minimum
median = np.median(data)  # median
# SciPy >= 1.11 returns a scalar mode for 1-D input, while older releases
# return a length-1 array. np.atleast_1d makes the [0] lookup valid for both.
mode = np.atleast_1d(stats.mode(data).mode)[0]  # most frequent value
variance = np.var(data)  # variance (NumPy default ddof=0)
std_dev = np.std(data)  # standard deviation (NumPy default ddof=0)
range_value = np.ptp(data)  # range: maximum minus minimum
print(f"平均值: {mean}")
print(f"最大值: {max_value}")
print(f"最小值: {min_value}")
print(f"中位数: {median}")
print(f"众数: {mode}")
print(f"方差: {variance}")
print(f"标准差: {std_dev}")
print(f"极差: {range_value}")
2. 计算分位数
# Selected percentiles of `data`.
q1 = np.percentile(data, 25)  # first quartile (25th percentile)
q3 = np.percentile(data, 75)  # third quartile (75th percentile)
percentile_90 = np.percentile(data, 90)  # 90th percentile
print(f"第一四分位数: {q1}")
print(f"第三四分位数: {q3}")
print(f"第90百分位数: {percentile_90}")
3. 计算偏度
# Skewness of `data` as computed by scipy.stats.skew with default options.
skewness = stats.skew(data)
print(f"偏度: {skewness}")
4. 计算峰度
# Kurtosis of `data` as computed by scipy.stats.kurtosis with default options.
kurtosis = stats.kurtosis(data)
print(f"峰度: {kurtosis}")
5. 计算相关系数
# Two equally long sample series for the pairwise examples.
data1 = [1, 2, 3, 4, 5]
data2 = [2, 4, 6, 8, 10]
# np.corrcoef returns a 2x2 matrix here; entry [0, 1] is the data1/data2 pair.
correlation = np.corrcoef(data1, data2)[0, 1]
print(f"相关系数: {correlation}")
6. 计算协方差
# np.cov returns a 2x2 matrix here; entry [0, 1] is the data1/data2 covariance.
covariance = np.cov(data1, data2)[0, 1]
print(f"协方差: {covariance}")
7. 计算累积和
# Running total: element k is the sum of data[0..k].
cumulative_sum = np.cumsum(data)
print(f"累积和: {cumulative_sum}")
8. 计算累积积
# Running product: element k is the product of data[0..k].
cumulative_product = np.cumprod(data)
print(f"累积积: {cumulative_product}")
9. 计算累积最大值和最小值
# Running extrema: element k is the largest / smallest value in data[0..k].
cumulative_max = np.maximum.accumulate(data)
cumulative_min = np.minimum.accumulate(data)
print(f"累积最大值: {cumulative_max}")
print(f"累积最小值: {cumulative_min}")
10. 计算累积平均值
# Running mean: the running total divided by the number of elements seen so
# far (1, 2, ..., len(data)).
cumulative_mean = np.cumsum(data) / np.arange(1, len(data) + 1)
print(f"累积平均值: {cumulative_mean}")
11. 计算累积方差
# Variance of every prefix data[:k] for k = 1..len(data). Deviations have to
# be measured from the mean of that same prefix; measuring them from the mean
# of the whole list does not yield the variance of the prefix.
cumulative_variance = np.array(
    [np.var(data[:k]) for k in range(1, len(data) + 1)]
)
print(f"累积方差: {cumulative_variance}")
12. 计算累积标准差
# Element-wise square root of the cumulative variance values.
cumulative_std_dev = np.sqrt(cumulative_variance)
print(f"累积标准差: {cumulative_std_dev}")
13. 计算移动平均
def moving_average(data, window_size):
    """Return the simple moving average of ``data``.

    Args:
        data: Sliceable sequence of numbers.
        window_size: Number of consecutive elements averaged per output value.

    Returns:
        A list with one mean per full window, i.e.
        ``len(data) - window_size + 1`` values; empty when the window is
        longer than ``data``.
    """
    averages = []
    last_start = len(data) - window_size
    for start in range(last_start + 1):
        window = data[start:start + window_size]
        averages.append(sum(window) / window_size)
    return averages
# Moving average of `data` over windows of three consecutive elements.
window_size = 3
moving_avg = moving_average(data, window_size)
print(f"移动平均: {moving_avg}")
14. 计算指数加权移动平均(EWMA)
def ewma(data, alpha):
    """Return the exponentially weighted moving average of ``data``.

    The first output equals the first observation; every later output is
    ``alpha * x + (1 - alpha) * previous_output``.

    Args:
        data: Iterable of numbers (any iterable, not only a list).
        alpha: Smoothing factor, i.e. the weight given to the newest value.

    Returns:
        A list with one smoothed value per input value; empty for empty input.
    """
    # Named `smoothed` so the accumulator does not shadow the function itself.
    smoothed = []
    for value in data:
        if smoothed:
            value = alpha * value + (1 - alpha) * smoothed[-1]
        smoothed.append(value)
    return smoothed
# Exponentially weighted moving average of `data` with smoothing factor 0.5.
alpha = 0.5
ewma_values = ewma(data, alpha)
print(f"指数加权移动平均: {ewma_values}")
15. 计算列表元素的 Z 分数(标准分数)
def z_scores(data):
    """Return the standard score of every element of ``data``.

    Each value is shifted by the mean of ``data`` and divided by its
    standard deviation (``np.std`` with its default settings).

    Args:
        data: Sequence of numbers.

    Returns:
        A list of standard scores in the same order as the input.
    """
    values = np.asarray(data)
    center = np.mean(values)
    spread = np.std(values)
    return list((values - center) / spread)
# Standard scores of every element of `data`.
z_scores_values = z_scores(data)
print(f"Z 分数: {z_scores_values}")
16. 计算列表数据的累积分布函数(CDF)
def cdf(data):
    """Return the empirical CDF levels for the sorted observations.

    The k-th smallest observation is assigned the cumulative probability
    ``k / n``. Only the probabilities are returned; pair them with
    ``sorted(data)`` to obtain (value, probability) points.

    Args:
        data: Sized collection of observations.

    Returns:
        The list ``[1/n, 2/n, ..., 1.0]`` with ``n = len(data)``; empty for
        empty input.
    """
    # The levels depend only on the rank, so neither sorting nor slicing the
    # data is needed (the slice-per-element form was quadratic).
    n = len(data)
    return [rank / n for rank in range(1, n + 1)]
# Cumulative probability levels returned by cdf() for `data`.
cdf_values = cdf(data)
print(f"累积密度函数: {cdf_values}")
17. 计算概率密度函数(PDF)
def pdf(data, bins=10):
    """Estimate a probability density with a normalised histogram.

    Args:
        data: Sequence of numbers.
        bins: Bin specification forwarded to ``np.histogram``.

    Returns:
        A ``(density, bin_edges)`` pair as produced by
        ``np.histogram(..., density=True)``.
    """
    density, edges = np.histogram(data, bins=bins, density=True)
    result = (density, edges)
    return result
# Histogram-based density estimate of `data` using the default bin count.
pdf_values, bin_edges = pdf(data)
print(f"概率密度函数: {pdf_values}")
print(f"区间边界: {bin_edges}")
18. 计算列表的排序索引
def rank_data(data):
    """Return the indices that would sort ``data`` in ascending order.

    Ties are broken by original position, so equal values keep their
    relative order.

    Args:
        data: Iterable of mutually comparable values.

    Returns:
        A list of original indices ordered by (value, index).
    """
    ordered_pairs = sorted(
        enumerate(data), key=lambda pair: (pair[1], pair[0])
    )
    return [position for position, _ in ordered_pairs]
# Sorting indices of `data` as returned by rank_data().
rank_values = rank_data(data)
print(f"排序索引: {rank_values}")
19. 计算列表的逆序对数量
def count_inversions(data):
    """Count the inversions in ``data``.

    An inversion is a pair of positions ``i < j`` with ``data[i] > data[j]``.
    Every pair is compared, so the work grows quadratically with the length.

    Args:
        data: Sliceable sequence of mutually comparable values.

    Returns:
        The number of inverted pairs.
    """
    total = 0
    for position, left in enumerate(data):
        for right in data[position + 1:]:
            if left > right:
                total += 1
    return total
# Number of inverted pairs in `data`.
inversions_count = count_inversions(data)
print(f"逆序对数量: {inversions_count}")
20. 计算列表的中位数绝对偏差(MAD)
def mad(data):
    """Return the median absolute deviation of ``data``.

    This is the median of the absolute distances between each value and the
    median of ``data``.

    Args:
        data: Sequence of numbers.

    Returns:
        The median absolute deviation as a NumPy scalar.
    """
    values = np.asarray(data)
    center = np.median(values)
    deviations = np.abs(values - center)
    return np.median(deviations)
# Median absolute deviation of `data`.
mad_value = mad(data)
print(f"中位数绝对偏差: {mad_value}")
21. 计算列表元素的二阶矩(M2)
def M2(data):
    """Return the second central moment of ``data``.

    This is the mean of the squared deviations from the arithmetic mean.

    Args:
        data: Sized sequence of numbers.

    Returns:
        The sum of squared deviations divided by ``len(data)``.
    """
    center = np.mean(data)
    squared_total = 0
    for value in data:
        squared_total += (value - center) ** 2
    return squared_total / len(data)
# Second central moment of `data`.
m2_value = M2(data)
print(f"二阶矩: {m2_value}")
22. 计算信息熵
from math import log2
def entropy(data):
    """Return the Shannon entropy (in bits) of the values in ``data``.

    Each distinct value's probability is its relative frequency in ``data``.

    Args:
        data: Iterable of hashable values (any iterable, not only a list).

    Returns:
        ``-sum(p * log2(p))`` over the distinct values; ``0.0`` for empty
        input or when all values are identical.
    """
    from collections import Counter

    # One counting pass instead of a list.count() scan per distinct value.
    counts = Counter(data)
    total = sum(counts.values())
    weighted_logs = sum(
        (count / total) * log2(count / total) for count in counts.values()
    )
    # Negating an exact zero would produce -0.0, so return a clean 0.0.
    return -weighted_logs if weighted_logs else 0.0
# Shannon entropy of the value distribution in `data`.
entropy_value = entropy(data)
print(f"信息熵: {entropy_value}")
23. 计算列表的自相关系数
import pandas as pd
def autocorrelation(data, lag=1):
    """Return the lag-``lag`` autocorrelation of ``data``.

    Args:
        data: Sequence of numbers.
        lag: Number of positions to shift the series by; defaults to 1.

    Returns:
        The value of ``pandas.Series.autocorr`` for the given lag.
    """
    return pd.Series(data).autocorr(lag=lag)
# Lag-1 autocorrelation of `data`.
autocorr_value = autocorrelation(data, lag=1)
print(f"自动相关性: {autocorr_value}")
24. 计算 Pearson 相关系数矩阵
def pearson_corr_matrix(data_list):
    """Return the Pearson correlation matrix between several data series.

    Args:
        data_list: Sequence of series (each a sequence of numbers). Entry
            ``i`` of the sequence becomes variable ``i`` of the matrix.

    Returns:
        A ``len(data_list) x len(data_list)`` DataFrame whose ``[i, j]``
        entry is the Pearson correlation between series ``i`` and ``j``.
    """
    # pd.DataFrame(list_of_lists) stores each inner list as a ROW, while
    # DataFrame.corr() correlates COLUMNS. Transpose so every series becomes
    # a column; otherwise the positions are correlated, not the series.
    frame = pd.DataFrame(data_list).T
    return frame.corr()
# Bundle the two sample series and print the matrix returned for them.
data_list = [data1, data2]
corr_matrix = pearson_corr_matrix(data_list)
print(f"Pearson 相关系数矩阵\n{corr_matrix}")
25. 计算 Jackknife 统计量
from statsmodels.stats.outliers_influence import variance_inflation_factor
def jackknife_statistics(data, statistic=np.mean):
    """Return the jackknife (leave-one-out) replicates of a statistic.

    Replicate ``i`` is ``statistic`` evaluated on ``data`` with observation
    ``i`` removed. This replaces the earlier variance-inflation-factor call,
    which is a regression collinearity diagnostic rather than a jackknife
    and indexed past the single column of its input.

    Args:
        data: Iterable of observations.
        statistic: Callable taking a list of observations and returning a
            value; defaults to ``np.mean``.

    Returns:
        A list with one replicate per observation, in input order.

    Raises:
        ValueError: If ``data`` has fewer than two observations, because a
            leave-one-out sample would then be empty.
    """
    values = list(data)
    if len(values) < 2:
        raise ValueError("jackknife requires at least two observations")
    return [
        statistic(values[:i] + values[i + 1:]) for i in range(len(values))
    ]
# Values returned by jackknife_statistics() for `data`.
jackknife_values = jackknife_statistics(data)
print(f"Jackknife 统计量: {jackknife_values}")
26. 计算列表的元素频率
def frequency_count(data):
    """Count how often each distinct value occurs in ``data``.

    Args:
        data: Iterable of hashable values.

    Returns:
        A dict mapping each value to its number of occurrences, with keys in
        first-seen order.
    """
    tally = {}
    for value in data:
        tally[value] = tally.get(value, 0) + 1
    return tally
# Occurrence count of every distinct value in `data`.
freq_dict = frequency_count(data)
print(f"元素频率: {freq_dict}")
27. 生成数据的频率分布表
def frequency_distribution(data, bins=10):
    """Bin ``data`` into a histogram of raw counts.

    Args:
        data: Sequence of numbers.
        bins: Bin specification forwarded to ``np.histogram``.

    Returns:
        A ``(counts, bin_edges)`` pair as produced by ``np.histogram``.
    """
    counts, edges = np.histogram(data, bins=bins)
    result = (counts, edges)
    return result
# Per-bin counts of `data` using the default bin count.
histogram, bin_edges = frequency_distribution(data)
print(f"频率分布: {histogram}")
print(f"区间边界: {bin_edges}")
28. 计算列表的中位数绝对偏差比率(MAD Ratio)
def mad_ratio(data):
    """Return the median absolute deviation divided by the standard deviation.

    Args:
        data: Sequence of numbers.

    Returns:
        ``median(|x - median(x)|) / np.std(x)`` as a NumPy scalar.
    """
    values = np.asarray(data)
    deviations = np.abs(values - np.median(values))
    return np.median(deviations) / np.std(values)
# Ratio of median absolute deviation to standard deviation for `data`.
mad_ratio_value = mad_ratio(data)
print(f"中位数绝对偏差比率: {mad_ratio_value}")
29. 检测列表中的线性趋势
def linear_trend(data):
    """Fit a straight line to ``data`` against its positions 0, 1, 2, ...

    Args:
        data: Sized sequence of numbers.

    Returns:
        A ``(slope, intercept, r_value)`` tuple taken from
        ``scipy.stats.linregress``.
    """
    positions = range(len(data))
    fit = stats.linregress(positions, data)
    return fit.slope, fit.intercept, fit.rvalue
# Straight-line fit of `data` against its element positions.
slope, intercept, r_value = linear_trend(data)
print(f"斜率: {slope}, 截距: {intercept}, 相关系数: {r_value}")
30. 计算列表的截尾均值(Trimmed Mean)
def trimmed_mean(data, proportion=0.1):
    """Return the mean of ``data`` after trimming both tails.

    ``int(len(data) * proportion)`` of the smallest and the same number of
    the largest values are discarded before averaging.

    Args:
        data: Iterable of numbers.
        proportion: Fraction of observations cut from EACH end; must satisfy
            ``0 <= proportion < 0.5``.

    Returns:
        The mean of the remaining values.

    Raises:
        ValueError: If ``proportion`` is outside ``[0, 0.5)``.
    """
    if not 0 <= proportion < 0.5:
        raise ValueError("proportion must be in the range [0, 0.5)")
    ordered = sorted(data)
    trim_count = int(len(ordered) * proportion)
    # Use an explicit end index: ordered[k:-k] collapses to an empty list
    # when k == 0, which made short inputs average to NaN.
    kept = ordered[trim_count:len(ordered) - trim_count]
    return np.mean(kept)
# Trimmed mean of `data` with the default trimming proportion.
trimmed_mean_value = trimmed_mean(data)
# The label names the statistic actually printed (trimmed mean); the former
# wording meant "triangular moment".
print(f"截尾均值: {trimmed_mean_value}")
评论区