# -*- coding: utf-8 -*-
__author__ = 'gerry'
'''
@time 10 Nov 2017
@author gerry
特征之间的相关性
期望:衡量样本某个特征列取值范围的平均值
方差:衡量样本某个特征列取值范围的离散程度
协方差矩阵和相关系数:衡量样本特征列之间线性相关性
'''
from numpy import *
import sys
import os
import cPickle as pickle
import matplotlib.pyplot as plt
# 1. Correlation coefficient and correlation distance
#   * Correlation coefficient: rho_xy = Cov(X, Y) / (sqrt(D(X)) * sqrt(D(Y))).
#     It measures how strongly two features are linearly related. Its value
#     lies in [-1, 1]; the larger the absolute value, the stronger the
#     correlation between X and Y.
#   * Correlation distance: D_xy = 1 - rho_xy
#
# Each ROW of featureMat holds one feature with 17 observations.
# asmatrix() is the function the `mat` alias pointed to; the alias itself was
# removed in NumPy 2.0, so the full name is used here.
featureMat = asmatrix([
    [88.5, 96.8, 104.1, 111.3, 117.7, 124.0, 130.0, 135.4, 140.2, 145.3,
     151.9, 159.5, 165.9, 169.8, 171.6, 172.3, 172.7],
    [12.54, 14.65, 16.64, 18.98, 21.26, 24.06, 27.33, 30.47, 33.74, 37.69,
     42.49, 48.08, 53.27, 57.08, 59.35, 60.68, 61.40],
])
print(shape(featureMat))
# Mean of each feature (row 0 and row 1).
mv1 = mean(featureMat[0])
mv2 = mean(featureMat[1])
# Standard deviation of each feature (NumPy's default, ddof=0).
dv1 = std(featureMat[0])
dv2 = std(featureMat[1])
# Mean of the element-wise product of the centered rows (the covariance),
# divided by the product of the two standard deviations.
corref = mean(multiply(featureMat[0] - mv1, featureMat[1] - mv2)) / (dv1 * dv2)
print(corref)
# NumPy's correlation-coefficient matrix for the same data.
print(corrcoef(featureMat))
# 2. Mahalanobis distance
# Given M sample vectors X1..Xm with covariance matrix S and mean vector mu,
# the distance from a sample vector X to mu is
#     D(X) = sqrt((X - mu)' S^(-1) (X - mu))
# and the Mahalanobis distance between two samples Xi and Xj is
#     D(Xi, Xj) = sqrt((Xi - Xj)' S^(-1) (Xi - Xj))
# When S is the identity matrix the formula reduces to the Euclidean distance
#     D(Xi, Xj) = sqrt((Xi - Xj)' (Xi - Xj))
# and when S is diagonal it becomes the standardized Euclidean distance.
# Advantages: independent of measurement units, and not disturbed by
# correlation between the variables.
print("===================求马氏距离====================================")
# Difference between the first two samples (rows of the transposed matrix).
tp = featureMat.T[0] - featureMat.T[1]
# Covariance matrix of the rows of featureMat, then its inverse.
a = cov(featureMat)
covinv = linalg.inv(a)
# tp is a matrix, so `*` is matrix multiplication: tp * S^(-1) * tp'.
distma = sqrt(tp * covinv * tp.T)
print(distma)
print("==========================矩阵的特征值和特征向量===================================")
# Eigen-decomposition of a 3x3 matrix. linalg.eig returns the eigenvalues and
# a matrix whose column i is the eigenvector belonging to evals[i].
A = [[8, 1, 6], [3, 5, 7], [4, 9, 2]]
evals, evecs = linalg.eig(A)
# One pre-formatted argument keeps the printed text the same whether print is
# a statement (Python 2) or a function (Python 3).
print("特征值: %s \n特征向量: %s" % (evals, evecs))
print("===========================数据的归一化=======================================================")
# Normalization simplifies computation by turning a quantity that carries
# units into a dimensionless one. Two common forms: scaling values into the
# range (0, 1), and removing the units of an expression.
#     X* = (X - M) / S
# standardized value = (raw value - mean of the component)
#                      / standard deviation of the component
#
# Standardized Euclidean distance
vectormat = asmatrix([[1, 2, 3], [4, 5, 6]])  # one sample per row
print(mean(vectormat))
# Plain Euclidean distance between the two samples.
v12 = vectormat[0] - vectormat[1]
print(sqrt(v12 * v12.T))
# Standardize every component (column) with that component's own mean and
# standard deviation, as the formula above requires. Taking the statistics
# along axis 0 of the sample-per-row matrix gives one value per component.
# The name `varmat` is kept for compatibility; it holds standard deviations.
varmat = std(vectormat, axis=0)
print(varmat)
normvmat = (vectormat - mean(vectormat, axis=0)) / varmat
normv12 = normvmat[0] - normvmat[1]
print(sqrt(normv12 * normv12.T))
print("====================数据的导入和内存管理===============================")
# Configure a UTF-8 default encoding.
# This reload()/setdefaultencoding() pair is a Python 2 idiom: Python 3 has no
# builtin reload() and no sys.setdefaultencoding(), and already uses UTF-8 as
# its default encoding, so the calls are limited to Python 2.
if sys.version_info[0] == 2:
    reload(sys)
    sys.setdefaultencoding('utf-8')
def file2matrix(path, delimiter):
    """Load a delimited text file into a NumPy matrix.

    Every non-blank line becomes one matrix row; the line is split on
    ``delimiter`` and each field is evaluated as a Python expression.

    Args:
        path: Path of the data file.
        delimiter: Separator between the fields of one line.

    Returns:
        A ``numpy.matrix`` with one row per non-blank line of the file.
    """
    # Text mode keeps the lines as ``str`` so they can be split with a ``str``
    # delimiter; ``with`` closes the file even if reading fails.
    with open(path, "r") as fp:
        rowlist = fp.read().splitlines()
    # NOTE(security): eval() executes whatever expression a field contains.
    # Only use this on trusted data files; for untrusted input parse the
    # fields with float()/int() or ast.literal_eval() instead.
    # A nested list (not map()) is built so every row is a real list.
    recordlist = [
        [eval(field) for field in row.split(delimiter)]
        for row in rowlist
        if row.strip()  # skip blank / whitespace-only lines
    ]
    # asmatrix is what the `mat` alias referred to (alias removed in NumPy 2.0).
    return asmatrix(recordlist)
root = "testdata"  # directory that holds the data files
# Single location for the pickled matrix, used for both writing and reading.
# (Previously the write path lacked the "/" separator, so the dump went to
# "testdatarecordmat.dat" while the load looked in "testdata/recordmat.dat".)
dumppath = os.path.join(root, "recordmat.dat")
pathlist = os.listdir(root)  # every entry in the data directory
# NOTE(review): the source's indentation was lost; the loop body is assumed to
# be the file-to-matrix conversion and the shape print only.
for path in pathlist:
    if path == "recordmat.dat":
        # The pickle written below lives in the same directory; it is not a
        # text data file and must not be parsed on a later run.
        continue
    recordmat = file2matrix(os.path.join(root, path), "\t")  # file -> matrix
    print(shape(recordmat))  # rows and columns of the parsed matrix
# Object persistence: store the first row of the last matrix that was read,
# then load it back.
with open(dumppath, "wb") as file_obj:
    pickle.dump(recordmat[0], file_obj)
with open(dumppath, "rb") as read_obj:
    readmat = pickle.load(read_obj)
print(shape(readmat))
def readfilelines(path, nmax=0):
    """Lazily yield the lines of a file, optionally only the first ``nmax``.

    The file is opened in binary mode, so each yielded line keeps its line
    terminator (and is a ``bytes`` object on Python 3).

    Args:
        path: Path of the file to read.
        nmax: Maximum number of lines to yield; ``0`` means read to the end
            of the file. A negative value yields nothing.

    Yields:
        The lines of the file, one at a time, in order.
    """
    # ``with`` closes the file when the generator finishes AND when the
    # consumer stops early (generator closed or garbage-collected).
    with open(path, "rb") as fp:
        # Iterating the file ends at EOF by itself, so no comparison against
        # an empty-string sentinel is needed.
        for ncount, content in enumerate(fp):
            if nmax != 0 and ncount >= nmax:
                break
            yield content
# Preview a data file: print its first 10 lines with the surrounding
# whitespace stripped.
path = "testdata/01.txt"  # location of the data file
for line in readfilelines(path, 10):
    print(line.strip())
print("======================表与线性结构的可视化=============================")
# Curve data with added noise
x = linspace(-5, 5, 200)
y = sin(x)  # base relation between y and x
# Noisy point set: random.rand(1, N) makes yn a 1 x N array, scaled by 1.5.
yn = y + 1.5 * random.rand(1, len(y))
# Plot: blue dots for the noisy points, red line for the sine curve shifted
# up by 0.75.
fig = plt.figure()
ax = fig.add_subplot(111)
ax.scatter(x, yn, c="blue", marker="o")
ax.plot(x, y + 0.75, 'r')
plt.show()
print("=================表与线性结构的可视化============================")
# Same noisy-sine demonstration again: 200 evenly spaced x values, their
# sine, and a copy of the sine with random noise added.
x = linspace(-5, 5, num=200)
y = sin(x)
yn = y + random.rand(1, y.size) * 1.5  # 1 x N array of noisy points
# Draw the noisy points as a scatter plot and the shifted sine as a red line.
fig = plt.figure()
ax = fig.add_subplot(111)
ax.scatter(x, yn, marker="o", c="blue")
ax.plot(x, 0.75 + y, 'r')
plt.show()
print("=================图与网络结构的可视化============================")
# Node coordinates, one (x, y) point per row.
# asmatrix is what the `mat` alias referred to (alias removed in NumPy 2.0).
dist = asmatrix([[0.1, 0.1], [0.9, 0.5], [0.9, 1], [0.45, 0.9],
                 [0.9, 0.8], [0.7, 0.9], [0.1, 0.45], [0.45, 0.1]])
m, n = shape(dist)
# Transpose once: the first row holds all x values, the second all y values.
xcoords, ycoords = dist.T.tolist()
fig = plt.figure()  # plotting
ax = fig.add_subplot(111)
ax.scatter(xcoords, ycoords, c='blue', marker='o')
# Label every node with its own coordinates.
for point in dist.tolist():
    plt.annotate("(%s,%s)" % (point[0], point[1]), xy=(point[0], point[1]))
# Single-element rows give (N, 1)-shaped data, i.e. one line through all
# nodes in row order.
# NOTE(review): the source's indentation was lost; the plot call is assumed
# to come after the coordinate lists are complete.
xlist = [[px] for px in xcoords]
ylist = [[py] for py in ycoords]
ax.plot(xlist, ylist, 'r')
plt.show()
# Python机器学习numpy依赖包特征之间的相关性
# 最新推荐文章于 2024-10-28 19:01:25 发布