The code comes from the book Machine Learning in Action;
it was ported to Python 3 by wzy6642 on GitHub: https://github.com/wzy6642/Machine-Learning-in-Action-Python3
regTrees.py
"""
树构建算法其实对输入的参数tolS和tolN非常敏感
# tolS:容许的误差下降值->对误差的数量级十分敏感
tolS = ops[0]
# tolN:切分的最少样本数
tolN = ops[1]
"""
import numpy as np
"""
# Load data
Parameters:
fileName: File name
Returns:
"""
def loadDataSet(fileName):
dataMat = []
fr = open(fileName)
for line in fr.readlines():
curLine = line.strip().split('\t')
fltLine = list(map(float, curLine))
dataMat.append(fltLine)
return dataMat
"""
# Split data
Parameters:
dataSet: data set
feature: feature choosed to divide
value: feature value
Returns:
mat0: subset
mat1: subset
Note:
Split the dataset into two based on feature and its value;
当数据特征值小于等于阈值,样本划分到左子树,反之样本划分到右子树。
"""
def binSplitDataSet(dataSet, feature, value):
mat0 = dataSet[np.nonzero(dataSet[:, feature] > value)[0], :]
mat1 = dataSet[np.nonzero(dataSet[:, feature] <= value)[0], :]
return mat0, mat1
"""
# Generate leaf node
Parameters:
dataSet: data set
Returns:
Mean value of node
"""
def regLeaf(dataSet):
return np.mean(dataSet[:, -1])
"""
# function to caculate variance
Parameters:
dataSet:
Returns:
Total square error (total variance) = mean square error * the number of sample
"""
def regErr(dataSet):
return np.var(dataSet[:, -1]) * dataSet.shape[0]
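# For example, with target values [1, 2, 3]: np.var = 2/3 and m = 3, so the
# returned total squared error is 2.0 = sum((y - mean(y)) ** 2).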
"""
伪代码:
对每个特征:
对每个特征值:
将数据集切分成两份
计算切分的误差
如果当前误差小于当前最小误差,那么将当前切分设定为最佳切分并更新最小误差
返回最佳切分的特征和阈值
函数说明:找到数据的最佳切分方式的特征和特征值
Parameters:
dataSet
leafType:生成叶结点的函数
errType:误差估计函数
ops:用户定义的参数构成的元组
Returns:
bestIndex: 最佳切分特征
bestValue: 最佳切分特征值
"""
def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
    # tolS: tolerated decrease in error (default 1)
    tolS = ops[0]
    # tolN: minimum number of samples in a split (default 4)
    tolN = ops[1]
    # Exit 1: all target values are equal, no need to split; create a leaf node
    if len(set(dataSet[:, -1].T.tolist()[0])) == 1:
        return None, leafType(dataSet)
    # m rows, n columns
    m, n = np.shape(dataSet)
    # total squared error of the unsplit data
    S = errType(dataSet)
    # best error, best split-feature index, best feature value
    bestS = float('inf')
    bestIndex = 0
    bestValue = 0
    # iterate over all features
    for featIndex in range(n - 1):
        # iterate over all values of this feature
        for splitVal in set(dataSet[:, featIndex].T.A.tolist()[0]):
            # split the data set on this feature and value
            mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal)
            # skip splits that leave fewer than tolN samples on either side
            if (np.shape(mat0)[0] < tolN) or (np.shape(mat1)[0] < tolN):
                continue
            # error estimate of this split; we look for the smallest newS
            newS = errType(mat0) + errType(mat1)
            # if the error estimate is smaller, update the best feature index and value
            if newS < bestS:
                # feature index
                bestIndex = featIndex
                # split threshold
                bestValue = splitVal
                # update the minimum of the objective
                bestS = newS
    # Exit 2: if the error reduction is too small, do not split; create a leaf node
    if (S - bestS) < tolS:
        return None, leafType(dataSet)
    # split on the best feature and feature value found
    mat0, mat1 = binSplitDataSet(dataSet, bestIndex, bestValue)
    # Exit 3: if either resulting set is too small, do not split; create a leaf node
    if (np.shape(mat0)[0] < tolN) or (np.shape(mat1)[0] < tolN):
        return None, leafType(dataSet)
    # return the best split feature and feature value
    return bestIndex, bestValue
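# Note: chooseBestSplit either returns (featureIndex, splitValue) for the best
# split, or (None, leafValue) when one of the three early exits above fires.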
"""
伪代码:
找到最佳的待切分特征:
如果该节点不能再分,将该节点存为叶节点
执行二元切分
在右子树调用createTree()方法
在左子树调用createTree()方法
Parameters:
dataSet - data set
leafType - the function of establishing leaf nodes
errType - the error calculation function
ops - a tuple containing other parameters required for tree construction.
Returns:
retTree - Constructed regression tree
"""
def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
    # choose the best split feature and feature value
    feat, val = chooseBestSplit(dataSet, leafType, errType, ops)
    # if no feature qualifies, return the leaf value
    if feat is None:
        return val
    # the regression tree is stored as a dict
    retTree = {}
    # index of the split feature
    retTree['spInd'] = feat
    # split threshold
    retTree['spVal'] = val
    # split into left and right data sets
    lSet, rSet = binSplitDataSet(dataSet, feat, val)
    # recursively build the left and right subtrees
    retTree['left'] = createTree(lSet, leafType, errType, ops)
    retTree['right'] = createTree(rSet, leafType, errType, ops)
    return retTree
"""
# 用于测试输入变量是否是一棵树(树是通过字典存储的)
Parameters:
obj:测试对象
Returns:
布尔值
"""
def isTree(obj):
return (type(obj).__name__ == 'dict')
"""
函数说明:对树进行塌陷处理(即返回树平均值)
Parameters:
tree - 树
Returns:
树的平均值
"""
def getMean(tree):
if isTree(tree['right']):
tree['right'] = getMean(tree['right'])
if isTree(tree['left']):
tree['left'] = getMean(tree['left'])
return (tree['left'] + tree['right']) / 2.0
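# For example, a tree with 'left': 1.0 and a subtree {'left': 2.0, 'right': 4.0, ...}
# on the right collapses the subtree to 3.0 and returns (1.0 + 3.0) / 2.0 = 2.0.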
"""
# 后剪枝
Parameters:
tree: 待剪枝的树
testData: 剪枝所需的测试数据
Returns:
树
"""
def prune(tree, testData):
# 如果测试集为空,则对树进行塌陷处理
if np.shape(testData)[0] == 0:
return getMean(tree)
# 如果有左子树或者右子树,则切分数据集
if (isTree(tree['right']) or isTree(tree['left'])):
lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal'])
# 处理左子树(剪枝)
if isTree(tree['left']):
tree['left'] = prune(tree['left'], lSet)
# 处理右子树(剪枝)
if isTree(tree['right']):
tree['right'] = prune(tree['right'], rSet)
# 如果当前节点的左右结点为叶结点
if not isTree(tree['left']) and not isTree(tree['right']):
lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal'])
# 计算没有合并的误差
errorNoMerge = np.sum(np.power(lSet[:, -1] - tree['left'], 2)) + np.sum(np.power(rSet[:, -1] - tree['right'], 2))
# 计算合并的均值
treeMean = (tree['left'] + tree['right']) / 2.0
# 计算合并的误差
errorMerge = np.sum(np.power(testData[:, -1] - treeMean, 2))
# 如果合并的误差小于没有合并的误差,合并
if errorMerge < errorNoMerge:
print("merging")
return treeMean
else:
return tree
else:
return tree
# model trees
"""
# Format the data and compute the regression coefficients
Parameters:
    dataSet: data set
Returns:
    ws: regression coefficients
    X: feature matrix (first column all ones)
    Y: label column vector
"""
def linearSolve(dataSet):
    m, n = np.shape(dataSet)
    X = np.mat(np.ones((m, n)))
    Y = np.mat(np.ones((m, 1)))
    # keep the first column of X all ones (intercept term)
    X[:, 1:n] = dataSet[:, 0:n-1]
    # label column vector
    Y = dataSet[:, -1]
    # ordinary linear regression
    xTx = X.T * X
    # a singular matrix cannot be inverted
    if np.linalg.det(xTx) == 0.0:
        raise NameError('This matrix is singular, cannot do inverse,\n\
        try increasing the second value of ops')
    # solve for the regression coefficients
    ws = xTx.I * (X.T * Y)
    return ws, X, Y
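# This is the normal equation ws = (X^T X)^{-1} X^T Y; since column 0 of X is
# all ones, ws[0] is the intercept and ws[1:] are the feature weights.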
"""
# retrun regression coefficient(ws) from linearSolve
Parameters:
dataSet
Returns:
ws: regression coefficient
"""
def modelLeaf(dataSet):
ws, X, Y = linearSolve(dataSet)
return ws
"""
# calculation error
Parameters:
dataSet
Returns:
error value
"""
def modelErr(dataSet):
ws, X, Y = linearSolve(dataSet)
yHat = X * ws
# Square error between yHat(predicted value) and y.
return sum(np.power(Y - yHat, 2))
# Comparing tree regression with standard regression
# Forecasting with tree regression
"""
regression tree
Description: return the value of a regression-tree leaf node
Since the leaf node is stored as a matrix, it must be converted to float
Parameters:
    model - tree leaf node
    inDat - input data
Returns:
    Leaf node value
"""
def regTreeEval(model, inDat):
    return float(model)
"""
model tree
Parameters:
model - 叶结点值
inDat - 输入的特征矩阵
Returns:
预测值 相当于X*ws
"""
def modelTreeEval(model, inDat):
n = np.shape(inDat)[1]
X = np.mat(np.ones((1, n+1)))
X[:, 1:n+1] = inDat
return float(X * model)
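# For example, with a single input feature x1, X becomes [1, x1] and the
# prediction is ws[0] + ws[1] * x1.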
"""
自顶向下遍历整棵树,直到命中叶节点为止。一旦到达叶节点,它就会在输入数据上调用modelEval或regTreeEval
Parameters:
tree - 树结构
inData - 需要预测的单个数据
modelEval - 回归树或模型树
Returns:
误差值
"""
def treeForeCast(tree, inData, modelEval=regTreeEval):
# 如果搜索到叶结点就返回叶结点的值
if not isTree(tree):
return modelEval(tree, inData)
# 数据实际值大于分割标准
print("数据实际值大于分割标准")
if inData[tree['spInd']] > tree['spVal']:
# 如果有左子树则递归
if isTree(tree['left']):
return treeForeCast(tree['left'], inData, modelEval)
# 否则返回该叶结点值
else:
return modelEval(tree['left'], inData)
# 小于则在右边
else:
# 如果有右子树则递归
if isTree(tree['right']):
return treeForeCast(tree['right'], inData, modelEval)
# 否则返回该叶结点值
else:
return modelEval(tree['right'], inData)
"""
多次调用treeForeCast()函数。由于它能够以 向量形式返回一组预测值,因此该函数在对整个测试集进行预测时非常有用。
Parameters:
tree - 树结构
testData - 测试数据集
modelEval - 求解方式
Returns:
yHat - 预测值
"""
def createForeCast(tree, testData, modelEval=regTreeEval):
m = len(testData)
yHat = np.mat(np.zeros((m, 1)))
for i in range(m):
yHat[i, 0] = treeForeCast(tree, np.mat(testData[i]), modelEval)
return yHat
Calling the finished decision-tree code:
import regTrees
from numpy import *
# tree-building example
testMat = mat(eye(4))
print(testMat)
mat0, mat1 = regTrees.binSplitDataSet(testMat, 1, 0.5)
print(mat0)
print(mat1)
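# Expected split of the 4x4 identity matrix on feature 1 with value 0.5:
# mat0 (feature value > 0.5) is the single row [0, 1, 0, 0];
# mat1 (feature value <= 0.5) is the remaining three rows.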
myDat = regTrees.loadDataSet('ex00.txt')
myMat = mat(myDat)
root = regTrees.createTree(myMat)
print(root)
myDat1 = regTrees.loadDataSet('ex0.txt')
myMat1 = mat(myDat1)
root = regTrees.createTree(myMat1)
print(root)
myDat2 = regTrees.loadDataSet('ex2.txt')
myMat2 = mat(myDat2)
root = regTrees.createTree(myMat2)
print(root)
root = regTrees.createTree(myMat2, ops=(0, 4))
print(root)
root = regTrees.createTree(myMat2, ops=(10000, 4))
print(root)
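# With tolS = 0, virtually any error reduction justifies a split, so the tree
# grows very deep and overfits; with tolS = 10000, almost no split reduces the
# error enough and the tree stays extremely small.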
# post-pruning example
myTree = regTrees.createTree(myMat2, ops=(0, 1))
print(myTree)
myDatTest = regTrees.loadDataSet('ex2test.txt')
myDat2Test = mat(myDatTest)
prunedTree = regTrees.prune(myTree, myDat2Test)
print(prunedTree)
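A quick check of how pruning affects the test-set fit (a hedged sketch, not from the book, assuming ex2test.txt has the same two-column layout as ex2.txt):
# hypothetical follow-up, not part of the book's listing
yHat = regTrees.createForeCast(prunedTree, myDat2Test[:, 0])
print("pruned tree correlation = ", corrcoef(yHat, myDat2Test[:, 1], rowvar=0)[0, 1])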
# model tree
# raw data generated from y = 3.5 + 1.0x and y = 0 + 12x
myMat2 = mat(regTrees.loadDataSet('exp2.txt'))
root = regTrees.createTree(myMat2, regTrees.modelLeaf, regTrees.modelErr, (1, 10))
print(root)
import matplotlib.pyplot as plt
def plotDataSet(filename):
    dataMat = regTrees.loadDataSet(filename)
    n = len(dataMat)
    xcord = []
    ycord = []
    # collect the sample points
    for i in range(n):
        xcord.append(dataMat[i][0])
        ycord.append(dataMat[i][1])
    fig = plt.figure()
    ax = fig.add_subplot(111)
    # plot the sample points
    ax.scatter(xcord, ycord, s=20, c='black', alpha=.5)
    plt.title('DataSet')
    plt.xlabel('X')
    plt.show()
train_filename = 'exp2.txt'
train_Data = regTrees.loadDataSet(train_filename)
dataMat = mat(train_Data)
Tree = regTrees.createTree(dataMat, regTrees.modelLeaf, regTrees.modelErr, (1, 10))
print(Tree)
plotDataSet(train_filename)
# comparing tree regression with standard regression
trainMat = mat(regTrees.loadDataSet('bikeSpeedVsIq_train.txt'))
testMat = mat(regTrees.loadDataSet('bikeSpeedVsIq_test.txt'))
# build a regression tree
myTree = regTrees.createTree(trainMat, ops=(1, 20))
yHat = regTrees.createForeCast(myTree, testMat[:, 0])
RCor = corrcoef(yHat, testMat[:, 1], rowvar=0)[0, 1]
print("regression tree correlation = ", RCor)
# build a model tree
myTree2 = regTrees.createTree(trainMat, regTrees.modelLeaf, regTrees.modelErr, (1, 20))
yHat = regTrees.createForeCast(myTree2, testMat[:, 0], regTrees.modelTreeEval)
MCor = corrcoef(yHat, testMat[:, 1], rowvar=0)[0, 1]
print("model tree correlation = ", MCor)
# standard linear regression
ws, X, Y = regTrees.linearSolve(trainMat)
m, n = shape(testMat)
yHat = zeros((m, 1))
for i in range(m):
    yHat[i] = testMat[i, 0] * ws[1, 0] + ws[0, 0]
LCor = corrcoef(yHat, testMat[:, 1], rowvar=0)[0, 1]
print("linear regression correlation = ", LCor)