-
Notifications
You must be signed in to change notification settings - Fork 23
Expand file tree
/
Copy pathregTrees.py
More file actions
166 lines (131 loc) · 4.26 KB
/
regTrees.py
File metadata and controls
166 lines (131 loc) · 4.26 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
from numpy import *
# Tree regression algorithms
def loadDataSet(filename):
    """Load a tab-separated numeric text file into a list of float rows.

    Each non-blank line is stripped, split on tab characters, and every
    field is converted with ``float``.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list containing one list of floats per non-blank line.

    Raises:
        ValueError: If a field cannot be converted to ``float``.
    """
    dataMat = []
    # The context manager closes the file even when a line fails to parse.
    with open(filename) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # A blank (e.g. trailing) line carries no data.
                continue
            # Build a real list: on Python 3 ``map`` returns a lazy iterator,
            # which cannot later be converted into a numeric matrix row.
            dataMat.append([float(field) for field in stripped.split('\t')])
    return dataMat
def binSplitDataSet(dataSet, feature, value):
    """Split a data set into two subsets by thresholding one column.

    Args:
        dataSet: 2-D numpy matrix or array with one sample per row.
        feature: Index of the column to compare.
        value: Threshold for the comparison.

    Returns:
        A tuple ``(mat0, mat1)``: ``mat0`` holds every row whose ``feature``
        column is greater than ``value``; ``mat1`` holds every row whose
        ``feature`` column is less than or equal to ``value``.
    """
    # nonzero(...)[0] yields the row indices that satisfy the condition.
    # All matching rows are kept (no trailing [0], which would keep only
    # the first matching row).
    mat0 = dataSet[nonzero(dataSet[:, feature] > value)[0], :]
    mat1 = dataSet[nonzero(dataSet[:, feature] <= value)[0], :]
    return mat0, mat1
def regLeaf(dataSet):
    """Build a regression-tree leaf value.

    Returns the mean of the last column of ``dataSet``.
    """
    targets = dataSet[:, -1]
    return mean(targets)
def regErr(dataSet):
    """Estimate the error of a data set for regression trees.

    Returns the variance of the last column multiplied by the number of
    rows, i.e. the sum of squared deviations from that column's mean.
    """
    rowCount = shape(dataSet)[0]
    targetVariance = var(dataSet[:, -1])
    return targetVariance * rowCount
def createTree(dataSet, leafType = regLeaf, errType = regErr, ops = (1,4)):
    """Recursively build a binary tree over ``dataSet``.

    Args:
        dataSet: Data set forwarded to ``chooseBestSplit`` and
            ``binSplitDataSet``.
        leafType: Callable that builds a leaf from a data set.
        errType: Callable that computes the error of a data set.
        ops: Tuple of split options forwarded to ``chooseBestSplit``.

    Returns:
        A leaf (whatever ``leafType`` produced) when no split is chosen,
        otherwise a dict with keys ``'spInd'`` (split feature index),
        ``'spVal'`` (split value), ``'left'`` and ``'right'`` (sub-trees
        built from the two halves returned by ``binSplitDataSet``).
    """
    feat, val = chooseBestSplit(dataSet, leafType, errType, ops)
    # No feature index means chooseBestSplit decided this node is a leaf.
    if feat is None:
        return val
    # The node must be a dict: it is indexed with string keys below.
    retTree = {'spInd': feat, 'spVal': val}
    lSet, rSet = binSplitDataSet(dataSet, feat, val)
    retTree['left'] = createTree(lSet, leafType, errType, ops)
    retTree['right'] = createTree(rSet, leafType, errType, ops)
    return retTree
def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops = (1,4)):
    """Find the best binary split of ``dataSet``.

    Every distinct value of every feature column (all columns but the last)
    is tried as a threshold, and the split with the lowest total error is
    kept.

    Args:
        dataSet: Numpy matrix whose last column is the target; the
            ``.T.tolist()[0]`` column extraction relies on matrix semantics.
        leafType: Callable that builds a leaf from a data set.
        errType: Callable that computes the error of a data set.
        ops: ``(tolS, tolN)`` where ``tolS`` is the minimum error reduction
            required to accept a split and ``tolN`` is the minimum number
            of rows allowed in either half.

    Returns:
        ``(None, leaf)`` when the node should become a leaf, otherwise
        ``(bestIndex, bestValue)``: the feature index and threshold of the
        chosen split.
    """
    tolS = ops[0]
    tolN = ops[1]
    # All target values identical: nothing to split.
    if len(set(dataSet[:, -1].T.tolist()[0])) == 1:
        return None, leafType(dataSet)
    n = shape(dataSet)[1]
    S = errType(dataSet)
    bestS = inf
    bestIndex = 0
    bestValue = 0
    for featIndex in range(n - 1):
        # Convert the column to a flat list of Python floats first: the
        # elements obtained by iterating a matrix are not hashable.
        for splitVal in set(dataSet[:, featIndex].T.tolist()[0]):
            mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal)
            # Skip splits that leave too few rows on either side.
            if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN):
                continue
            newS = errType(mat0) + errType(mat1)
            if newS < bestS:
                bestIndex = featIndex
                bestValue = splitVal
                bestS = newS
    # The error reduction is too small to be worth a split.
    if (S - bestS) < tolS:
        return None, leafType(dataSet)
    mat0, mat1 = binSplitDataSet(dataSet, bestIndex, bestValue)
    if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN):
        return None, leafType(dataSet)
    return bestIndex, bestValue
# The functions below are used to build model trees
def linearSolve(dataSet):
    """Format ``dataSet`` and fit a least-squares linear model to it.

    Args:
        dataSet: Numpy matrix whose last column is the target and whose
            other columns are the features.

    Returns:
        A tuple ``(ws, X, Y)``: ``ws`` are the coefficients solving the
        normal equations ``(X.T * X) * ws = X.T * Y``; ``X`` is the design
        matrix (a column of ones followed by the feature columns); ``Y`` is
        the last column of ``dataSet``.

    Raises:
        NameError: If the determinant of ``X.T * X`` is exactly 0.0.
    """
    m, n = shape(dataSet)
    # asmatrix is used because the ``mat`` alias was removed in NumPy 2.0.
    X = asmatrix(ones((m, n)))
    # Column 0 stays all ones (intercept term); features fill the rest.
    X[:, 1:n] = dataSet[:, 0:n-1]
    Y = dataSet[:, -1]
    xTx = X.T * X
    if linalg.det(xTx) == 0.0:
        raise NameError('This matrix is singular')
    ws = xTx.I * (X.T * Y)
    return ws, X, Y
def modelLeaf(dataSet):
    """Build a model-tree leaf.

    Returns the coefficient matrix (first element of the tuple) that
    ``linearSolve`` computes for ``dataSet``.
    """
    return linearSolve(dataSet)[0]
def modelErr(dataSet):
    """Estimate the error of a data set for model trees.

    Fits a linear model with ``linearSolve`` and returns the sum of the
    squared differences between the targets and the fitted values.
    """
    coeffs, X, Y = linearSolve(dataSet)
    residuals = Y - X * coeffs
    return sum(power(residuals, 2))
# The functions below use a tree to make predictions
def isTree(obj):
    """Return True when ``obj`` is a tree node (a dict), False for a leaf.

    ``isinstance`` is used instead of comparing the class name string, so
    dict subclasses are also recognised as tree nodes.
    """
    return isinstance(obj, dict)
def regTreeEval(model, inDat):
    """Evaluate a regression-tree leaf.

    The leaf value ``model`` is the prediction itself; it is returned
    converted to ``float``. ``inDat`` is not used.
    """
    prediction = float(model)
    return prediction
def modelTreeEval(model, inDat):
    """Evaluate a model-tree leaf on one sample.

    Args:
        model: Coefficient column of the leaf's linear model, with the
            intercept first (n + 1 rows for n features).
        inDat: 2-D row (1 x n) holding the sample's feature values.

    Returns:
        The linear model's prediction as a ``float``.
    """
    n = shape(inDat)[1]
    # asmatrix is used because the ``mat`` alias was removed in NumPy 2.0.
    X = asmatrix(ones((1, n + 1)))
    # Column 0 stays 1 (intercept term); the features fill the rest.
    X[:, 1:n + 1] = inDat
    # X * model is a 1x1 matrix; pull the element out explicitly because
    # float() on a non-0-d array is deprecated since NumPy 1.25.
    return float((X * model)[0, 0])
def treeForeCast(tree, inData, modelEval=regTreeEval):
    """Predict the target of a single sample by walking ``tree``.

    Args:
        tree: A tree node (dict with ``'spInd'``, ``'spVal'``, ``'left'``
            and ``'right'``) or a leaf.
        inData: Feature values of one sample; either a flat sequence or a
            1 x n matrix.
        modelEval: Callable ``(leaf, inData)`` that turns a leaf into a
            prediction.

    Returns:
        Whatever ``modelEval`` returns for the leaf the sample falls into.
    """
    if not isTree(tree):
        return modelEval(tree, inData)
    # Flatten before indexing: on a 1 x n matrix, inData[k] selects a row,
    # not feature k, which is wrong (or out of bounds) for k > 0.
    featureValue = asarray(inData).ravel()[tree['spInd']]
    # Values above the split value go left, matching binSplitDataSet.
    branch = tree['left'] if featureValue > tree['spVal'] else tree['right']
    # The recursive call handles the leaf case through the check above.
    return treeForeCast(branch, inData, modelEval)
def createForeCast(tree, testData, modelEval = regTreeEval):
    """Predict the target of every sample in ``testData``.

    Args:
        tree: Tree (or leaf) passed to ``treeForeCast``.
        testData: Indexable collection of samples, one per row.
        modelEval: Leaf evaluation callable forwarded to ``treeForeCast``.

    Returns:
        An m x 1 matrix holding one prediction per sample, where m is
        ``len(testData)``.
    """
    m = len(testData)
    # asmatrix is used because the ``mat`` alias was removed in NumPy 2.0.
    yHat = asmatrix(zeros((m, 1)))
    for i in range(m):
        yHat[i, 0] = treeForeCast(tree, asmatrix(testData[i]), modelEval)
    return yHat
def testRaw():
    """Compare a regression tree and a model tree on the bundled data files.

    Both trees are trained on ``../data/bikeSpeedVsIq_train.txt`` with
    ``ops=(1, 20)``, then used to predict the second column of
    ``../data/bikeSpeedVsIq_test.txt`` from its first column.

    Returns:
        A tuple ``(regCorr, modelCorr)``: the correlation coefficient
        between predictions and actual values for the regression tree and
        for the model tree respectively.
    """
    # asmatrix is used because the ``mat`` alias was removed in NumPy 2.0.
    trainMat = asmatrix(loadDataSet('../data/bikeSpeedVsIq_train.txt'))
    testMat = asmatrix(loadDataSet('../data/bikeSpeedVsIq_test.txt'))
    # Build the regression tree.
    myTree = createTree(trainMat, ops=(1, 20))
    yHat = createForeCast(myTree, testMat[:, 0])
    regCorr = corrcoef(yHat, testMat[:, 1], rowvar=0)[0, 1]
    # Build the model tree.
    myTree = createTree(trainMat, modelLeaf, modelErr, (1, 20))
    # Pass the evaluation function itself; calling it here without
    # arguments would raise TypeError.
    yHat = createForeCast(myTree, testMat[:, 0], modelTreeEval)
    modelCorr = corrcoef(yHat, testMat[:, 1], rowvar=0)[0, 1]
    return regCorr, modelCorr