Simulating Gradient Descent
1.1 Visualizing the Loss Function

import numpy as np
import matplotlib.pyplot as plt
plot_x = np.linspace(-1, 6, 141)  # np.linspace creates an evenly spaced sequence
plot_x  # inspect the array (notebook cell output)
plot_y = (plot_x - 2.5)**2 - 1
plt.plot(plot_x, plot_y)
plt.show()
Output: (plot of the quadratic loss curve y = (x - 2.5)^2 - 1)
Defining the loss function and its derivative
def dJ(theta):
    return 2 * (theta - 2.5)

def J(theta):
    return (theta - 2.5)**2 - 1.
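Before using dJ, it can be sanity-checked against a centered finite-difference approximation of J. This is a minimal sketch; the helper numerical_dJ is not part of the original code:

def numerical_dJ(theta, h=1e-6):
    # centered difference approximates the derivative of J at theta
    return (J(theta + h) - J(theta - h)) / (2 * h)

print(dJ(3.0), numerical_dJ(3.0))  # both should print values close to 1.0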
Finding the optimal parameter
eta = 0.1       # learning rate
epsilon = 1e-8  # convergence threshold
theta = 0.0
while True:
    gradient = dJ(theta)
    last_theta = theta
    theta = theta - eta * gradient
    if abs(J(theta) - J(last_theta)) < epsilon:
        break

print(theta)
print(J(theta))
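For reference, the analytic minimum tells us what the loop should print: setting dJ(theta) = 2*(theta - 2.5) to zero gives theta = 2.5, where J equals -1, so the printed values should land very close to 2.5 and -1. A trivial check:

assert dJ(2.5) == 0.0   # the derivative vanishes at the minimum
assert J(2.5) == -1.0   # the minimum value of the loss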
Simulating gradient descent, recording every theta
theta = 0.0
theta_history = [theta]
while True:
    gradient = dJ(theta)
    last_theta = theta
    theta = theta - eta * gradient
    theta_history.append(theta)
    if abs(J(theta) - J(last_theta)) < epsilon:
        break

def plot_theta_history():
    plt.plot(plot_x, J(plot_x))
    plt.plot(np.array(theta_history), J(np.array(theta_history)), color='r', marker='+')
    plt.show()

plot_theta_history()
Output: (loss curve with the gradient-descent path marked by red '+' markers)
Check how many theta values were generated:
len(theta_history)
Output: 46
Decreasing the learning rate
theta = 0.0
theta_history = [theta]

def gradient_descent(initial_theta, eta, n_iters=1e4, epsilon=1e-8):
    theta = initial_theta
    theta_history.append(initial_theta)
    # (an iteration cap based on n_iters is added in the improved version below)
    while True:
        gradient = dJ(theta)
        last_theta = theta
        theta = theta - eta * gradient
        theta_history.append(theta)
        if abs(J(theta) - J(last_theta)) < epsilon:
            break

def plot_theta_history():
    plt.plot(plot_x, J(plot_x))
    plt.plot(np.array(theta_history), J(np.array(theta_history)), color='r', marker='+')
    plt.show()
eta = 0.01
theta_history = []
gradient_descent(0., eta)
plot_theta_history()
Check how many theta values were generated:
len(theta_history)
Output: 425
Decreasing the learning rate further
eta = 0.001
theta_history = []
gradient_descent(0., eta)
plot_theta_history()
Output: (loss curve; with the smaller step size the descent path is much denser)
Check how many theta values were generated:
len(theta_history)
Output: 4107
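The iteration counts (46, 425, 4107) grow roughly in inverse proportion to eta. The trend can be confirmed with a small self-contained sweep; this sketch rebuilds the loop locally so it leaves the global theta_history alone:

for eta_try in (0.1, 0.01, 0.001):
    theta, steps = 0.0, 0
    while True:
        last_theta = theta
        theta = theta - eta_try * dJ(theta)
        steps += 1
        if abs(J(theta) - J(last_theta)) < 1e-8:
            break
    print(eta_try, steps)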
Trying a larger learning rate
eta = 0.8
theta_history = []
gradient_descent(0., eta)
plot_theta_history()
Output: (the path now overshoots, jumping back and forth across the minimum before settling)
We find that a suitable theta is still reached in the end, i.e., eta = 0.8 still lies within the usable range of learning rates.
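Why 0.8 works can be made exact for this particular quadratic. Substituting dJ into the update gives theta_new - 2.5 = (1 - 2*eta) * (theta_old - 2.5): each step scales the distance to the minimum by the constant factor |1 - 2*eta|, so the method converges exactly when that factor is below 1, i.e. for 0 < eta < 1. A quick check of the factor for the learning rates tried here:

for eta_try in (0.01, 0.1, 0.8, 1.1):
    factor = abs(1 - 2 * eta_try)
    print(eta_try, factor, 'converges' if factor < 1 else 'diverges')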
Increasing the learning rate once more
eta = 1.1
theta_history = []
gradient_descent(0., eta)
plot_theta_history()
This now fails with OverflowError: (34, 'Result too large').
Check how many theta values were produced before the error:
len(theta_history)
Output: 141
theta_history[-1]
theta has shot far outside the given range: the learning rate is too large, so the iteration diverges instead of converging (each step scales the distance from the minimum by 1.2, per the factor check above), until the squaring in J overflows Python's float.
To avoid the error, the original code can be improved in two ways:
Catch the exception when evaluating the loss function
def J(theta):
    try:
        return (theta - 2.5)**2 - 1.
    except OverflowError:
        return float('inf')  # positive infinity, larger than any finite float
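Returning inf fixes the OverflowError but still leaves the while True loop spinning: once J returns inf on consecutive steps, the convergence test computes inf - inf, which is nan, and any comparison with nan is False. A two-line demonstration of why a hard iteration cap is also needed:

big = float('inf')
print(big - big)               # nan: inf minus inf is undefined
print(abs(big - big) < 1e-8)   # False, so the break condition never triggers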
Set a stopping condition to end the otherwise infinite loop
def gradient_descent(initial_theta, eta, n_iters=1e4, epsilon=1e-8):
    theta = initial_theta
    theta_history.append(initial_theta)
    i_iter = 0
    while i_iter < n_iters:
        gradient = dJ(theta)
        last_theta = theta
        theta = theta - eta * gradient
        theta_history.append(theta)
        if abs(J(theta) - J(last_theta)) < epsilon:
            break
        i_iter += 1
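As a design note, having gradient_descent append to a global theta_history makes repeated experiments easy to get wrong. A sketch of a hypothetical variant, gradient_descent_v2, that returns a fresh history instead:

def gradient_descent_v2(initial_theta, eta, n_iters=1e4, epsilon=1e-8):
    theta = initial_theta
    history = [theta]
    i_iter = 0
    while i_iter < n_iters:
        last_theta = theta
        theta = theta - eta * dJ(theta)
        history.append(theta)
        if abs(J(theta) - J(last_theta)) < epsilon:
            break
        i_iter += 1
    return history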
def plot_theta_history():
    plt.plot(plot_x, J(plot_x))
    plt.plot(np.array(theta_history), J(np.array(theta_history)), color='r', marker='+')
    plt.show()
eta = 1.1
theta_history = []
gradient_descent(0, eta)
len(theta_history)
Output: 10001
theta_history[-1]
Output: nan (theta eventually overflows to infinity, and the update then computes inf - inf, which is nan)
eta = 1.1
theta_history = []
gradient_descent(0., eta, n_iters=10)
plot_theta_history()
Output: (with only 10 iterations, the plot shows theta bouncing outward across the minimum, making the divergence visible)