反向传播用到的梯度下降原理数学公式
基本数学概念
函数:$f(x)$
导数:$f'(x)$
多元函数:$f(x,y)$
偏导数:$\frac{\partial f(x,y)}{\partial x},\ \frac{\partial f(x,y)}{\partial y}$
梯度:由各个偏导数组成的向量

$$\nabla f(x,y,z)=\left(\frac{\partial f(x,y,z)}{\partial x},\ \frac{\partial f(x,y,z)}{\partial y},\ \frac{\partial f(x,y,z)}{\partial z}\right)$$
二元函数 $f(x,y)=x^{2}+y^{2}$,梯度是 $(2x,2y)$。在点 $(1,1)$ 处,梯度是 $(2,2)$,$f(x,y)$ 沿该向量方向变化时,函数值的变化量 $\Delta f(x,y)$ 变化最快!(沿梯度方向函数值上升最快,沿负梯度方向函数值下降最快。)
所以,可以利用梯度来找到目标函数(Loss)的最小值。设预测值为

$$y'=kx+b$$
均方误差是

$$L=\frac{1}{2m}\sum_{i=1}^{m}\left(y_{i}-y_{i}'\right)^{2}$$

其中 $m$ 为样本个数,$y_{i}$ 为第 $i$ 个样本的真实值,$y_{i}'$ 为对应的预测值。
因此需要找到合适的 $k,b$ 值,使 $L$ 最小,即找到 $k,b$ 值,使 $L'=0$。
如果 $L$ 是一维函数,可以直接用 $L'=0$ 求出 $k$ 和 $b$。
在卷积神经网络中,若使 $L'=0$,会得到很多 $k$ 和 $b$,因为 $k,b$ 太复杂了,干脆就让 $\theta$ 一点点变化,求 $L$ 最小值!$\theta$ 代表的是 $[k,b]$,每次更新 $k,b$ 的值!
 
                                                                    
    
     
      
       
        
$$\theta_{1}=\theta_{1}'-\alpha\frac{\partial}{\partial\theta_{1}}J(\theta)$$

新权值 = 当前权值 − 学习率 × 梯度(其中 $\theta_{1}'$ 为当前权值,$\alpha$ 为学习率,$J(\theta)$ 为损失函数)
 学习率一般为3,1,0.5,0.1,0.05,0.01,0.005,0.0001
可改变学习率的方法
学习率一开始可以大一点,后面小一点,这样更容易更快收敛!
学习率:lr(learning rate)
                         x = x - lr*dx
衰减:(decay),decay 是衰减因子!【常用的是指数衰减法】下面给出的是随迭代次数按反比例衰减的形式,其中 $\mathit{lr}$ 为学习率(learning rate),$\mathit{lr}_{start}$ 为初始学习率,$i$ 为当前迭代次数(原文未说明,此处按迭代次数理解):

$$\mathit{lr}=\mathit{lr}_{start}\cdot\frac{1.0}{1.0+\mathit{decay}\cdot i}$$

$$x=x-\mathit{lr}\cdot dx$$
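动量:(momentum)

$$\left\{\begin{matrix}x=x-\mathit{lr}\cdot dx+V\cdot\mathit{momentum}\\ V=-\mathit{lr}\cdot dx\end{matrix}\right.$$

其中 $\mathit{lr}$ 为学习率(learning rate),$\mathit{momentum}$ 为动量系数,$V$ 可理解为上一步的更新量。(标准的动量法通常写作 $V\leftarrow\mathit{momentum}\cdot V-\mathit{lr}\cdot dx$,$x\leftarrow x+V$。)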









