1. 前言

2. readCheckpoint 和 writeCheckpoint

func (m *ManagerImpl) writeCheckpoint() error {
    m.mutex.Lock()
    registeredDevs := make(map[string][]string)
    // 只将healthy的devices持久化
    for resource, devices := range m.healthyDevices {
        registeredDevs[resource] = devices.UnsortedList()
    }
    // 将podDevices的内容持久化
    data := checkpoint.New(m.podDevices.toCheckpointData(),
        registeredDevs)
    m.mutex.Unlock()
    err := m.checkpointManager.CreateCheckpoint(kubeletDeviceManagerCheckpoint, data)
    if err != nil {
        return fmt.Errorf("failed to write checkpoint file %q: %v", kubeletDeviceManagerCheckpoint, err)
    }
    return nil
}

func (m *ManagerImpl) readCheckpoint() error {
    registeredDevs := make(map[string][]string)
    devEntries := make([]checkpoint.PodDevicesEntry, 0)
    cp := checkpoint.New(devEntries, registeredDevs)
    err := m.checkpointManager.GetCheckpoint(kubeletDeviceManagerCheckpoint, cp)
    if err != nil {
        if err == errors.ErrCheckpointNotFound {
            klog.Warningf("Failed to retrieve checkpoint for %q: %v", kubeletDeviceManagerCheckpoint, err)
            return nil
        }
        return err
    }
    m.mutex.Lock()
    defer m.mutex.Unlock()
    podDevices, registeredDevs := cp.GetData()
    // 只恢复了podDevices中的内容 并没有恢复healthyDevices里面的内容
    m.podDevices.fromCheckpointData(podDevices)
    m.allocatedDevices = m.podDevices.devices()
    for resource := range registeredDevs {
        // 为每个资源生成了一个带有stop时间的endpoint 等到device plugin重新注册
        m.healthyDevices[resource] = sets.NewString()
        m.unhealthyDevices[resource] = sets.NewString()
        m.endpoints[resource] = endpointInfo{e: newStoppedEndpointImpl(resource), opts: nil}
    }
    return nil
}

3. GetCapacity

func (m *ManagerImpl) GetCapacity() (v1.ResourceList, v1.ResourceList, []string) {
    needsUpdateCheckpoint := false
    var capacity = v1.ResourceList{}
    var allocatable = v1.ResourceList{}
    deletedResources := sets.NewString()
    m.mutex.Lock()
    for resourceName, devices := range m.healthyDevices {
        eI, ok := m.endpoints[resourceName]
        if (ok && eI.e.stopGracePeriodExpired()) || !ok {
            // The resources contained in endpoints and (un)healthyDevices
            // should always be consistent. Otherwise, we run with the risk
            // of failing to garbage collect non-existing resources or devices.
            if !ok {
                klog.Errorf("unexpected: healthyDevices and endpoints are out of sync")
            }
            // 删除device manager中关于ResourceName的所有关系
            delete(m.endpoints, resourceName)
            delete(m.healthyDevices, resourceName)
            deletedResources.Insert(resourceName)
            needsUpdateCheckpoint = true
        } else {
            capacity[v1.ResourceName(resourceName)] = *resource.NewQuantity(int64(devices.Len()), resource.DecimalSI)
            allocatable[v1.ResourceName(resourceName)] = *resource.NewQuantity(int64(devices.Len()), resource.DecimalSI)
        }
    }
    for resourceName, devices := range m.unhealthyDevices {
        eI, ok := m.endpoints[resourceName]
        if (ok && eI.e.stopGracePeriodExpired()) || !ok {
            if !ok {
                klog.Errorf("unexpected: unhealthyDevices and endpoints are out of sync")
            }
            delete(m.endpoints, resourceName)
            delete(m.unhealthyDevices, resourceName)
            deletedResources.Insert(resourceName)
            needsUpdateCheckpoint = true
        } else {
            capacityCount := capacity[v1.ResourceName(resourceName)]
            unhealthyCount := *resource.NewQuantity(int64(devices.Len()), resource.DecimalSI)
            capacityCount.Add(unhealthyCount)
            capacity[v1.ResourceName(resourceName)] = capacityCount
        }
    }
    m.mutex.Unlock()
    if needsUpdateCheckpoint {
        // 如果某个resourceName不存在endpoint 或者endpoint有stop时间
        m.writeCheckpoint()
    }
    return capacity, allocatable, deletedResources.UnsortedList()
}

4. GetDeviceRunContainerOptions

func (m *ManagerImpl) GetDeviceRunContainerOptions(pod *v1.Pod, container *v1.Container) (*DeviceRunContainerOptions, error) {
    podUID := string(pod.UID)
    contName := container.Name
    needsReAllocate := false
    for k := range container.Resources.Limits {
        resource := string(k)
        if !m.isDevicePluginResource(resource) {
            continue
        }
        err := m.callPreStartContainerIfNeeded(podUID, contName, resource)
        if err != nil {
            return nil, err
        }
        if m.podDevices.containerDevices(podUID, contName, resource) == nil {
            needsReAllocate = true
        }
    }
    if needsReAllocate {
        klog.V(2).Infof("needs re-allocate device plugin resources for pod %s", podUID)
        m.allocatePodResources(pod)
    }
    m.mutex.Lock()
    defer m.mutex.Unlock()
    return m.podDevices.deviceRunContainerOptions(string(pod.UID), container.Name), nil
}
func (m *ManagerImpl) callPreStartContainerIfNeeded(podUID, contName, resource string) error {
    ...
    devices := m.podDevices.containerDevices(podUID, contName, resource)
    if devices == nil {
        m.mutex.Unlock()
        return fmt.Errorf("no devices found allocated in local cache for pod %s, container %s, resource %s", podUID, contName, resource)
    }
    _, err := eI.e.preStartContainer(devs)
    ...
}

5. 重启kubelet

5.1 重启kubelet

// k8s-device-plugin/main.go
func main() {
    ...
    log.Println("Starting FS watcher.")
    watcher, err := newFSWatcher(pluginapi.DevicePluginPath)
    ...
    restart := true
    var devicePlugin *NvidiaDevicePlugin

L:
    for {
        if restart {
            if devicePlugin != nil {
                devicePlugin.Stop()
            }

            devicePlugin = NewNvidiaDevicePlugin()
            if err := devicePlugin.Serve(); err != nil {
                ...
            } else {
                restart = false
            }
        }

        select {
        case event := <-watcher.Events:
            if event.Name == pluginapi.KubeletSocket && event.Op&fsnotify.Create == fsnotify.Create {
                log.Printf("inotify: %s created, restarting.", pluginapi.KubeletSocket)
                restart = true
            }

        case err := <-watcher.Errors:
            log.Printf("inotify: %s", err)
        ...
    }
}

5.2 重启device plugin

func (m *ManagerImpl) runEndpoint(resourceName string, e endpoint) {
    e.run()
    e.stop()

    m.mutex.Lock()
    defer m.mutex.Unlock()

    if old, ok := m.endpoints[resourceName]; ok && old.e == e {
        m.markResourceUnhealthy(resourceName)
    }

    klog.V(2).Infof("Endpoint (%s, %v) became unhealthy", resourceName, e)
}