The scheduler cache stores the list of nodes and, for each node, all of its pod information, including pods already running on the node and assumed pods. It must implement the following interface:
type Cache interface {
    // For testing only.
    NodeCount() int
    // For testing only.
    PodCount() (int, error)
    // AssumePod is called after a pod is scheduled successfully; it charges the
    // pod's resource usage to the node it was assigned to. Binding has not
    // started at this point.
    AssumePod(pod *v1.Pod) error
    // FinishBinding notifies the cache that binding is done and the assumed pod
    // may start its expiration countdown.
    FinishBinding(pod *v1.Pod) error
    // ForgetPod removes an assumed pod and subtracts its resources from its node.
    // If anything goes wrong after AssumePod, ForgetPod must be called to clean up.
    ForgetPod(pod *v1.Pod) error
    // After a pod is scheduled and bound successfully, the informer delivers a
    // pod-add event and AddPod puts the pod into the cache. If the assumed pod
    // has already expired, it is added back so that its resources are counted
    // against the node again.
    AddPod(pod *v1.Pod) error
    UpdatePod(oldPod, newPod *v1.Pod) error
    // RemovePod removes the pod from the cache, and its resources from the node,
    // when the informer delivers a pod-delete event.
    RemovePod(pod *v1.Pod) error
    GetPod(pod *v1.Pod) (*v1.Pod, error)
    // IsAssumedPod reports whether the pod is still assumed and has not expired.
    IsAssumedPod(pod *v1.Pod) (bool, error)
    // AddNode saves node information into the cache when the informer delivers a
    // node-add event.
    AddNode(node *v1.Node) *framework.NodeInfo
    // UpdateNode refreshes the cached node when the informer delivers a
    // node-update event.
    UpdateNode(oldNode, newNode *v1.Node) *framework.NodeInfo
    // RemoveNode removes the node from the cache when the informer delivers a
    // node-delete event.
    RemoveNode(node *v1.Node) error
    // UpdateSnapshot takes a snapshot of the current cache state. The node info
    // in it aggregates both successfully scheduled pods and assumed pods, for
    // nodes that have not been deleted at the time of the call.
    UpdateSnapshot(nodeSnapshot *Snapshot) error
    Dump() *Dump
}
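To make the lifecycle concrete, here is a hedged sketch of how these methods are typically driven. The driver function and the bind callback are hypothetical stand-ins (the real flow lives in scheduleOne in pkg/scheduler/scheduler.go); it only illustrates the call order: assume during the scheduling cycle, bind asynchronously, then either finish binding or roll back.

// scheduleAndBind is a hypothetical driver, for illustration only.
func scheduleAndBind(cache Cache, bind func(pod *v1.Pod, host string) error, pod *v1.Pod, host string) {
    pod.Spec.NodeName = host
    // Scheduling cycle: charge the pod to the chosen node right away, so the
    // next pod to be scheduled already sees this pod's resource usage.
    if err := cache.AssumePod(pod); err != nil {
        return
    }
    // Binding cycle: runs in its own goroutine so scheduling is not blocked on
    // the API server call.
    go func() {
        if err := bind(pod, host); err != nil {
            // Any failure after AssumePod must be cleaned up with ForgetPod.
            cache.ForgetPod(pod)
            return
        }
        // Binding succeeded: the assumed pod's expiration countdown may start.
        // The informer's pod-add event will later call AddPod to confirm it.
        cache.FinishBinding(pod)
    }()
}

Keeping AssumePod synchronous while binding runs in a goroutine is what lets the scheduler move on to the next pod without waiting for the API server.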
schedulerCache is the concrete implementation of the Cache interface.
type schedulerCache struct {
    stop <-chan struct{}
    // Expiration time for assumed pods; defaults to 30s.
    ttl time.Duration
    // The cache runs a goroutine for periodic housekeeping; period is its
    // interval, defaulting to cleanAssumedPeriod (1s).
    period time.Duration
    // The cache may be accessed by multiple goroutines; this lock guarantees
    // mutual exclusion.
    mu sync.RWMutex
    // Keys of assumed pods; a key can be used to look up an entry in podStates.
    assumedPods sets.String
    // Map from pod key to podState.
    podStates map[string]*podState
    // Map from node name to nodeInfoListItem.
    nodes map[string]*nodeInfoListItem
    // Each nodeInfoListItem holds one node's info and carries next/prev pointers
    // that form a doubly linked list. headNode is the head of that list and
    // points to the most recently updated node.
    headNode *nodeInfoListItem
    // Node names grouped by zone.
    nodeTree *nodeTree
    // Image information; ignored for now.
    imageStates map[string]*imageState
}
podState holds a pod's information in the cache.
type podState struct {
    // Basic pod information.
    pod *v1.Pod
    // Expiration deadline of the assumed pod.
    deadline *time.Time
    // Whether binding has finished; the assumed pod's expiration countdown
    // starts only after binding is done.
    bindingFinished bool
}
A nodeInfoListItem is not only stored in the nodes map but also linked into a doubly linked list, with the most recently updated nodes at the head; taking a snapshot walks this list.
type nodeInfoListItem struct {
    // Node information, including all pods on the node.
    info *framework.NodeInfo
    // Pointers implementing the doubly linked list.
    next *nodeInfoListItem
    prev *nodeInfoListItem
}
// NodeInfo is node level aggregated information.
type NodeInfo struct {
    // Basic node information.
    node *v1.Node
    // All pods running on this node.
    Pods []*PodInfo
    // Pods on this node that declare affinity.
    PodsWithAffinity []*PodInfo
    // Pods on this node that declare required anti-affinity.
    PodsWithRequiredAntiAffinity []*PodInfo
    // Ports allocated on the node.
    UsedPorts HostPortInfo
    // Sum of the resources requested by all pods on this node, including assumed
    // pods. "Resources" here means the cpu/memory requests specified in the pod spec.
    Requested *Resource
    // Sum of the requests of all pods on this node, where containers that do not
    // specify a request are charged a generated default value. This prevents
    // many zero-request pods from all being scheduled onto the same node.
    NonZeroRequested *Resource
    // Allocatable resources of the node, as reported by the kubelet.
    Allocatable *Resource
    // Ignored for now.
    ImageStates map[string]*ImageStateSummary
    // Ignored for now.
    PVCRefCounts map[string]int
    // Generation of the node; incremented whenever the node changes in any way.
    // Snapshots only copy nodes whose generation is newer than the snapshot's.
    Generation int64
}
nodeTree stores node names grouped by zone.
type nodeTree struct {
    // Map keyed by zone; the value is the list of node names in that zone.
    tree map[string][]string
    // List of zones.
    zones    []string
    numNodes int
}
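addNode and list are not shown in this article. As a hedged sketch of their behavior (simplified from the upstream nodeTree, which tracks explicit zone/node indexes for its round-robin state): addNode appends a node name under its zone, and list interleaves the zones so that a prefix of the result already spans as many zones as possible.

func (nt *nodeTree) addNode(n *v1.Node) {
    zone := utilnode.GetZoneKey(n)
    if names, ok := nt.tree[zone]; ok {
        for _, name := range names {
            if name == n.Name {
                // Already present; nothing to do.
                return
            }
        }
        nt.tree[zone] = append(names, n.Name)
    } else {
        // First node in this zone.
        nt.zones = append(nt.zones, zone)
        nt.tree[zone] = []string{n.Name}
    }
    nt.numNodes++
}

// list returns node names taken one per zone in turn (round-robin across zones).
func (nt *nodeTree) list() ([]string, error) {
    nodesList := make([]string, 0, nt.numNodes)
    for idx := 0; len(nodesList) < nt.numNodes; idx++ {
        progressed := false
        for _, zone := range nt.zones {
            if nodes := nt.tree[zone]; idx < len(nodes) {
                nodesList = append(nodesList, nodes[idx])
                progressed = true
            }
        }
        if !progressed {
            return nodesList, fmt.Errorf("nodeTree is inconsistent: numNodes=%d", nt.numNodes)
        }
    }
    return nodesList, nil
}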
AssumePod
After a pod is scheduled successfully, AssumePod is called to charge the pod's resource usage to the node it was assigned to.
func (cache *schedulerCache) AssumePod(pod *v1.Pod) error {
    // Get the pod key, i.e. the pod UID.
    key, err := framework.GetPodKey(pod)
    if err != nil {
        return err
    }

    cache.mu.Lock()
    defer cache.mu.Unlock()
    // If the pod is already in the cache, return an error.
    if _, ok := cache.podStates[key]; ok {
        return fmt.Errorf("pod %v is in the cache, so can't be assumed", key)
    }

    // Add the pod to its node.
    cache.addPod(pod)
    ps := &podState{
        pod: pod,
    }
    // Save the pod state.
    cache.podStates[key] = ps
    // Record the key as assumed.
    cache.assumedPods.Insert(key)
    return nil
}

// Assumes that lock is already acquired.
func (cache *schedulerCache) addPod(pod *v1.Pod) {
    n, ok := cache.nodes[pod.Spec.NodeName]
    if !ok {
        // Create a new list item if the node is not present yet.
        n = newNodeInfoListItem(framework.NewNodeInfo())
        cache.nodes[pod.Spec.NodeName] = n
    }
    // Append the pod to the node's pod list and add its requests to the node's Requested.
    n.info.AddPod(pod)
    // Move the node to the head of the list.
    cache.moveNodeInfoToHead(pod.Spec.NodeName)
}
FinishBinding
After the node is bound, FinishBinding notifies the cache that the assumed pod may start its expiration countdown.
func (cache *schedulerCache) FinishBinding(pod *v1.Pod) error {
    return cache.finishBinding(pod, time.Now())
}

// finishBinding exists to make tests deterministic by injecting now as an argument.
func (cache *schedulerCache) finishBinding(pod *v1.Pod, now time.Time) error {
    key, err := framework.GetPodKey(pod)
    if err != nil {
        return err
    }

    cache.mu.RLock()
    defer cache.mu.RUnlock()

    klog.V(5).Infof("Finished binding for pod %v. Can be expired.", key)
    currState, ok := cache.podStates[key]
    if ok && cache.assumedPods.Has(key) {
        dl := now.Add(cache.ttl)
        // Set the flag so the cleanupExpiredAssumedPods goroutine can start
        // expiration handling for this assumed pod.
        currState.bindingFinished = true
        // Set the deadline.
        currState.deadline = &dl
    }
    return nil
}
ForgetPod
ForgetPod removes an assumed pod and subtracts its resources from its node. If anything goes wrong after AssumePod, ForgetPod must be called to clean up.
func (cache *schedulerCache) ForgetPod(pod *v1.Pod) error {
    key, err := framework.GetPodKey(pod)
    if err != nil {
        return err
    }

    cache.mu.Lock()
    defer cache.mu.Unlock()

    currState, ok := cache.podStates[key]
    if ok && currState.pod.Spec.NodeName != pod.Spec.NodeName {
        return fmt.Errorf("pod %v was assumed on %v but assigned to %v", key, pod.Spec.NodeName, currState.pod.Spec.NodeName)
    }

    switch {
    // Only assumed pod can be forgotten.
    case ok && cache.assumedPods.Has(key):
        err := cache.removePod(pod)
        if err != nil {
            return err
        }
        delete(cache.assumedPods, key)
        delete(cache.podStates, key)
    default:
        return fmt.Errorf("pod %v wasn't assumed so cannot be forgotten", key)
    }
    return nil
}

func (cache *schedulerCache) removePod(pod *v1.Pod) error {
    n, ok := cache.nodes[pod.Spec.NodeName]
    if !ok {
        klog.Errorf("node %v not found when trying to remove pod %v", pod.Spec.NodeName, pod.Name)
        return nil
    }
    // Remove the pod and its resource usage from the node.
    if err := n.info.RemovePod(pod); err != nil {
        return err
    }
    // If the node has no pods left and the node itself is gone, drop it from the list;
    if len(n.info.Pods) == 0 && n.info.Node() == nil {
        cache.removeNodeInfoFromList(pod.Spec.NodeName)
    } else {
        // otherwise the node changed, so move it to the head of the list.
        cache.moveNodeInfoToHead(pod.Spec.NodeName)
    }
    return nil
}
AddPod
After a pod is scheduled and bound successfully, the informer delivers a pod-add event and AddPod puts the pod into the cache. If the assumed pod has already expired, it is added back, so that the resources it occupies are counted against its node again.
func (cache *schedulerCache) AddPod(pod *v1.Pod) error {
    key, err := framework.GetPodKey(pod)
    if err != nil {
        return err
    }

    cache.mu.Lock()
    defer cache.mu.Unlock()

    currState, ok := cache.podStates[key]
    switch {
    // The pod is still assumed and has not expired; confirm it.
    case ok && cache.assumedPods.Has(key):
        if currState.pod.Spec.NodeName != pod.Spec.NodeName {
            // The pod was added to a different node than it was assumed to.
            klog.Warningf("Pod %v was assumed to be on %v but got added to %v", key, pod.Spec.NodeName, currState.pod.Spec.NodeName)
            // Clean this up.
            if err = cache.removePod(currState.pod); err != nil {
                klog.Errorf("removing pod error: %v", err)
            }
            cache.addPod(pod)
        }
        // The pod is now confirmed; drop it from the assumed set.
        delete(cache.assumedPods, key)
        // Stop the expiration countdown.
        cache.podStates[key].deadline = nil
        cache.podStates[key].pod = pod
    case !ok:
        // The pod expired; add it back, because expiration already removed its
        // information from the node.
        cache.addPod(pod)
        ps := &podState{
            pod: pod,
        }
        cache.podStates[key] = ps
    default:
        return fmt.Errorf("pod %v was already in added state", key)
    }
    return nil
}
RemovePod
When the informer delivers a pod-delete event, RemovePod removes the pod from the cache and subtracts its resources from the node.
func (cache *schedulerCache) RemovePod(pod *v1.Pod) error {
    key, err := framework.GetPodKey(pod)
    if err != nil {
        return err
    }

    cache.mu.Lock()
    defer cache.mu.Unlock()

    currState, ok := cache.podStates[key]
    switch {
    // An assumed pod won't have Delete/Remove event. It needs to have Add event
    // before Remove event, in which case the state would change from Assumed to Added.
    case ok && !cache.assumedPods.Has(key):
        // The pod's node differs from the recorded one; this is most likely a
        // bug, so crash outright.
        if currState.pod.Spec.NodeName != pod.Spec.NodeName {
            klog.Errorf("Pod %v was assumed to be on %v but got added to %v", key, pod.Spec.NodeName, currState.pod.Spec.NodeName)
            klog.Fatalf("Schedulercache is corrupted and can badly affect scheduling decisions")
        }
        // Remove the pod's information.
        err := cache.removePod(currState.pod)
        if err != nil {
            return err
        }
        delete(cache.podStates, key)
    default:
        return fmt.Errorf("pod %v is not found in scheduler cache, so cannot be removed from it", key)
    }
    return nil
}
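For context, the informer wiring that drives AddPod/UpdatePod/RemovePod can be sketched as follows. This is simplified from pkg/scheduler/eventhandlers.go and hedged accordingly: clientcache stands for k8s.io/client-go/tools/cache, sched.addPodToCache and friends are thin wrappers that call the Cache methods above, and the upstream filter additionally handles tombstone (DeletedFinalStateUnknown) objects.

informerFactory.Core().V1().Pods().Informer().AddEventHandler(
    clientcache.FilteringResourceEventHandler{
        FilterFunc: func(obj interface{}) bool {
            // Only pods already assigned to a node belong in the scheduler
            // cache; unassigned pods stay in the scheduling queue instead.
            pod, ok := obj.(*v1.Pod)
            return ok && len(pod.Spec.NodeName) != 0
        },
        Handler: clientcache.ResourceEventHandlerFuncs{
            AddFunc:    sched.addPodToCache,      // -> cache.AddPod
            UpdateFunc: sched.updatePodInCache,   // -> cache.UpdatePod
            DeleteFunc: sched.deletePodFromCache, // -> cache.RemovePod
        },
    },
)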
AddNode
When the informer delivers a node-add event, AddNode saves the node information into the cache.
func (cache *schedulerCache) AddNode(node *v1.Node) *framework.NodeInfo {
    cache.mu.Lock()
    defer cache.mu.Unlock()

    n, ok := cache.nodes[node.Name]
    if !ok {
        n = newNodeInfoListItem(framework.NewNodeInfo())
        cache.nodes[node.Name] = n
    } else {
        cache.removeNodeImageStates(n.info.Node())
    }
    // The node changed, so move it to the head of the list.
    cache.moveNodeInfoToHead(node.Name)

    cache.nodeTree.addNode(node)
    cache.addNodeImageStates(node, n.info)
    n.info.SetNode(node)
    return n.info.Clone()
}
UpdateSnapshot
Before every scheduling cycle, UpdateSnapshot is called to snapshot the cache into g.nodeInfoSnapshot. The rest of the cycle reads node/pod information from this snapshot rather than from the cache, which would require taking the lock on every access and hurt performance.
//pkg/scheduler/generic_scheduler.go
func (g *genericScheduler) snapshot() error {
    // Used for all fit and priority funcs.
    return g.cache.UpdateSnapshot(g.nodeInfoSnapshot)
}
A snapshot is represented by the following structure:
type Snapshot struct {
    // Map from node name to node info.
    nodeInfoMap map[string]*framework.NodeInfo
    // List of node infos.
    nodeInfoList []*framework.NodeInfo
    // List of node infos; each node in it has at least one pod declaring affinity.
    havePodsWithAffinityNodeInfoList []*framework.NodeInfo
    // List of node infos; each node in it has at least one pod declaring
    // required anti-affinity.
    havePodsWithRequiredAntiAffinityNodeInfoList []*framework.NodeInfo
    // NodeInfo also has a Generation field: whenever a node changes, a global
    // generation counter is incremented and assigned to that NodeInfo. The
    // generation here is set to the generation of the most recently updated
    // NodeInfo, so any NodeInfo whose generation is less than or equal to this
    // value has not changed since the last snapshot and needs no update.
    generation int64
}
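Where do the generation values come from? As a hedged sketch of the mechanism (mirroring nextGeneration in pkg/scheduler/framework/types.go): a package-level counter is bumped atomically on every NodeInfo mutation, and the new value is stored in that NodeInfo's Generation field.

import "sync/atomic"

// Global generation counter shared by all NodeInfos.
var generation int64

// nextGeneration returns a monotonically increasing generation number; every
// mutation of a NodeInfo calls this and records the result in its Generation.
func nextGeneration() int64 {
    return atomic.AddInt64(&generation, 1)
}

Because every update also moves the node to the head of the doubly linked list, the list stays ordered by descending Generation, which is what lets UpdateSnapshot below stop walking early.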
//pkg/scheduler/internal/cache/cache.go
func (cache *schedulerCache) UpdateSnapshot(nodeSnapshot *Snapshot) error {
    cache.mu.Lock()
    defer cache.mu.Unlock()

    // Get the snapshot's generation.
    snapshotGeneration := nodeSnapshot.generation

    // NodeInfoList and HavePodsWithAffinityNodeInfoList must be re-created if a node was added
    // or removed from the cache.
    updateAllLists := false
    // HavePodsWithAffinityNodeInfoList must be re-created if a node changed its
    // status from having pods with affinity to NOT having pods with affinity or the other
    // way around.
    updateNodesHavePodsWithAffinity := false
    // HavePodsWithRequiredAntiAffinityNodeInfoList must be re-created if a node changed its
    // status from having pods with required anti-affinity to NOT having pods with required
    // anti-affinity or the other way around.
    updateNodesHavePodsWithRequiredAntiAffinity := false

    // Walk the cache's doubly linked list; its items are ordered by descending generation.
    for node := cache.headNode; node != nil; node = node.next {
        // Incremental update: only copy nodes whose node.info.Generation is
        // greater than snapshotGeneration.
        if node.info.Generation <= snapshotGeneration {
            // all the nodes are updated before the existing snapshot. We are done.
            break
        }
        if np := node.info.Node(); np != nil {
            existing, ok := nodeSnapshot.nodeInfoMap[np.Name]
            // The snapshot does not have this node yet.
            if !ok {
                // Flag a full rebuild of the lists.
                updateAllLists = true
                existing = &framework.NodeInfo{}
                nodeSnapshot.nodeInfoMap[np.Name] = existing
            }
            // Clone the node info.
            clone := node.info.Clone()
            // The node flipped between having and not having pods that declare affinity.
            if (len(existing.PodsWithAffinity) > 0) != (len(clone.PodsWithAffinity) > 0) {
                updateNodesHavePodsWithAffinity = true
            }
            // The node flipped between having and not having pods that declare
            // required anti-affinity.
            if (len(existing.PodsWithRequiredAntiAffinity) > 0) != (len(clone.PodsWithRequiredAntiAffinity) > 0) {
                updateNodesHavePodsWithRequiredAntiAffinity = true
            }
            // Copy the latest node info into the snapshot.
            *existing = *clone
        }
    }
    // Record the generation of the most recently updated node in the snapshot.
    if cache.headNode != nil {
        nodeSnapshot.generation = cache.headNode.info.Generation
    }

    // The snapshot holds more nodes than the cache, so some nodes were deleted;
    // this also requires a full rebuild of the lists.
    if len(nodeSnapshot.nodeInfoMap) > cache.nodeTree.numNodes {
        cache.removeDeletedNodesFromSnapshot(nodeSnapshot)
        updateAllLists = true
    }

    // Rebuild the lists via updateNodeInfoSnapshotList if anything changed.
    if updateAllLists || updateNodesHavePodsWithAffinity || updateNodesHavePodsWithRequiredAntiAffinity {
        cache.updateNodeInfoSnapshotList(nodeSnapshot, updateAllLists)
    }

    if len(nodeSnapshot.nodeInfoList) != cache.nodeTree.numNodes {
        errMsg := fmt.Sprintf("snapshot state is not consistent, length of NodeInfoList=%v not equal to length of nodes in tree=%v "+
            ", length of NodeInfoMap=%v, length of nodes in cache=%v"+
            ", trying to recover",
            len(nodeSnapshot.nodeInfoList), cache.nodeTree.numNodes,
            len(nodeSnapshot.nodeInfoMap), len(cache.nodes))
        klog.Error(errMsg)
        // We will try to recover by re-creating the lists for the next scheduling cycle, but still return an
        // error to surface the problem, the error will likely cause a failure to the current scheduling cycle.
        cache.updateNodeInfoSnapshotList(nodeSnapshot, true)
        return fmt.Errorf(errMsg)
    }

    return nil
}

func (cache *schedulerCache) updateNodeInfoSnapshotList(snapshot *Snapshot, updateAll bool) {
    snapshot.havePodsWithAffinityNodeInfoList = make([]*framework.NodeInfo, 0, cache.nodeTree.numNodes)
    snapshot.havePodsWithRequiredAntiAffinityNodeInfoList = make([]*framework.NodeInfo, 0, cache.nodeTree.numNodes)
    // Full rebuild.
    if updateAll {
        // Take a snapshot of the nodes order in the tree.
        snapshot.nodeInfoList = make([]*framework.NodeInfo, 0, cache.nodeTree.numNodes)
        // Get the list of node names from the node tree.
        nodesList, err := cache.nodeTree.list()
        if err != nil {
            klog.Error(err)
        }
        for _, nodeName := range nodesList {
            if nodeInfo := snapshot.nodeInfoMap[nodeName]; nodeInfo != nil {
                snapshot.nodeInfoList = append(snapshot.nodeInfoList, nodeInfo)
                if len(nodeInfo.PodsWithAffinity) > 0 {
                    snapshot.havePodsWithAffinityNodeInfoList = append(snapshot.havePodsWithAffinityNodeInfoList, nodeInfo)
                }
                if len(nodeInfo.PodsWithRequiredAntiAffinity) > 0 {
                    snapshot.havePodsWithRequiredAntiAffinityNodeInfoList = append(snapshot.havePodsWithRequiredAntiAffinityNodeInfoList, nodeInfo)
                }
            } else {
                klog.Errorf("node %q exist in nodeTree but not in NodeInfoMap, this should not happen.", nodeName)
            }
        }
    } else {
        // Only rebuild the affinity/anti-affinity lists.
        for _, nodeInfo := range snapshot.nodeInfoList {
            if len(nodeInfo.PodsWithAffinity) > 0 {
                snapshot.havePodsWithAffinityNodeInfoList = append(snapshot.havePodsWithAffinityNodeInfoList, nodeInfo)
            }
            if len(nodeInfo.PodsWithRequiredAntiAffinity) > 0 {
                snapshot.havePodsWithRequiredAntiAffinityNodeInfoList = append(snapshot.havePodsWithRequiredAntiAffinityNodeInfoList, nodeInfo)
            }
        }
    }
}
run
The scheduler cache starts a goroutine that periodically cleans up expired assumed pods.
func (cache *schedulerCache) run() {
    // period is cleanAssumedPeriod = 1 * time.Second.
    go wait.Until(cache.cleanupExpiredAssumedPods, cache.period, cache.stop)
}

func (cache *schedulerCache) cleanupExpiredAssumedPods() {
    cache.cleanupAssumedPods(time.Now())
}

// cleanupAssumedPods exists for making test deterministic by taking time as input argument.
// It also reports metrics on the cache size for nodes, pods, and assumed pods.
func (cache *schedulerCache) cleanupAssumedPods(now time.Time) {
    cache.mu.Lock()
    defer cache.mu.Unlock()
    defer cache.updateMetrics()

    // The size of assumedPods should be small
    for key := range cache.assumedPods {
        ps, ok := cache.podStates[key]
        if !ok {
            klog.Fatal("Key found in assumed set but not in podStates. Potentially a logical error.")
        }
        // The pod has not finished binding yet, so it cannot expire.
        if !ps.bindingFinished {
            klog.V(5).Infof("Couldn't expire cache for pod %v/%v. Binding is still in progress.",
                ps.pod.Namespace, ps.pod.Name)
            continue
        }
        // The assumed pod expired; remove it from the cache.
        if now.After(*ps.deadline) {
            klog.Warningf("Pod %s/%s expired", ps.pod.Namespace, ps.pod.Name)
            if err := cache.expirePod(key, ps); err != nil {
                klog.Errorf("ExpirePod failed for %s: %v", key, err)
            }
        }
    }
}