diff options
Diffstat (limited to 'vendor/github.com/containerd/cgroups/memory.go')
-rw-r--r-- | vendor/github.com/containerd/cgroups/memory.go | 480 |
1 files changed, 480 insertions, 0 deletions
diff --git a/vendor/github.com/containerd/cgroups/memory.go b/vendor/github.com/containerd/cgroups/memory.go new file mode 100644 index 000000000..e271866ef --- /dev/null +++ b/vendor/github.com/containerd/cgroups/memory.go @@ -0,0 +1,480 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package cgroups + +import ( + "bufio" + "fmt" + "io" + "os" + "path/filepath" + "strconv" + "strings" + + v1 "github.com/containerd/cgroups/stats/v1" + specs "github.com/opencontainers/runtime-spec/specs-go" + "golang.org/x/sys/unix" +) + +// MemoryEvent is an interface that V1 memory Cgroup notifications implement. Arg returns the +// file name whose fd should be written to "cgroups.event_control". EventFile returns the name of +// the file that supports the notification api e.g. "memory.usage_in_bytes". +type MemoryEvent interface { + Arg() string + EventFile() string +} + +type memoryThresholdEvent struct { + threshold uint64 + swap bool +} + +// MemoryThresholdEvent returns a new memory threshold event to be used with RegisterMemoryEvent. +// If swap is true, the event will be registered using memory.memsw.usage_in_bytes +func MemoryThresholdEvent(threshold uint64, swap bool) MemoryEvent { + return &memoryThresholdEvent{ + threshold, + swap, + } +} + +func (m *memoryThresholdEvent) Arg() string { + return strconv.FormatUint(m.threshold, 10) +} + +func (m *memoryThresholdEvent) EventFile() string { + if m.swap { + return "memory.memsw.usage_in_bytes" + } + return "memory.usage_in_bytes" +} + +type oomEvent struct{} + +// OOMEvent returns a new oom event to be used with RegisterMemoryEvent. +func OOMEvent() MemoryEvent { + return &oomEvent{} +} + +func (oom *oomEvent) Arg() string { + return "" +} + +func (oom *oomEvent) EventFile() string { + return "memory.oom_control" +} + +type memoryPressureEvent struct { + pressureLevel MemoryPressureLevel + hierarchy EventNotificationMode +} + +// MemoryPressureEvent returns a new memory pressure event to be used with RegisterMemoryEvent. +func MemoryPressureEvent(pressureLevel MemoryPressureLevel, hierarchy EventNotificationMode) MemoryEvent { + return &memoryPressureEvent{ + pressureLevel, + hierarchy, + } +} + +func (m *memoryPressureEvent) Arg() string { + return string(m.pressureLevel) + "," + string(m.hierarchy) +} + +func (m *memoryPressureEvent) EventFile() string { + return "memory.pressure_level" +} + +// MemoryPressureLevel corresponds to the memory pressure levels defined +// for memory cgroups. +type MemoryPressureLevel string + +// The three memory pressure levels are as follows. +// - The "low" level means that the system is reclaiming memory for new +// allocations. Monitoring this reclaiming activity might be useful for +// maintaining cache level. Upon notification, the program (typically +// "Activity Manager") might analyze vmstat and act in advance (i.e. +// prematurely shutdown unimportant services). +// - The "medium" level means that the system is experiencing medium memory +// pressure, the system might be making swap, paging out active file caches, +// etc. Upon this event applications may decide to further analyze +// vmstat/zoneinfo/memcg or internal memory usage statistics and free any +// resources that can be easily reconstructed or re-read from a disk. +// - The "critical" level means that the system is actively thrashing, it is +// about to out of memory (OOM) or even the in-kernel OOM killer is on its +// way to trigger. Applications should do whatever they can to help the +// system. It might be too late to consult with vmstat or any other +// statistics, so it is advisable to take an immediate action. +// "https://www.kernel.org/doc/Documentation/cgroup-v1/memory.txt" Section 11 +const ( + LowPressure MemoryPressureLevel = "low" + MediumPressure MemoryPressureLevel = "medium" + CriticalPressure MemoryPressureLevel = "critical" +) + +// EventNotificationMode corresponds to the notification modes +// for the memory cgroups pressure level notifications. +type EventNotificationMode string + +// There are three optional modes that specify different propagation behavior: +// - "default": this is the default behavior specified above. This mode is the +// same as omitting the optional mode parameter, preserved by backwards +// compatibility. +// - "hierarchy": events always propagate up to the root, similar to the default +// behavior, except that propagation continues regardless of whether there are +// event listeners at each level, with the "hierarchy" mode. In the above +// example, groups A, B, and C will receive notification of memory pressure. +// - "local": events are pass-through, i.e. they only receive notifications when +// memory pressure is experienced in the memcg for which the notification is +// registered. In the above example, group C will receive notification if +// registered for "local" notification and the group experiences memory +// pressure. However, group B will never receive notification, regardless if +// there is an event listener for group C or not, if group B is registered for +// local notification. +// "https://www.kernel.org/doc/Documentation/cgroup-v1/memory.txt" Section 11 +const ( + DefaultMode EventNotificationMode = "default" + LocalMode EventNotificationMode = "local" + HierarchyMode EventNotificationMode = "hierarchy" +) + +// NewMemory returns a Memory controller given the root folder of cgroups. +// It may optionally accept other configuration options, such as IgnoreModules(...) +func NewMemory(root string, options ...func(*memoryController)) *memoryController { + mc := &memoryController{ + root: filepath.Join(root, string(Memory)), + ignored: map[string]struct{}{}, + } + for _, opt := range options { + opt(mc) + } + return mc +} + +// IgnoreModules configure the memory controller to not read memory metrics for some +// module names (e.g. passing "memsw" would avoid all the memory.memsw.* entries) +func IgnoreModules(names ...string) func(*memoryController) { + return func(mc *memoryController) { + for _, name := range names { + mc.ignored[name] = struct{}{} + } + } +} + +// OptionalSwap allows the memory controller to not fail if cgroups is not accounting +// Swap memory (there are no memory.memsw.* entries) +func OptionalSwap() func(*memoryController) { + return func(mc *memoryController) { + _, err := os.Stat(filepath.Join(mc.root, "memory.memsw.usage_in_bytes")) + if os.IsNotExist(err) { + mc.ignored["memsw"] = struct{}{} + } + } +} + +type memoryController struct { + root string + ignored map[string]struct{} +} + +func (m *memoryController) Name() Name { + return Memory +} + +func (m *memoryController) Path(path string) string { + return filepath.Join(m.root, path) +} + +func (m *memoryController) Create(path string, resources *specs.LinuxResources) error { + if err := os.MkdirAll(m.Path(path), defaultDirPerm); err != nil { + return err + } + if resources.Memory == nil { + return nil + } + return m.set(path, getMemorySettings(resources)) +} + +func (m *memoryController) Update(path string, resources *specs.LinuxResources) error { + if resources.Memory == nil { + return nil + } + g := func(v *int64) bool { + return v != nil && *v > 0 + } + settings := getMemorySettings(resources) + if g(resources.Memory.Limit) && g(resources.Memory.Swap) { + // if the updated swap value is larger than the current memory limit set the swap changes first + // then set the memory limit as swap must always be larger than the current limit + current, err := readUint(filepath.Join(m.Path(path), "memory.limit_in_bytes")) + if err != nil { + return err + } + if current < uint64(*resources.Memory.Swap) { + settings[0], settings[1] = settings[1], settings[0] + } + } + return m.set(path, settings) +} + +func (m *memoryController) Stat(path string, stats *v1.Metrics) error { + fMemStat, err := os.Open(filepath.Join(m.Path(path), "memory.stat")) + if err != nil { + return err + } + defer fMemStat.Close() + stats.Memory = &v1.MemoryStat{ + Usage: &v1.MemoryEntry{}, + Swap: &v1.MemoryEntry{}, + Kernel: &v1.MemoryEntry{}, + KernelTCP: &v1.MemoryEntry{}, + } + if err := m.parseStats(fMemStat, stats.Memory); err != nil { + return err + } + + fMemOomControl, err := os.Open(filepath.Join(m.Path(path), "memory.oom_control")) + if err != nil { + return err + } + defer fMemOomControl.Close() + stats.MemoryOomControl = &v1.MemoryOomControl{} + if err := m.parseOomControlStats(fMemOomControl, stats.MemoryOomControl); err != nil { + return err + } + for _, t := range []struct { + module string + entry *v1.MemoryEntry + }{ + { + module: "", + entry: stats.Memory.Usage, + }, + { + module: "memsw", + entry: stats.Memory.Swap, + }, + { + module: "kmem", + entry: stats.Memory.Kernel, + }, + { + module: "kmem.tcp", + entry: stats.Memory.KernelTCP, + }, + } { + if _, ok := m.ignored[t.module]; ok { + continue + } + for _, tt := range []struct { + name string + value *uint64 + }{ + { + name: "usage_in_bytes", + value: &t.entry.Usage, + }, + { + name: "max_usage_in_bytes", + value: &t.entry.Max, + }, + { + name: "failcnt", + value: &t.entry.Failcnt, + }, + { + name: "limit_in_bytes", + value: &t.entry.Limit, + }, + } { + parts := []string{"memory"} + if t.module != "" { + parts = append(parts, t.module) + } + parts = append(parts, tt.name) + v, err := readUint(filepath.Join(m.Path(path), strings.Join(parts, "."))) + if err != nil { + return err + } + *tt.value = v + } + } + return nil +} + +func (m *memoryController) parseStats(r io.Reader, stat *v1.MemoryStat) error { + var ( + raw = make(map[string]uint64) + sc = bufio.NewScanner(r) + line int + ) + for sc.Scan() { + key, v, err := parseKV(sc.Text()) + if err != nil { + return fmt.Errorf("%d: %v", line, err) + } + raw[key] = v + line++ + } + if err := sc.Err(); err != nil { + return err + } + stat.Cache = raw["cache"] + stat.RSS = raw["rss"] + stat.RSSHuge = raw["rss_huge"] + stat.MappedFile = raw["mapped_file"] + stat.Dirty = raw["dirty"] + stat.Writeback = raw["writeback"] + stat.PgPgIn = raw["pgpgin"] + stat.PgPgOut = raw["pgpgout"] + stat.PgFault = raw["pgfault"] + stat.PgMajFault = raw["pgmajfault"] + stat.InactiveAnon = raw["inactive_anon"] + stat.ActiveAnon = raw["active_anon"] + stat.InactiveFile = raw["inactive_file"] + stat.ActiveFile = raw["active_file"] + stat.Unevictable = raw["unevictable"] + stat.HierarchicalMemoryLimit = raw["hierarchical_memory_limit"] + stat.HierarchicalSwapLimit = raw["hierarchical_memsw_limit"] + stat.TotalCache = raw["total_cache"] + stat.TotalRSS = raw["total_rss"] + stat.TotalRSSHuge = raw["total_rss_huge"] + stat.TotalMappedFile = raw["total_mapped_file"] + stat.TotalDirty = raw["total_dirty"] + stat.TotalWriteback = raw["total_writeback"] + stat.TotalPgPgIn = raw["total_pgpgin"] + stat.TotalPgPgOut = raw["total_pgpgout"] + stat.TotalPgFault = raw["total_pgfault"] + stat.TotalPgMajFault = raw["total_pgmajfault"] + stat.TotalInactiveAnon = raw["total_inactive_anon"] + stat.TotalActiveAnon = raw["total_active_anon"] + stat.TotalInactiveFile = raw["total_inactive_file"] + stat.TotalActiveFile = raw["total_active_file"] + stat.TotalUnevictable = raw["total_unevictable"] + return nil +} + +func (m *memoryController) parseOomControlStats(r io.Reader, stat *v1.MemoryOomControl) error { + var ( + raw = make(map[string]uint64) + sc = bufio.NewScanner(r) + line int + ) + for sc.Scan() { + key, v, err := parseKV(sc.Text()) + if err != nil { + return fmt.Errorf("%d: %v", line, err) + } + raw[key] = v + line++ + } + if err := sc.Err(); err != nil { + return err + } + stat.OomKillDisable = raw["oom_kill_disable"] + stat.UnderOom = raw["under_oom"] + stat.OomKill = raw["oom_kill"] + return nil +} + +func (m *memoryController) set(path string, settings []memorySettings) error { + for _, t := range settings { + if t.value != nil { + if err := retryingWriteFile( + filepath.Join(m.Path(path), "memory."+t.name), + []byte(strconv.FormatInt(*t.value, 10)), + defaultFilePerm, + ); err != nil { + return err + } + } + } + return nil +} + +type memorySettings struct { + name string + value *int64 +} + +func getMemorySettings(resources *specs.LinuxResources) []memorySettings { + mem := resources.Memory + var swappiness *int64 + if mem.Swappiness != nil { + v := int64(*mem.Swappiness) + swappiness = &v + } + return []memorySettings{ + { + name: "limit_in_bytes", + value: mem.Limit, + }, + { + name: "soft_limit_in_bytes", + value: mem.Reservation, + }, + { + name: "memsw.limit_in_bytes", + value: mem.Swap, + }, + { + name: "kmem.limit_in_bytes", + value: mem.Kernel, + }, + { + name: "kmem.tcp.limit_in_bytes", + value: mem.KernelTCP, + }, + { + name: "oom_control", + value: getOomControlValue(mem), + }, + { + name: "swappiness", + value: swappiness, + }, + } +} + +func getOomControlValue(mem *specs.LinuxMemory) *int64 { + if mem.DisableOOMKiller != nil && *mem.DisableOOMKiller { + i := int64(1) + return &i + } + return nil +} + +func (m *memoryController) memoryEvent(path string, event MemoryEvent) (uintptr, error) { + root := m.Path(path) + efd, err := unix.Eventfd(0, unix.EFD_CLOEXEC) + if err != nil { + return 0, err + } + evtFile, err := os.Open(filepath.Join(root, event.EventFile())) + if err != nil { + unix.Close(efd) + return 0, err + } + defer evtFile.Close() + data := fmt.Sprintf("%d %d %s", efd, evtFile.Fd(), event.Arg()) + evctlPath := filepath.Join(root, "cgroup.event_control") + if err := retryingWriteFile(evctlPath, []byte(data), 0700); err != nil { + unix.Close(efd) + return 0, err + } + return uintptr(efd), nil +} |