summaryrefslogtreecommitdiff
path: root/vendor/github.com/containerd/cgroups/v3/cgroup2/manager.go
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/github.com/containerd/cgroups/v3/cgroup2/manager.go')
-rw-r--r--vendor/github.com/containerd/cgroups/v3/cgroup2/manager.go990
1 files changed, 990 insertions, 0 deletions
diff --git a/vendor/github.com/containerd/cgroups/v3/cgroup2/manager.go b/vendor/github.com/containerd/cgroups/v3/cgroup2/manager.go
new file mode 100644
index 000000000..fc9fcf453
--- /dev/null
+++ b/vendor/github.com/containerd/cgroups/v3/cgroup2/manager.go
@@ -0,0 +1,990 @@
+/*
+ Copyright The containerd Authors.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+package cgroup2
+
+import (
+ "bufio"
+ "context"
+ "errors"
+ "fmt"
+ "io"
+ "math"
+ "os"
+ "path/filepath"
+ "strconv"
+ "strings"
+ "syscall"
+ "time"
+
+ "github.com/containerd/cgroups/v3/cgroup2/stats"
+
+ systemdDbus "github.com/coreos/go-systemd/v22/dbus"
+ "github.com/godbus/dbus/v5"
+ "github.com/opencontainers/runtime-spec/specs-go"
+ "github.com/sirupsen/logrus"
+ "golang.org/x/sys/unix"
+)
+
+const (
+ subtreeControl = "cgroup.subtree_control"
+ controllersFile = "cgroup.controllers"
+ killFile = "cgroup.kill"
+ defaultCgroup2Path = "/sys/fs/cgroup"
+ defaultSlice = "system.slice"
+)
+
+var (
+ canDelegate bool
+)
+
+type Event struct {
+ Low uint64
+ High uint64
+ Max uint64
+ OOM uint64
+ OOMKill uint64
+}
+
+// Resources for a cgroups v2 unified hierarchy
+type Resources struct {
+ CPU *CPU
+ Memory *Memory
+ Pids *Pids
+ IO *IO
+ RDMA *RDMA
+ HugeTlb *HugeTlb
+ // When len(Devices) is zero, devices are not controlled
+ Devices []specs.LinuxDeviceCgroup
+}
+
+// Values returns the raw filenames and values that
+// can be written to the unified hierarchy
+func (r *Resources) Values() (o []Value) {
+ if r.CPU != nil {
+ o = append(o, r.CPU.Values()...)
+ }
+ if r.Memory != nil {
+ o = append(o, r.Memory.Values()...)
+ }
+ if r.Pids != nil {
+ o = append(o, r.Pids.Values()...)
+ }
+ if r.IO != nil {
+ o = append(o, r.IO.Values()...)
+ }
+ if r.RDMA != nil {
+ o = append(o, r.RDMA.Values()...)
+ }
+ if r.HugeTlb != nil {
+ o = append(o, r.HugeTlb.Values()...)
+ }
+ return o
+}
+
+// EnabledControllers returns the list of all not nil resource controllers
+func (r *Resources) EnabledControllers() (c []string) {
+ if r.CPU != nil {
+ c = append(c, "cpu")
+ c = append(c, "cpuset")
+ }
+ if r.Memory != nil {
+ c = append(c, "memory")
+ }
+ if r.Pids != nil {
+ c = append(c, "pids")
+ }
+ if r.IO != nil {
+ c = append(c, "io")
+ }
+ if r.RDMA != nil {
+ c = append(c, "rdma")
+ }
+ if r.HugeTlb != nil {
+ c = append(c, "hugetlb")
+ }
+ return
+}
+
+// Value of a cgroup setting
+type Value struct {
+ filename string
+ value interface{}
+}
+
+// write the value to the full, absolute path, of a unified hierarchy
+func (c *Value) write(path string, perm os.FileMode) error {
+ var data []byte
+ switch t := c.value.(type) {
+ case uint64:
+ data = []byte(strconv.FormatUint(t, 10))
+ case uint16:
+ data = []byte(strconv.FormatUint(uint64(t), 10))
+ case int64:
+ data = []byte(strconv.FormatInt(t, 10))
+ case []byte:
+ data = t
+ case string:
+ data = []byte(t)
+ case CPUMax:
+ data = []byte(t)
+ default:
+ return ErrInvalidFormat
+ }
+
+ return os.WriteFile(
+ filepath.Join(path, c.filename),
+ data,
+ perm,
+ )
+}
+
+func writeValues(path string, values []Value) error {
+ for _, o := range values {
+ if err := o.write(path, defaultFilePerm); err != nil {
+ return err
+ }
+ }
+ return nil
+}
+
+func NewManager(mountpoint string, group string, resources *Resources) (*Manager, error) {
+ if resources == nil {
+ return nil, errors.New("resources reference is nil")
+ }
+ if err := VerifyGroupPath(group); err != nil {
+ return nil, err
+ }
+ path := filepath.Join(mountpoint, group)
+ if err := os.MkdirAll(path, defaultDirPerm); err != nil {
+ return nil, err
+ }
+ m := Manager{
+ unifiedMountpoint: mountpoint,
+ path: path,
+ }
+ if err := m.ToggleControllers(resources.EnabledControllers(), Enable); err != nil {
+ // clean up cgroup dir on failure
+ os.Remove(path)
+ return nil, err
+ }
+ if err := setResources(path, resources); err != nil {
+ os.Remove(path)
+ return nil, err
+ }
+ return &m, nil
+}
+
+type InitConfig struct {
+ mountpoint string
+}
+
+type InitOpts func(c *InitConfig) error
+
+// WithMountpoint sets the unified mountpoint. The default path is /sys/fs/cgroup.
+func WithMountpoint(path string) InitOpts {
+ return func(c *InitConfig) error {
+ c.mountpoint = path
+ return nil
+ }
+}
+
+// Load a cgroup.
+func Load(group string, opts ...InitOpts) (*Manager, error) {
+ c := InitConfig{mountpoint: defaultCgroup2Path}
+ for _, opt := range opts {
+ if err := opt(&c); err != nil {
+ return nil, err
+ }
+ }
+
+ if err := VerifyGroupPath(group); err != nil {
+ return nil, err
+ }
+ path := filepath.Join(c.mountpoint, group)
+ return &Manager{
+ unifiedMountpoint: c.mountpoint,
+ path: path,
+ }, nil
+}
+
+type Manager struct {
+ unifiedMountpoint string
+ path string
+}
+
+func setResources(path string, resources *Resources) error {
+ if resources != nil {
+ if err := writeValues(path, resources.Values()); err != nil {
+ return err
+ }
+ if err := setDevices(path, resources.Devices); err != nil {
+ return err
+ }
+ }
+ return nil
+}
+
+func (c *Manager) RootControllers() ([]string, error) {
+ b, err := os.ReadFile(filepath.Join(c.unifiedMountpoint, controllersFile))
+ if err != nil {
+ return nil, err
+ }
+ return strings.Fields(string(b)), nil
+}
+
+func (c *Manager) Controllers() ([]string, error) {
+ b, err := os.ReadFile(filepath.Join(c.path, controllersFile))
+ if err != nil {
+ return nil, err
+ }
+ return strings.Fields(string(b)), nil
+}
+
+func (c *Manager) Update(resources *Resources) error {
+ return setResources(c.path, resources)
+}
+
+type ControllerToggle int
+
+const (
+ Enable ControllerToggle = iota + 1
+ Disable
+)
+
+func toggleFunc(controllers []string, prefix string) []string {
+ out := make([]string, len(controllers))
+ for i, c := range controllers {
+ out[i] = prefix + c
+ }
+ return out
+}
+
+func (c *Manager) ToggleControllers(controllers []string, t ControllerToggle) error {
+ // when c.path is like /foo/bar/baz, the following files need to be written:
+ // * /sys/fs/cgroup/cgroup.subtree_control
+ // * /sys/fs/cgroup/foo/cgroup.subtree_control
+ // * /sys/fs/cgroup/foo/bar/cgroup.subtree_control
+ // Note that /sys/fs/cgroup/foo/bar/baz/cgroup.subtree_control does not need to be written.
+ split := strings.Split(c.path, "/")
+ var lastErr error
+ for i := range split {
+ f := strings.Join(split[:i], "/")
+ if !strings.HasPrefix(f, c.unifiedMountpoint) || f == c.path {
+ continue
+ }
+ filePath := filepath.Join(f, subtreeControl)
+ if err := c.writeSubtreeControl(filePath, controllers, t); err != nil {
+ // When running as rootless, the user may face EPERM on parent groups, but it is neglible when the
+ // controller is already written.
+ // So we only return the last error.
+ lastErr = fmt.Errorf("failed to write subtree controllers %+v to %q: %w", controllers, filePath, err)
+ } else {
+ lastErr = nil
+ }
+ }
+ return lastErr
+}
+
+func (c *Manager) writeSubtreeControl(filePath string, controllers []string, t ControllerToggle) error {
+ f, err := os.OpenFile(filePath, os.O_WRONLY, 0)
+ if err != nil {
+ return err
+ }
+ defer f.Close()
+ switch t {
+ case Enable:
+ controllers = toggleFunc(controllers, "+")
+ case Disable:
+ controllers = toggleFunc(controllers, "-")
+ }
+ _, err = f.WriteString(strings.Join(controllers, " "))
+ return err
+}
+
+func (c *Manager) NewChild(name string, resources *Resources) (*Manager, error) {
+ if strings.HasPrefix(name, "/") {
+ return nil, errors.New("name must be relative")
+ }
+ path := filepath.Join(c.path, name)
+ if err := os.MkdirAll(path, defaultDirPerm); err != nil {
+ return nil, err
+ }
+ m := Manager{
+ unifiedMountpoint: c.unifiedMountpoint,
+ path: path,
+ }
+ if resources != nil {
+ if err := m.ToggleControllers(resources.EnabledControllers(), Enable); err != nil {
+ // clean up cgroup dir on failure
+ os.Remove(path)
+ return nil, err
+ }
+ }
+ if err := setResources(path, resources); err != nil {
+ // clean up cgroup dir on failure
+ os.Remove(path)
+ return nil, err
+ }
+ return &m, nil
+}
+
+func (c *Manager) AddProc(pid uint64) error {
+ v := Value{
+ filename: cgroupProcs,
+ value: pid,
+ }
+ return writeValues(c.path, []Value{v})
+}
+
+func (c *Manager) AddThread(tid uint64) error {
+ v := Value{
+ filename: cgroupThreads,
+ value: tid,
+ }
+ return writeValues(c.path, []Value{v})
+}
+
+// Kill will try to forcibly exit all of the processes in the cgroup. This is
+// equivalent to sending a SIGKILL to every process. On kernels 5.14 and greater
+// this will use the cgroup.kill file, on anything that doesn't have the cgroup.kill
+// file, a manual process of freezing -> sending a SIGKILL to every process -> thawing
+// will be used.
+func (c *Manager) Kill() error {
+ v := Value{
+ filename: killFile,
+ value: "1",
+ }
+ err := writeValues(c.path, []Value{v})
+ if err == nil {
+ return nil
+ }
+ logrus.Warnf("falling back to slower kill implementation: %s", err)
+ // Fallback to slow method.
+ return c.fallbackKill()
+}
+
+// fallbackKill is a slower fallback to the more modern (kernels 5.14+)
+// approach of writing to the cgroup.kill file. This is heavily pulled
+// from runc's same approach (in signalAllProcesses), with the only differences
+// being this is just tailored to the API exposed in this library, and we don't
+// need to care about signals other than SIGKILL.
+//
+// https://github.com/opencontainers/runc/blob/8da0a0b5675764feaaaaad466f6567a9983fcd08/libcontainer/init_linux.go#L523-L529
+func (c *Manager) fallbackKill() error {
+ if err := c.Freeze(); err != nil {
+ logrus.Warn(err)
+ }
+ pids, err := c.Procs(true)
+ if err != nil {
+ if err := c.Thaw(); err != nil {
+ logrus.Warn(err)
+ }
+ return err
+ }
+ var procs []*os.Process
+ for _, pid := range pids {
+ p, err := os.FindProcess(int(pid))
+ if err != nil {
+ logrus.Warn(err)
+ continue
+ }
+ procs = append(procs, p)
+ if err := p.Signal(unix.SIGKILL); err != nil {
+ logrus.Warn(err)
+ }
+ }
+ if err := c.Thaw(); err != nil {
+ logrus.Warn(err)
+ }
+
+ subreaper, err := getSubreaper()
+ if err != nil {
+ // The error here means that PR_GET_CHILD_SUBREAPER is not
+ // supported because this code might run on a kernel older
+ // than 3.4. We don't want to throw an error in that case,
+ // and we simplify things, considering there is no subreaper
+ // set.
+ subreaper = 0
+ }
+
+ for _, p := range procs {
+ // In case a subreaper has been setup, this code must not
+ // wait for the process. Otherwise, we cannot be sure the
+ // current process will be reaped by the subreaper, while
+ // the subreaper might be waiting for this process in order
+ // to retrieve its exit code.
+ if subreaper == 0 {
+ if _, err := p.Wait(); err != nil {
+ if !errors.Is(err, unix.ECHILD) {
+ logrus.Warnf("wait on pid %d failed: %s", p.Pid, err)
+ }
+ }
+ }
+ }
+ return nil
+}
+
+func (c *Manager) Delete() error {
+ // kernel prevents cgroups with running process from being removed, check the tree is empty
+ processes, err := c.Procs(true)
+ if err != nil {
+ return err
+ }
+ if len(processes) > 0 {
+ return fmt.Errorf("cgroups: unable to remove path %q: still contains running processes", c.path)
+ }
+ return remove(c.path)
+}
+
+func (c *Manager) Procs(recursive bool) ([]uint64, error) {
+ var processes []uint64
+ err := filepath.Walk(c.path, func(p string, info os.FileInfo, err error) error {
+ if err != nil {
+ return err
+ }
+ if !recursive && info.IsDir() {
+ if p == c.path {
+ return nil
+ }
+ return filepath.SkipDir
+ }
+ _, name := filepath.Split(p)
+ if name != cgroupProcs {
+ return nil
+ }
+ procs, err := parseCgroupProcsFile(p)
+ if err != nil {
+ return err
+ }
+ processes = append(processes, procs...)
+ return nil
+ })
+ return processes, err
+}
+
+func (c *Manager) MoveTo(destination *Manager) error {
+ processes, err := c.Procs(true)
+ if err != nil {
+ return err
+ }
+ for _, p := range processes {
+ if err := destination.AddProc(p); err != nil {
+ if strings.Contains(err.Error(), "no such process") {
+ continue
+ }
+ return err
+ }
+ }
+ return nil
+}
+
+var singleValueFiles = []string{
+ "pids.current",
+ "pids.max",
+}
+
+func (c *Manager) Stat() (*stats.Metrics, error) {
+ controllers, err := c.Controllers()
+ if err != nil {
+ return nil, err
+ }
+ out := make(map[string]interface{})
+ for _, controller := range controllers {
+ switch controller {
+ case "cpu", "memory":
+ if err := readKVStatsFile(c.path, controller+".stat", out); err != nil {
+ if os.IsNotExist(err) {
+ continue
+ }
+ return nil, err
+ }
+ }
+ }
+ for _, name := range singleValueFiles {
+ if err := readSingleFile(c.path, name, out); err != nil {
+ if os.IsNotExist(err) {
+ continue
+ }
+ return nil, err
+ }
+ }
+ memoryEvents := make(map[string]interface{})
+ if err := readKVStatsFile(c.path, "memory.events", memoryEvents); err != nil {
+ if !os.IsNotExist(err) {
+ return nil, err
+ }
+ }
+ var metrics stats.Metrics
+
+ metrics.Pids = &stats.PidsStat{
+ Current: getPidValue("pids.current", out),
+ Limit: getPidValue("pids.max", out),
+ }
+ metrics.CPU = &stats.CPUStat{
+ UsageUsec: getUint64Value("usage_usec", out),
+ UserUsec: getUint64Value("user_usec", out),
+ SystemUsec: getUint64Value("system_usec", out),
+ NrPeriods: getUint64Value("nr_periods", out),
+ NrThrottled: getUint64Value("nr_throttled", out),
+ ThrottledUsec: getUint64Value("throttled_usec", out),
+ }
+ metrics.Memory = &stats.MemoryStat{
+ Anon: getUint64Value("anon", out),
+ File: getUint64Value("file", out),
+ KernelStack: getUint64Value("kernel_stack", out),
+ Slab: getUint64Value("slab", out),
+ Sock: getUint64Value("sock", out),
+ Shmem: getUint64Value("shmem", out),
+ FileMapped: getUint64Value("file_mapped", out),
+ FileDirty: getUint64Value("file_dirty", out),
+ FileWriteback: getUint64Value("file_writeback", out),
+ AnonThp: getUint64Value("anon_thp", out),
+ InactiveAnon: getUint64Value("inactive_anon", out),
+ ActiveAnon: getUint64Value("active_anon", out),
+ InactiveFile: getUint64Value("inactive_file", out),
+ ActiveFile: getUint64Value("active_file", out),
+ Unevictable: getUint64Value("unevictable", out),
+ SlabReclaimable: getUint64Value("slab_reclaimable", out),
+ SlabUnreclaimable: getUint64Value("slab_unreclaimable", out),
+ Pgfault: getUint64Value("pgfault", out),
+ Pgmajfault: getUint64Value("pgmajfault", out),
+ WorkingsetRefault: getUint64Value("workingset_refault", out),
+ WorkingsetActivate: getUint64Value("workingset_activate", out),
+ WorkingsetNodereclaim: getUint64Value("workingset_nodereclaim", out),
+ Pgrefill: getUint64Value("pgrefill", out),
+ Pgscan: getUint64Value("pgscan", out),
+ Pgsteal: getUint64Value("pgsteal", out),
+ Pgactivate: getUint64Value("pgactivate", out),
+ Pgdeactivate: getUint64Value("pgdeactivate", out),
+ Pglazyfree: getUint64Value("pglazyfree", out),
+ Pglazyfreed: getUint64Value("pglazyfreed", out),
+ ThpFaultAlloc: getUint64Value("thp_fault_alloc", out),
+ ThpCollapseAlloc: getUint64Value("thp_collapse_alloc", out),
+ Usage: getStatFileContentUint64(filepath.Join(c.path, "memory.current")),
+ UsageLimit: getStatFileContentUint64(filepath.Join(c.path, "memory.max")),
+ SwapUsage: getStatFileContentUint64(filepath.Join(c.path, "memory.swap.current")),
+ SwapLimit: getStatFileContentUint64(filepath.Join(c.path, "memory.swap.max")),
+ }
+ if len(memoryEvents) > 0 {
+ metrics.MemoryEvents = &stats.MemoryEvents{
+ Low: getUint64Value("low", memoryEvents),
+ High: getUint64Value("high", memoryEvents),
+ Max: getUint64Value("max", memoryEvents),
+ Oom: getUint64Value("oom", memoryEvents),
+ OomKill: getUint64Value("oom_kill", memoryEvents),
+ }
+ }
+ metrics.Io = &stats.IOStat{Usage: readIoStats(c.path)}
+ metrics.Rdma = &stats.RdmaStat{
+ Current: rdmaStats(filepath.Join(c.path, "rdma.current")),
+ Limit: rdmaStats(filepath.Join(c.path, "rdma.max")),
+ }
+ metrics.Hugetlb = readHugeTlbStats(c.path)
+
+ return &metrics, nil
+}
+
+func getUint64Value(key string, out map[string]interface{}) uint64 {
+ v, ok := out[key]
+ if !ok {
+ return 0
+ }
+ switch t := v.(type) {
+ case uint64:
+ return t
+ }
+ return 0
+}
+
+func getPidValue(key string, out map[string]interface{}) uint64 {
+ v, ok := out[key]
+ if !ok {
+ return 0
+ }
+ switch t := v.(type) {
+ case uint64:
+ return t
+ case string:
+ if t == "max" {
+ return math.MaxUint64
+ }
+ }
+ return 0
+}
+
+func readSingleFile(path string, file string, out map[string]interface{}) error {
+ f, err := os.Open(filepath.Join(path, file))
+ if err != nil {
+ return err
+ }
+ defer f.Close()
+ data, err := io.ReadAll(f)
+ if err != nil {
+ return err
+ }
+ s := strings.TrimSpace(string(data))
+ v, err := parseUint(s, 10, 64)
+ if err != nil {
+ // if we cannot parse as a uint, parse as a string
+ out[file] = s
+ return nil
+ }
+ out[file] = v
+ return nil
+}
+
+func readKVStatsFile(path string, file string, out map[string]interface{}) error {
+ f, err := os.Open(filepath.Join(path, file))
+ if err != nil {
+ return err
+ }
+ defer f.Close()
+
+ s := bufio.NewScanner(f)
+ for s.Scan() {
+ name, value, err := parseKV(s.Text())
+ if err != nil {
+ return fmt.Errorf("error while parsing %s (line=%q): %w", filepath.Join(path, file), s.Text(), err)
+ }
+ out[name] = value
+ }
+ return s.Err()
+}
+
+func (c *Manager) Freeze() error {
+ return c.freeze(c.path, Frozen)
+}
+
+func (c *Manager) Thaw() error {
+ return c.freeze(c.path, Thawed)
+}
+
+func (c *Manager) freeze(path string, state State) error {
+ values := state.Values()
+ for {
+ if err := writeValues(path, values); err != nil {
+ return err
+ }
+ current, err := fetchState(path)
+ if err != nil {
+ return err
+ }
+ if current == state {
+ return nil
+ }
+ time.Sleep(1 * time.Millisecond)
+ }
+}
+
+func (c *Manager) isCgroupEmpty() bool {
+ // In case of any error we return true so that we exit and don't leak resources
+ out := make(map[string]interface{})
+ if err := readKVStatsFile(c.path, "cgroup.events", out); err != nil {
+ return true
+ }
+ if v, ok := out["populated"]; ok {
+ populated, ok := v.(uint64)
+ if !ok {
+ return true
+ }
+ return populated == 0
+ }
+ return true
+}
+
+// MemoryEventFD returns inotify file descriptor and 'memory.events' inotify watch descriptor
+func (c *Manager) MemoryEventFD() (int, uint32, error) {
+ fpath := filepath.Join(c.path, "memory.events")
+ fd, err := syscall.InotifyInit()
+ if err != nil {
+ return 0, 0, errors.New("failed to create inotify fd")
+ }
+ wd, err := syscall.InotifyAddWatch(fd, fpath, unix.IN_MODIFY)
+ if err != nil {
+ syscall.Close(fd)
+ return 0, 0, fmt.Errorf("failed to add inotify watch for %q: %w", fpath, err)
+ }
+ // monitor to detect process exit/cgroup deletion
+ evpath := filepath.Join(c.path, "cgroup.events")
+ if _, err = syscall.InotifyAddWatch(fd, evpath, unix.IN_MODIFY); err != nil {
+ syscall.Close(fd)
+ return 0, 0, fmt.Errorf("failed to add inotify watch for %q: %w", evpath, err)
+ }
+
+ return fd, uint32(wd), nil
+}
+
+func (c *Manager) EventChan() (<-chan Event, <-chan error) {
+ ec := make(chan Event)
+ errCh := make(chan error, 1)
+ go c.waitForEvents(ec, errCh)
+
+ return ec, errCh
+}
+
+func parseMemoryEvents(out map[string]interface{}) (Event, error) {
+ e := Event{}
+ if v, ok := out["high"]; ok {
+ e.High, ok = v.(uint64)
+ if !ok {
+ return Event{}, fmt.Errorf("cannot convert high to uint64: %+v", v)
+ }
+ }
+ if v, ok := out["low"]; ok {
+ e.Low, ok = v.(uint64)
+ if !ok {
+ return Event{}, fmt.Errorf("cannot convert low to uint64: %+v", v)
+ }
+ }
+ if v, ok := out["max"]; ok {
+ e.Max, ok = v.(uint64)
+ if !ok {
+ return Event{}, fmt.Errorf("cannot convert max to uint64: %+v", v)
+ }
+ }
+ if v, ok := out["oom"]; ok {
+ e.OOM, ok = v.(uint64)
+ if !ok {
+ return Event{}, fmt.Errorf("cannot convert oom to uint64: %+v", v)
+ }
+ }
+ if v, ok := out["oom_kill"]; ok {
+ e.OOMKill, ok = v.(uint64)
+ if !ok {
+ return Event{}, fmt.Errorf("cannot convert oom_kill to uint64: %+v", v)
+ }
+ }
+ return e, nil
+}
+
+func (c *Manager) waitForEvents(ec chan<- Event, errCh chan<- error) {
+ defer close(errCh)
+
+ fd, _, err := c.MemoryEventFD()
+ if err != nil {
+ errCh <- err
+ return
+ }
+ defer syscall.Close(fd)
+
+ for {
+ buffer := make([]byte, syscall.SizeofInotifyEvent*10)
+ bytesRead, err := syscall.Read(fd, buffer)
+ if err != nil {
+ errCh <- err
+ return
+ }
+ if bytesRead >= syscall.SizeofInotifyEvent {
+ out := make(map[string]interface{})
+ if err := readKVStatsFile(c.path, "memory.events", out); err != nil {
+ // When cgroup is deleted read may return -ENODEV instead of -ENOENT from open.
+ if _, statErr := os.Lstat(filepath.Join(c.path, "memory.events")); !os.IsNotExist(statErr) {
+ errCh <- err
+ }
+ return
+ }
+ e, err := parseMemoryEvents(out)
+ if err != nil {
+ errCh <- err
+ return
+ }
+ ec <- e
+ if c.isCgroupEmpty() {
+ return
+ }
+ }
+ }
+}
+
+func setDevices(path string, devices []specs.LinuxDeviceCgroup) error {
+ if len(devices) == 0 {
+ return nil
+ }
+ insts, license, err := DeviceFilter(devices)
+ if err != nil {
+ return err
+ }
+ dirFD, err := unix.Open(path, unix.O_DIRECTORY|unix.O_RDONLY|unix.O_CLOEXEC, 0600)
+ if err != nil {
+ return fmt.Errorf("cannot get dir FD for %s", path)
+ }
+ defer unix.Close(dirFD)
+ if _, err := LoadAttachCgroupDeviceFilter(insts, license, dirFD); err != nil {
+ if !canSkipEBPFError(devices) {
+ return err
+ }
+ }
+ return nil
+}
+
+// getSystemdFullPath returns the full systemd path when creating a systemd slice group.
+// the reason this is necessary is because the "-" character has a special meaning in
+// systemd slice. For example, when creating a slice called "my-group-112233.slice",
+// systemd will create a hierarchy like this:
+//
+// /sys/fs/cgroup/my.slice/my-group.slice/my-group-112233.slice
+func getSystemdFullPath(slice, group string) string {
+ return filepath.Join(defaultCgroup2Path, dashesToPath(slice), dashesToPath(group))
+}
+
+// dashesToPath converts a slice name with dashes to it's corresponding systemd filesystem path.
+func dashesToPath(in string) string {
+ path := ""
+ if strings.HasSuffix(in, ".slice") && strings.Contains(in, "-") {
+ parts := strings.Split(in, "-")
+ for i := range parts {
+ s := strings.Join(parts[0:i+1], "-")
+ if !strings.HasSuffix(s, ".slice") {
+ s += ".slice"
+ }
+ path = filepath.Join(path, s)
+ }
+ } else {
+ path = filepath.Join(path, in)
+ }
+ return path
+}
+
+func NewSystemd(slice, group string, pid int, resources *Resources) (*Manager, error) {
+ if slice == "" {
+ slice = defaultSlice
+ }
+ ctx := context.TODO()
+ path := getSystemdFullPath(slice, group)
+ conn, err := systemdDbus.NewWithContext(ctx)
+ if err != nil {
+ return &Manager{}, err
+ }
+ defer conn.Close()
+
+ properties := []systemdDbus.Property{
+ systemdDbus.PropDescription("cgroup " + group),
+ newSystemdProperty("DefaultDependencies", false),
+ newSystemdProperty("MemoryAccounting", true),
+ newSystemdProperty("CPUAccounting", true),
+ newSystemdProperty("IOAccounting", true),
+ }
+
+ // if we create a slice, the parent is defined via a Wants=
+ if strings.HasSuffix(group, ".slice") {
+ properties = append(properties, systemdDbus.PropWants(defaultSlice))
+ } else {
+ // otherwise, we use Slice=
+ properties = append(properties, systemdDbus.PropSlice(defaultSlice))
+ }
+
+ // only add pid if its valid, -1 is used w/ general slice creation.
+ if pid != -1 {
+ properties = append(properties, newSystemdProperty("PIDs", []uint32{uint32(pid)}))
+ }
+
+ if resources.Memory != nil && resources.Memory.Min != nil && *resources.Memory.Min != 0 {
+ properties = append(properties,
+ newSystemdProperty("MemoryMin", uint64(*resources.Memory.Min)))
+ }
+
+ if resources.Memory != nil && resources.Memory.Max != nil && *resources.Memory.Max != 0 {
+ properties = append(properties,
+ newSystemdProperty("MemoryMax", uint64(*resources.Memory.Max)))
+ }
+
+ if resources.CPU != nil && resources.CPU.Weight != nil && *resources.CPU.Weight != 0 {
+ properties = append(properties,
+ newSystemdProperty("CPUWeight", *resources.CPU.Weight))
+ }
+
+ if resources.CPU != nil && resources.CPU.Max != "" {
+ quota, period := resources.CPU.Max.extractQuotaAndPeriod()
+ // cpu.cfs_quota_us and cpu.cfs_period_us are controlled by systemd.
+ // corresponds to USEC_INFINITY in systemd
+ // if USEC_INFINITY is provided, CPUQuota is left unbound by systemd
+ // always setting a property value ensures we can apply a quota and remove it later
+ cpuQuotaPerSecUSec := uint64(math.MaxUint64)
+ if quota > 0 {
+ // systemd converts CPUQuotaPerSecUSec (microseconds per CPU second) to CPUQuota
+ // (integer percentage of CPU) internally. This means that if a fractional percent of
+ // CPU is indicated by Resources.CpuQuota, we need to round up to the nearest
+ // 10ms (1% of a second) such that child cgroups can set the cpu.cfs_quota_us they expect.
+ cpuQuotaPerSecUSec = uint64(quota*1000000) / period
+ if cpuQuotaPerSecUSec%10000 != 0 {
+ cpuQuotaPerSecUSec = ((cpuQuotaPerSecUSec / 10000) + 1) * 10000
+ }
+ }
+ properties = append(properties,
+ newSystemdProperty("CPUQuotaPerSecUSec", cpuQuotaPerSecUSec))
+ }
+
+ // If we can delegate, we add the property back in
+ if canDelegate {
+ properties = append(properties, newSystemdProperty("Delegate", true))
+ }
+
+ if resources.Pids != nil && resources.Pids.Max > 0 {
+ properties = append(properties,
+ newSystemdProperty("TasksAccounting", true),
+ newSystemdProperty("TasksMax", uint64(resources.Pids.Max)))
+ }
+
+ statusChan := make(chan string, 1)
+ if _, err := conn.StartTransientUnitContext(ctx, group, "replace", properties, statusChan); err == nil {
+ select {
+ case <-statusChan:
+ case <-time.After(time.Second):
+ logrus.Warnf("Timed out while waiting for StartTransientUnit(%s) completion signal from dbus. Continuing...", group)
+ }
+ } else if !isUnitExists(err) {
+ return &Manager{}, err
+ }
+
+ return &Manager{
+ path: path,
+ }, nil
+}
+
+func LoadSystemd(slice, group string) (*Manager, error) {
+ if slice == "" {
+ slice = defaultSlice
+ }
+ path := getSystemdFullPath(slice, group)
+ return &Manager{
+ path: path,
+ }, nil
+}
+
+func (c *Manager) DeleteSystemd() error {
+ ctx := context.TODO()
+ conn, err := systemdDbus.NewWithContext(ctx)
+ if err != nil {
+ return err
+ }
+ defer conn.Close()
+ group := systemdUnitFromPath(c.path)
+ ch := make(chan string)
+ _, err = conn.StopUnitContext(ctx, group, "replace", ch)
+ if err != nil {
+ return err
+ }
+ <-ch
+ return nil
+}
+
+func newSystemdProperty(name string, units interface{}) systemdDbus.Property {
+ return systemdDbus.Property{
+ Name: name,
+ Value: dbus.MakeVariant(units),
+ }
+}