Source file
src/syscall/exec_linux.go
1
2
3
4
5
6
7 package syscall
8
9 import (
10 errpkg "errors"
11 "internal/itoa"
12 "runtime"
13 "unsafe"
14 )
15
16
17
// Flag bits for the clone family of Linux system calls. Every flag occupies
// exactly one bit, so each is spelled as a shift with the resulting hex value
// alongside for cross-checking against kernel headers.
const (
	CLONE_VM             = 1 << 8  // 0x00000100
	CLONE_FS             = 1 << 9  // 0x00000200
	CLONE_FILES          = 1 << 10 // 0x00000400
	CLONE_SIGHAND        = 1 << 11 // 0x00000800
	CLONE_PIDFD          = 1 << 12 // 0x00001000
	CLONE_PTRACE         = 1 << 13 // 0x00002000
	CLONE_VFORK          = 1 << 14 // 0x00004000
	CLONE_PARENT         = 1 << 15 // 0x00008000
	CLONE_THREAD         = 1 << 16 // 0x00010000
	CLONE_NEWNS          = 1 << 17 // 0x00020000
	CLONE_SYSVSEM        = 1 << 18 // 0x00040000
	CLONE_SETTLS         = 1 << 19 // 0x00080000
	CLONE_PARENT_SETTID  = 1 << 20 // 0x00100000
	CLONE_CHILD_CLEARTID = 1 << 21 // 0x00200000
	CLONE_DETACHED       = 1 << 22 // 0x00400000
	CLONE_UNTRACED       = 1 << 23 // 0x00800000
	CLONE_CHILD_SETTID   = 1 << 24 // 0x01000000
	CLONE_NEWCGROUP      = 1 << 25 // 0x02000000
	CLONE_NEWUTS         = 1 << 26 // 0x04000000
	CLONE_NEWIPC         = 1 << 27 // 0x08000000
	CLONE_NEWUSER        = 1 << 28 // 0x10000000
	CLONE_NEWPID         = 1 << 29 // 0x20000000
	CLONE_NEWNET         = 1 << 30 // 0x40000000
	CLONE_IO             = 1 << 31 // 0x80000000

	// These two flags lie above bit 31. In this file CLONE_INTO_CGROUP is
	// only ever ORed into the 64-bit flags word of the clone3 argument block.

	CLONE_CLEAR_SIGHAND = 1 << 32 // 0x100000000
	CLONE_INTO_CGROUP   = 1 << 33 // 0x200000000

	// CLONE_NEWTIME sits in the low byte of the flags word, which the plain
	// clone path in this file fills with the exit signal; requesting it
	// therefore routes process creation through clone3.

	CLONE_NEWTIME = 1 << 7 // 0x00000080
)
54
55
56
57
58
59
60
// SysProcIDMap describes one ID-mapping entry. formatIDMappings renders each
// entry as the line "ContainerID HostID Size\n", and those lines are what
// gets written to a process's uid_map or gid_map file under /proc.
type SysProcIDMap struct {
	// ContainerID is the first number on the rendered line.
	ContainerID int
	// HostID is the second number on the rendered line.
	HostID int
	// Size is the third number on the rendered line.
	Size int
}
66
67 type SysProcAttr struct {
68 Chroot string
69 Credential *Credential
70
71
72
73 Ptrace bool
74 Setsid bool
75
76
77 Setpgid bool
78
79
80
81
82 Setctty bool
83 Noctty bool
84 Ctty int
85
86
87
88
89
90 Foreground bool
91 Pgid int
92
93
94
95
96 Pdeathsig Signal
97 Cloneflags uintptr
98 Unshareflags uintptr
99 UidMappings []SysProcIDMap
100 GidMappings []SysProcIDMap
101
102
103
104
105 GidMappingsEnableSetgroups bool
106 AmbientCaps []uintptr
107 UseCgroupFD bool
108 CgroupFD int
109
110
111
112 PidFD *int
113 }
114
// NUL-terminated byte strings whose addresses are handed to the mount system
// call in forkAndExecInChild1.
var (
	// none is "none" followed by a terminating NUL (mount source argument).
	none = [5]byte{'n', 'o', 'n', 'e', 0}
	// slash is "/" followed by a terminating NUL (mount target argument).
	slash = [2]byte{'/', 0}
)

// forceClone3, when true, makes forkAndExecInChild1 take the clone3 path even
// if no cgroup descriptor or CLONE_NEWTIME was requested. It defaults to
// false and nothing in this file sets it.
var forceClone3 bool
121
122
// Fork hooks. They are declared here without bodies, so their implementations
// must be supplied from outside this file (presumably by the runtime package,
// judging by the names; confirm).
//
// Within this file: runtime_BeforeFork is called immediately before the clone
// system call, runtime_AfterFork is called by forkAndExecInChild whenever
// forkAndExecInChild1 reports locked == true, and runtime_AfterForkInChild is
// called on the child's path before unshare/chroot/credential handling.
123 func runtime_BeforeFork()
124 func runtime_AfterFork()
125 func runtime_AfterForkInChild()
126
127
128
129
130
131
132
133
134
135
136
137
138 func forkAndExecInChild(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *ProcAttr, sys *SysProcAttr, pipe int) (pid int, err Errno) {
139
140
141 upid, pidfd, err, mapPipe, locked := forkAndExecInChild1(argv0, argv, envv, chroot, dir, attr, sys, pipe)
142 if locked {
143 runtime_AfterFork()
144 }
145 if err != 0 {
146 return 0, err
147 }
148
149
150 pid = int(upid)
151 if sys.PidFD != nil {
152 *sys.PidFD = int(pidfd)
153 }
154
155 if sys.UidMappings != nil || sys.GidMappings != nil {
156 Close(mapPipe[0])
157 var err2 Errno
158
159
160 if sys.Unshareflags&CLONE_NEWUSER == 0 {
161 if err := writeUidGidMappings(pid, sys); err != nil {
162 err2 = err.(Errno)
163 }
164 }
165 RawSyscall(SYS_WRITE, uintptr(mapPipe[1]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2))
166 Close(mapPipe[1])
167 }
168
169 return pid, 0
170 }
171
// _LINUX_CAPABILITY_VERSION_3 is the value stored in capHeader.version before
// the capget/capset system calls issued by forkAndExecInChild1.
const _LINUX_CAPABILITY_VERSION_3 = 0x20080522

// Argument structures for the capget/capset system calls. They are passed to
// the kernel by address, so field order and widths must not change.
type (
	// capHeader is the first argument of capget/capset.
	capHeader struct {
		version uint32
		pid     int32
	}

	// capData holds one 32-bit slice of each capability set.
	capData struct {
		effective   uint32
		permitted   uint32
		inheritable uint32
	}

	// caps bundles a header with two capData words; capToIndex picks the
	// word and capToMask picks the bit inside it.
	caps struct {
		hdr  capHeader
		data [2]capData
	}
)
188
189
// capToIndex returns which 32-bit word of caps.data holds capability cap.
func capToIndex(cap uintptr) uintptr {
	const bitsPerWord = 32
	return cap / bitsPerWord
}
191
192
// capToMask returns the single-bit mask for capability cap inside the 32-bit
// word selected by capToIndex.
func capToMask(cap uintptr) uint32 {
	const bitsPerWord = 32
	return uint32(1) << (cap % bitsPerWord)
}
194
195
// cloneArgs is the argument block for the clone3 system call.
// forkAndExecInChild1 passes its address together with unsafe.Sizeof of the
// struct, so the field order and the all-uint64 layout are part of the
// contract with the kernel. Only flags, exitSignal, cgroup and pidFD are
// populated by this file; the rest stay zero.
type cloneArgs struct {
	flags      uint64 // clone flags, including bits above 31 such as CLONE_INTO_CGROUP
	pidFD      uint64 // address of the int32 that receives the pidfd
	childTID   uint64
	parentTID  uint64
	exitSignal uint64 // set to SIGCHLD by forkAndExecInChild1
	stack      uint64
	stackSize  uint64
	tls        uint64
	setTID     uint64
	setTIDSize uint64
	cgroup     uint64 // cgroup descriptor used with CLONE_INTO_CGROUP
}
209
210
211
212
213
214
215
216
217
218
219
220
// forkAndExecInChild1 prepares everything the child needs, issues the clone
// (or clone3) system call, and then runs the entire child side up to execve.
//
// In the parent, or when clone fails, it returns right after the system call:
// pid is the child's PID, pidfd is the descriptor produced for CLONE_PIDFD
// (-1 if none), err1 is the clone error, mapPipe is the uid/gid-mapping
// synchronisation pipe (only created when mappings are requested) and locked
// reports whether runtime_BeforeFork was called, in which case the caller
// must call runtime_AfterFork.
//
// In the child it never returns: it either replaces the process image with
// execve or writes err1 to pipe and exits with status 253.
//
// NOTE(review): compiler directives are not visible in this copy of the
// file; confirm against the canonical source before relying on their absence.
221 func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *ProcAttr, sys *SysProcAttr, pipe int) (pid uintptr, pidfd int32, err1 Errno, mapPipe [2]int, locked bool) {
222 // prctl option numbers used further down to raise ambient capabilities.
223 const (
224 PR_CAP_AMBIENT = 0x2f
225 PR_CAP_AMBIENT_RAISE = 0x2
226 )
227
228 // Every local the child path needs is declared in the block below, before
229 // the fork. After runtime_BeforeFork the code only assigns with "=" and
230 // never declares with ":=". This also keeps the "goto childerror" jumps
231 // legal, since Go does not allow a goto to skip over a variable
232 // declaration.
233 //
234 // NOTE(review): the up-front declarations presumably also keep the child
235 // from triggering allocations after the fork; confirm before relying on it.
236 //
237 // In the child, pid is reused as a scratch variable for system call
238 // results (read/write byte counts, getpid, getppid).
239
240
241
242 var (
243 err2 Errno
244 nextfd int
245 i int
246 caps caps
247 fd1, flags uintptr
248 puid, psetgroups, pgid []byte
249 uidmap, setgroups, gidmap []byte
250 clone3 *cloneArgs
251 pgrp int32
252 dirfd int
253 cred *Credential
254 ngroups, groups uintptr
255 c uintptr
256 )
257 pidfd = -1
258 // Load origRlimitNofile before forking; if non-nil the child hands it to rawSetrlimit just before exec.
259 rlim := origRlimitNofile.Load()
260 // Build the NUL-terminated /proc/self paths and the mapping payloads now so the child only passes pointers to raw system calls.
261 if sys.UidMappings != nil {
262 puid = []byte("/proc/self/uid_map\000")
263 uidmap = formatIDMappings(sys.UidMappings)
264 }
265
266 if sys.GidMappings != nil {
267 psetgroups = []byte("/proc/self/setgroups\000")
268 pgid = []byte("/proc/self/gid_map\000")
269
270 if sys.GidMappingsEnableSetgroups {
271 setgroups = []byte("allow\000")
272 } else {
273 setgroups = []byte("deny\000")
274 }
275 gidmap = formatIDMappings(sys.GidMappings)
276 }
277
278 // Record the parent's PID; the Pdeathsig handling in the child compares it with getppid.
279 ppid, _ := rawSyscallNoError(SYS_GETPID, 0, 0, 0)
280
281 // Copy attr.Files into fd and leave nextfd one above both len(attr.Files)
282 // and the largest descriptor in attr.Files, so it can serve as a scratch
283 // descriptor number during the shuffling passes below.
284 fd := make([]int, len(attr.Files))
285 nextfd = len(attr.Files)
286 for i, ufd := range attr.Files {
287 if nextfd < int(ufd) {
288 nextfd = int(ufd)
289 }
290 fd[i] = int(ufd)
291 }
292 nextfd++
293
294 // With ID mappings requested, create the pipe over which the parent
295 // reports (as an Errno) that the mappings were written or failed.
296 if sys.UidMappings != nil || sys.GidMappings != nil {
297 if err := forkExecPipe(mapPipe[:]); err != nil {
298 err1 = err.(Errno)
299 return
300 }
301 }
302 // Add CLONE_VFORK|CLONE_VM unless a new user namespace is requested through either flag set.
303 flags = sys.Cloneflags
304 if sys.Cloneflags&CLONE_NEWUSER == 0 && sys.Unshareflags&CLONE_NEWUSER == 0 {
305 flags |= CLONE_VFORK | CLONE_VM
306 }
307 if sys.PidFD != nil {
308 flags |= CLONE_PIDFD
309 }
310 // Use clone3 when a cgroup descriptor is given, CLONE_NEWTIME is requested, or forceClone3 is set.
311 if sys.UseCgroupFD || flags&CLONE_NEWTIME != 0 || forceClone3 {
312 clone3 = &cloneArgs{
313 flags: uint64(flags),
314 exitSignal: uint64(SIGCHLD),
315 }
316 if sys.UseCgroupFD {
317 clone3.flags |= CLONE_INTO_CGROUP
318 clone3.cgroup = uint64(sys.CgroupFD)
319 }
320 if sys.PidFD != nil {
321 clone3.pidFD = uint64(uintptr(unsafe.Pointer(&pidfd)))
322 }
323 }
324
325 // Fork. Setting locked tells the caller that runtime_BeforeFork ran and
326 // that it must call runtime_AfterFork once this function returns.
327 runtime_BeforeFork()
328 locked = true
329 if clone3 != nil {
330 pid, err1 = rawVforkSyscall(_SYS_clone3, uintptr(unsafe.Pointer(clone3)), unsafe.Sizeof(*clone3), 0)
331 } else {
332 // Plain clone: the exit signal is ORed into the flags word itself.
333 flags |= uintptr(SIGCHLD)
334 if runtime.GOARCH == "s390x" {
335 // On s390x the flags go in the second argument slot instead of the first.
336 pid, err1 = rawVforkSyscall(SYS_CLONE, 0, flags, uintptr(unsafe.Pointer(&pidfd)))
337 } else {
338 pid, err1 = rawVforkSyscall(SYS_CLONE, flags, 0, uintptr(unsafe.Pointer(&pidfd)))
339 }
340 }
341 if err1 != 0 || pid != 0 {
342 // Clone failed, or this is the parent (pid != 0): return to the
343 // caller, which handles all parent-side work. The named results
344 // carry pid, pidfd, err1, mapPipe and locked.
345
346
347
348 return
349 }
350
351 // Child (pid == 0) from here on; every failure jumps to childerror.
352
353 // Set PR_SET_KEEPCAPS when ambient capabilities will be raised later.
354 if len(sys.AmbientCaps) > 0 {
355 _, _, err1 = RawSyscall6(SYS_PRCTL, PR_SET_KEEPCAPS, 1, 0, 0, 0, 0)
356 if err1 != 0 {
357 goto childerror
358 }
359 }
360
361 // Wait for the parent: close the write end, then read exactly one Errno. A short read becomes EINVAL; a non-zero Errno from the parent becomes the child's error.
362 if sys.UidMappings != nil || sys.GidMappings != nil {
363 if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(mapPipe[1]), 0, 0); err1 != 0 {
364 goto childerror
365 }
366 pid, _, err1 = RawSyscall(SYS_READ, uintptr(mapPipe[0]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2))
367 if err1 != 0 {
368 goto childerror
369 }
370 if pid != unsafe.Sizeof(err2) {
371 err1 = EINVAL
372 goto childerror
373 }
374 if err2 != 0 {
375 err1 = err2
376 goto childerror
377 }
378 }
379
380 // Start a new session if requested.
381 if sys.Setsid {
382 _, _, err1 = RawSyscall(SYS_SETSID, 0, 0, 0)
383 if err1 != 0 {
384 goto childerror
385 }
386 }
387
388 // Set the process group; Foreground implies this step as well.
389 if sys.Setpgid || sys.Foreground {
390 // setpgid(0, Pgid): the first argument 0 refers to the calling process.
391 _, _, err1 = RawSyscall(SYS_SETPGID, 0, uintptr(sys.Pgid), 0)
392 if err1 != 0 {
393 goto childerror
394 }
395 }
396
397 if sys.Foreground {
398 pgrp = int32(sys.Pgid)
399 if pgrp == 0 {
400 pid, _ = rawSyscallNoError(SYS_GETPID, 0, 0, 0)
401 // Pgid == 0: use the child's own PID as the process group.
402 pgrp = int32(pid)
403 }
404
405 // Make pgrp the foreground process group of terminal Ctty. Descriptors have not been rearranged yet, so Ctty is a parent descriptor number here.
406 _, _, err1 = RawSyscall(SYS_IOCTL, uintptr(sys.Ctty), uintptr(TIOCSPGRP), uintptr(unsafe.Pointer(&pgrp)))
407 if err1 != 0 {
408 goto childerror
409 }
410 }
411
412 // Runtime hook for the child; it runs after the session, process-group
413 // and foreground setup above and before any namespace or credential work.
414 runtime_AfterForkInChild()
415
416 // Unshare namespaces if requested.
417 if sys.Unshareflags != 0 {
418 _, _, err1 = RawSyscall(SYS_UNSHARE, sys.Unshareflags, 0, 0)
419 if err1 != 0 {
420 goto childerror
421 }
422 // With the user namespace created by unshare, the child writes its own mappings via /proc/self: setgroups, then gid_map, then uid_map. NOTE(review): &gidmap[0] and &uidmap[0] assume a non-nil mapping slice is also non-empty; confirm callers guarantee this.
423 if sys.Unshareflags&CLONE_NEWUSER != 0 && sys.GidMappings != nil {
424 dirfd = int(_AT_FDCWD)
425 if fd1, _, err1 = RawSyscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&psetgroups[0])), uintptr(O_WRONLY), 0, 0, 0); err1 != 0 {
426 goto childerror
427 }
428 pid, _, err1 = RawSyscall(SYS_WRITE, fd1, uintptr(unsafe.Pointer(&setgroups[0])), uintptr(len(setgroups)))
429 if err1 != 0 {
430 goto childerror
431 }
432 if _, _, err1 = RawSyscall(SYS_CLOSE, fd1, 0, 0); err1 != 0 {
433 goto childerror
434 }
435
436 if fd1, _, err1 = RawSyscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&pgid[0])), uintptr(O_WRONLY), 0, 0, 0); err1 != 0 {
437 goto childerror
438 }
439 pid, _, err1 = RawSyscall(SYS_WRITE, fd1, uintptr(unsafe.Pointer(&gidmap[0])), uintptr(len(gidmap)))
440 if err1 != 0 {
441 goto childerror
442 }
443 if _, _, err1 = RawSyscall(SYS_CLOSE, fd1, 0, 0); err1 != 0 {
444 goto childerror
445 }
446 }
447
448 if sys.Unshareflags&CLONE_NEWUSER != 0 && sys.UidMappings != nil {
449 dirfd = int(_AT_FDCWD)
450 if fd1, _, err1 = RawSyscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&puid[0])), uintptr(O_WRONLY), 0, 0, 0); err1 != 0 {
451 goto childerror
452 }
453 pid, _, err1 = RawSyscall(SYS_WRITE, fd1, uintptr(unsafe.Pointer(&uidmap[0])), uintptr(len(uidmap)))
454 if err1 != 0 {
455 goto childerror
456 }
457 if _, _, err1 = RawSyscall(SYS_CLOSE, fd1, 0, 0); err1 != 0 {
458 goto childerror
459 }
460 }
461
462 // After unsharing the mount namespace, remount "/" (source "none")
463 // with MS_REC|MS_PRIVATE.
464 // NOTE(review): presumably this stops mount events from propagating
465 // between the new namespace and the original one when "/" was
466 // mounted shared; confirm against mount(2) and unshare(2).
467
468
469 if sys.Unshareflags&CLONE_NEWNS == CLONE_NEWNS {
470 _, _, err1 = RawSyscall6(SYS_MOUNT, uintptr(unsafe.Pointer(&none[0])), uintptr(unsafe.Pointer(&slash[0])), 0, MS_REC|MS_PRIVATE, 0, 0)
471 if err1 != 0 {
472 goto childerror
473 }
474 }
475 }
476
477 // Change root directory if requested.
478 if chroot != nil {
479 _, _, err1 = RawSyscall(SYS_CHROOT, uintptr(unsafe.Pointer(chroot)), 0, 0)
480 if err1 != 0 {
481 goto childerror
482 }
483 }
484
485 // Apply credentials: setgroups (skipped when NoSetGroups is set, or when gid mappings are present with setgroups denied and no groups given), then setgid, then setuid.
486 if cred = sys.Credential; cred != nil {
487 ngroups = uintptr(len(cred.Groups))
488 groups = uintptr(0)
489 if ngroups > 0 {
490 groups = uintptr(unsafe.Pointer(&cred.Groups[0]))
491 }
492 if !(sys.GidMappings != nil && !sys.GidMappingsEnableSetgroups && ngroups == 0) && !cred.NoSetGroups {
493 _, _, err1 = RawSyscall(_SYS_setgroups, ngroups, groups, 0)
494 if err1 != 0 {
495 goto childerror
496 }
497 }
498 _, _, err1 = RawSyscall(sys_SETGID, uintptr(cred.Gid), 0, 0)
499 if err1 != 0 {
500 goto childerror
501 }
502 _, _, err1 = RawSyscall(sys_SETUID, uintptr(cred.Uid), 0, 0)
503 if err1 != 0 {
504 goto childerror
505 }
506 }
507
508 if len(sys.AmbientCaps) != 0 {
509 // Read the current capability sets with capget, using the
510 // version-3 header.
511 caps.hdr.version = _LINUX_CAPABILITY_VERSION_3
512
513 if _, _, err1 = RawSyscall(SYS_CAPGET, uintptr(unsafe.Pointer(&caps.hdr)), uintptr(unsafe.Pointer(&caps.data[0])), 0); err1 != 0 {
514 goto childerror
515 }
516
517 for _, c = range sys.AmbientCaps {
518 // Add c to the permitted and inheritable masks before the
519 // ambient raise below.
520 caps.data[capToIndex(c)].permitted |= capToMask(c)
521 caps.data[capToIndex(c)].inheritable |= capToMask(c)
522 }
523
524 if _, _, err1 = RawSyscall(SYS_CAPSET, uintptr(unsafe.Pointer(&caps.hdr)), uintptr(unsafe.Pointer(&caps.data[0])), 0); err1 != 0 {
525 goto childerror
526 }
527 // Raise each requested capability in the ambient set.
528 for _, c = range sys.AmbientCaps {
529 _, _, err1 = RawSyscall6(SYS_PRCTL, PR_CAP_AMBIENT, uintptr(PR_CAP_AMBIENT_RAISE), c, 0, 0, 0)
530 if err1 != 0 {
531 goto childerror
532 }
533 }
534 }
535
536 // Change working directory if requested.
537 if dir != nil {
538 _, _, err1 = RawSyscall(SYS_CHDIR, uintptr(unsafe.Pointer(dir)), 0, 0)
539 if err1 != 0 {
540 goto childerror
541 }
542 }
543
544 // Install the parent-death signal.
545 if sys.Pdeathsig != 0 {
546 _, _, err1 = RawSyscall6(SYS_PRCTL, PR_SET_PDEATHSIG, uintptr(sys.Pdeathsig), 0, 0, 0, 0)
547 if err1 != 0 {
548 goto childerror
549 }
550
551 // If getppid no longer matches the PID recorded before the fork,
552 // the parent changed before the prctl took effect, so the child
553 // sends Pdeathsig to itself.
554 pid, _ = rawSyscallNoError(SYS_GETPPID, 0, 0, 0)
555 if pid != ppid {
556 pid, _ = rawSyscallNoError(SYS_GETPID, 0, 0, 0)
557 _, _, err1 = RawSyscall(SYS_KILL, pid, uintptr(sys.Pdeathsig), 0)
558 if err1 != 0 {
559 goto childerror
560 }
561 }
562 }
563
564 // Pass 1: move the error pipe, and every fd[i] that is lower than its
565 // target slot i, up to fresh close-on-exec descriptors at nextfd and above, so pass 2 cannot overwrite a descriptor it still needs.
566 if pipe < nextfd {
567 _, _, err1 = RawSyscall(SYS_DUP3, uintptr(pipe), uintptr(nextfd), O_CLOEXEC)
568 if err1 != 0 {
569 goto childerror
570 }
571 pipe = nextfd
572 nextfd++
573 }
574 for i = 0; i < len(fd); i++ {
575 if fd[i] >= 0 && fd[i] < i {
576 if nextfd == pipe {
577 nextfd++
578 }
579 _, _, err1 = RawSyscall(SYS_DUP3, uintptr(fd[i]), uintptr(nextfd), O_CLOEXEC)
580 if err1 != 0 {
581 goto childerror
582 }
583 fd[i] = nextfd
584 nextfd++
585 }
586 }
587
588 // Pass 2: install fd[i] at descriptor i; an entry of -1 closes descriptor i.
589 for i = 0; i < len(fd); i++ {
590 if fd[i] == -1 {
591 RawSyscall(SYS_CLOSE, uintptr(i), 0, 0)
592 continue
593 }
594 if fd[i] == i {
595 // Already in place: no dup is performed, so clear the
596 // descriptor flags (including close-on-exec) with F_SETFD 0.
597 _, _, err1 = RawSyscall(fcntl64Syscall, uintptr(fd[i]), F_SETFD, 0)
598 if err1 != 0 {
599 goto childerror
600 }
601 continue
602 }
603
604 // dup3 with flags 0 creates descriptor i without close-on-exec.
605 _, _, err1 = RawSyscall(SYS_DUP3, uintptr(fd[i]), uintptr(i), 0)
606 if err1 != 0 {
607 goto childerror
608 }
609 }
610
611 // Close any of descriptors 0, 1 and 2 that attr.Files did not cover;
612 // the result of close is deliberately ignored.
613
614
615 for i = len(fd); i < 3; i++ {
616 RawSyscall(SYS_CLOSE, uintptr(i), 0, 0)
617 }
618
619 // Issue TIOCNOTTY on descriptor 0 if requested.
620 if sys.Noctty {
621 _, _, err1 = RawSyscall(SYS_IOCTL, 0, uintptr(TIOCNOTTY), 0)
622 if err1 != 0 {
623 goto childerror
624 }
625 }
626
627 // Set the controlling terminal to Ctty; descriptors are already rearranged, so Ctty is a child descriptor number here.
628 if sys.Setctty {
629 _, _, err1 = RawSyscall(SYS_IOCTL, uintptr(sys.Ctty), uintptr(TIOCSCTTY), 1)
630 if err1 != 0 {
631 goto childerror
632 }
633 }
634
635 // Apply the RLIMIT_NOFILE value loaded before the fork; the result is not checked.
636 if rlim != nil {
637 rawSetrlimit(RLIMIT_NOFILE, rlim)
638 }
639
640 // Request tracing as the last step before exec, so that none of the
641 // setup above runs under the tracer.
642
643 if sys.Ptrace {
644 _, _, err1 = RawSyscall(SYS_PTRACE, uintptr(PTRACE_TRACEME), 0, 0)
645 if err1 != 0 {
646 goto childerror
647 }
648 }
649
650 // Exec the new program; on success this call does not return.
651 _, _, err1 = RawSyscall(SYS_EXECVE,
652 uintptr(unsafe.Pointer(argv0)),
653 uintptr(unsafe.Pointer(&argv[0])),
654 uintptr(unsafe.Pointer(&envv[0])))
655
656 childerror:
657 // Report err1 to the parent over pipe, then exit with status 253; the loop guards against exit ever returning.
658 RawSyscall(SYS_WRITE, uintptr(pipe), uintptr(unsafe.Pointer(&err1)), unsafe.Sizeof(err1))
659 for {
660 RawSyscall(SYS_EXIT, 253, 0, 0)
661 }
662 }
663
664 func formatIDMappings(idMap []SysProcIDMap) []byte {
665 var data []byte
666 for _, im := range idMap {
667 data = append(data, itoa.Itoa(im.ContainerID)+" "+itoa.Itoa(im.HostID)+" "+itoa.Itoa(im.Size)+"\n"...)
668 }
669 return data
670 }
671
672
673 func writeIDMappings(path string, idMap []SysProcIDMap) error {
674 fd, err := Open(path, O_RDWR, 0)
675 if err != nil {
676 return err
677 }
678
679 if _, err := Write(fd, formatIDMappings(idMap)); err != nil {
680 Close(fd)
681 return err
682 }
683
684 if err := Close(fd); err != nil {
685 return err
686 }
687
688 return nil
689 }
690
691
692
693
694
695 func writeSetgroups(pid int, enable bool) error {
696 sgf := "/proc/" + itoa.Itoa(pid) + "/setgroups"
697 fd, err := Open(sgf, O_RDWR, 0)
698 if err != nil {
699 return err
700 }
701
702 var data []byte
703 if enable {
704 data = []byte("allow")
705 } else {
706 data = []byte("deny")
707 }
708
709 if _, err := Write(fd, data); err != nil {
710 Close(fd)
711 return err
712 }
713
714 return Close(fd)
715 }
716
717
718
719 func writeUidGidMappings(pid int, sys *SysProcAttr) error {
720 if sys.UidMappings != nil {
721 uidf := "/proc/" + itoa.Itoa(pid) + "/uid_map"
722 if err := writeIDMappings(uidf, sys.UidMappings); err != nil {
723 return err
724 }
725 }
726
727 if sys.GidMappings != nil {
728
729 if err := writeSetgroups(pid, sys.GidMappingsEnableSetgroups); err != nil && err != ENOENT {
730 return err
731 }
732 gidf := "/proc/" + itoa.Itoa(pid) + "/gid_map"
733 if err := writeIDMappings(gidf, sys.GidMappings); err != nil {
734 return err
735 }
736 }
737
738 return nil
739 }
740
741
742 func forkAndExecFailureCleanup(attr *ProcAttr, sys *SysProcAttr) {
743 if sys.PidFD != nil && *sys.PidFD != -1 {
744 Close(*sys.PidFD)
745 *sys.PidFD = -1
746 }
747 }
748
749
750
751
752
753 func os_checkClonePidfd() error {
754 pidfd := int32(-1)
755 pid, errno := doCheckClonePidfd(&pidfd)
756 if errno != 0 {
757 return errno
758 }
759
760 if pidfd == -1 {
761
762
763
764 var err error
765 for {
766 var status WaitStatus
767 _, err = Wait4(int(pid), &status, 0, nil)
768 if err != EINTR {
769 break
770 }
771 }
772 if err != nil {
773 return err
774 }
775
776 return errpkg.New("clone(CLONE_PIDFD) failed to return pidfd")
777 }
778
779
780
781 defer Close(int(pidfd))
782
783 for {
784 const _P_PIDFD = 3
785 _, _, errno = Syscall6(SYS_WAITID, _P_PIDFD, uintptr(pidfd), 0, WEXITED, 0, 0)
786 if errno != EINTR {
787 break
788 }
789 }
790 if errno != 0 {
791 return errno
792 }
793
794 return nil
795 }
796
797
798
799
800
801
802
803
804
// doCheckClonePidfd performs the clone call for os_checkClonePidfd, using
// CLONE_VFORK|CLONE_VM|CLONE_PIDFD with SIGCHLD as the exit signal and pidfd
// as the location that receives the new descriptor.
//
// In the parent, or when clone fails, it returns the child's PID and the
// clone error. In the child it never returns: it calls exit_group(0) in a
// loop.
//
// NOTE(review): this is kept as a separate function from os_checkClonePidfd,
// presumably so the vfork child runs in its own stack frame; any inlining
// directive that enforces this is not visible in this copy; confirm.
805 func doCheckClonePidfd(pidfd *int32) (pid uintptr, errno Errno) {
806 flags := uintptr(CLONE_VFORK|CLONE_VM|CLONE_PIDFD|SIGCHLD)
807 if runtime.GOARCH == "s390x" {
808 // On s390x the flags go in the second argument slot instead of the first.
809 pid, errno = rawVforkSyscall(SYS_CLONE, 0, flags, uintptr(unsafe.Pointer(pidfd)))
810 } else {
811 pid, errno = rawVforkSyscall(SYS_CLONE, flags, 0, uintptr(unsafe.Pointer(pidfd)))
812 }
813 if errno != 0 || pid != 0 {
814 // Clone failed, or this is the parent (pid != 0): hand pid and
815 // errno back through the named results.
816
817
818
819
820 return
821 }
822 // Child: exit immediately with status 0; the loop guards against exit_group ever returning.
823 for {
824 RawSyscall(SYS_EXIT_GROUP, 0, 0, 0)
825 }
826 }
827
View as plain text