Source file
src/syscall/exec_linux.go
1
2
3
4
5
6
7 package syscall
8
9 import (
10 errpkg "errors"
11 "internal/itoa"
12 "runtime"
13 "unsafe"
14 )
15
16
17
// Flag bits for the clone, clone3 and unshare system calls. In this file
// they reach the kernel through SysProcAttr.Cloneflags (clone/clone3) and
// SysProcAttr.Unshareflags (unshare).
// NOTE(review): the values presumably mirror the kernel's linux/sched.h —
// confirm against the kernel headers before changing any of them.
const (
	CLONE_VM             = 0x00000100
	CLONE_FS             = 0x00000200
	CLONE_FILES          = 0x00000400
	CLONE_SIGHAND        = 0x00000800
	CLONE_PIDFD          = 0x00001000
	CLONE_PTRACE         = 0x00002000
	CLONE_VFORK          = 0x00004000
	CLONE_PARENT         = 0x00008000
	CLONE_THREAD         = 0x00010000
	CLONE_NEWNS          = 0x00020000
	CLONE_SYSVSEM        = 0x00040000
	CLONE_SETTLS         = 0x00080000
	CLONE_PARENT_SETTID  = 0x00100000
	CLONE_CHILD_CLEARTID = 0x00200000
	CLONE_DETACHED       = 0x00400000
	CLONE_UNTRACED       = 0x00800000
	CLONE_CHILD_SETTID   = 0x01000000
	CLONE_NEWCGROUP      = 0x02000000
	CLONE_NEWUTS         = 0x04000000
	CLONE_NEWIPC         = 0x08000000
	CLONE_NEWUSER        = 0x10000000
	CLONE_NEWPID         = 0x20000000
	CLONE_NEWNET         = 0x40000000
	CLONE_IO             = 0x80000000

	// The next two flags need more than 32 bits. In this file
	// CLONE_INTO_CGROUP is only ever OR'ed into cloneArgs.flags (a uint64)
	// on the clone3 path; CLONE_CLEAR_SIGHAND is not referenced here.
	CLONE_CLEAR_SIGHAND = 0x100000000
	CLONE_INTO_CGROUP   = 0x200000000

	// CLONE_NEWTIME sits in the low bits of the flag word. The legacy clone
	// path in forkAndExecInChild1 ORs SIGCHLD into that same word, and that
	// function switches to clone3 whenever this flag is requested.
	// NOTE(review): presumably the two would collide on the legacy path —
	// confirm against clone(2).
	CLONE_NEWTIME = 0x00000080
)
54
55
56
57
58
59
60
// SysProcIDMap describes one ID-mapping entry. formatIDMappings renders each
// entry as the line "ContainerID HostID Size\n", and the resulting text is
// written to a process's uid_map or gid_map file under /proc.
type SysProcIDMap struct {
	// ContainerID is the first number on the rendered mapping line.
	ContainerID int
	// HostID is the second number on the rendered mapping line.
	HostID int
	// Size is the third number on the rendered mapping line.
	// NOTE(review): presumably the length of the mapped ID range — confirm
	// against user_namespaces(7).
	Size int
}
66
// SysProcAttr carries the Linux-specific options that forkAndExecInChild and
// forkAndExecInChild1 apply when creating a child process.
type SysProcAttr struct {
	// Chroot is a directory path.
	// NOTE(review): presumably converted by the caller into the chroot
	// argument that the child passes to the chroot syscall — the conversion
	// is not visible in this file.
	Chroot string
	// Credential, if non-nil, makes the child call setgroups (unless
	// suppressed, see forkAndExecInChild1), then setgid and setuid with the
	// values it holds.
	Credential *Credential

	// Ptrace makes the child call ptrace(PTRACE_TRACEME) immediately before
	// execve.
	Ptrace bool
	// Setsid makes the child call setsid.
	Setsid bool

	// Setpgid makes the child call setpgid(0, Pgid).
	Setpgid bool

	// Setctty makes the child issue the TIOCSCTTY ioctl on descriptor Ctty.
	// The ioctl runs after the child's descriptors have been rearranged, so
	// Ctty is interpreted in the child's final descriptor numbering.
	Setctty bool
	// Noctty makes the child issue the TIOCNOTTY ioctl on descriptor 0.
	Noctty bool
	// Ctty is the descriptor used by Setctty and Foreground.
	Ctty int

	// Foreground makes the child call setpgid(0, Pgid) and then issue the
	// TIOCSPGRP ioctl on descriptor Ctty, with Pgid (or the child's own pid
	// when Pgid is 0) as the process group. This runs before the child's
	// descriptors are rearranged, so Ctty is interpreted in the descriptor
	// numbering inherited from the parent.
	Foreground bool
	// Pgid is the process group ID used by Setpgid and Foreground.
	Pgid int

	// Pdeathsig, if non-zero, is installed in the child with
	// prctl(PR_SET_PDEATHSIG). If the child's parent pid no longer matches
	// the pid recorded before the fork, the child sends itself this signal.
	Pdeathsig Signal
	// Cloneflags is the starting flag set for the clone or clone3 call.
	Cloneflags uintptr
	// Unshareflags, if non-zero, is passed to the unshare syscall in the
	// child.
	Unshareflags uintptr
	// UidMappings, if non-nil, is formatted and written to the child's
	// uid_map file.
	UidMappings []SysProcIDMap
	// GidMappings, if non-nil, is formatted and written to the child's
	// gid_map file, after the setgroups file has been written.
	GidMappings []SysProcIDMap

	// GidMappingsEnableSetgroups selects what is written to the child's
	// setgroups file when GidMappings is non-nil: "allow" when true, "deny"
	// when false. When false and Credential has no Groups, the child also
	// skips its setgroups call.
	GidMappingsEnableSetgroups bool
	// AmbientCaps lists capability numbers. The child adds each to its
	// permitted and inheritable sets and then raises it with
	// prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE).
	AmbientCaps []uintptr
	// UseCgroupFD makes the child be created through clone3 with
	// CLONE_INTO_CGROUP set and CgroupFD as the cgroup argument.
	UseCgroupFD bool
	// CgroupFD is the descriptor stored in cloneArgs.cgroup when UseCgroupFD
	// is true.
	CgroupFD int

	// PidFD, if non-nil, adds CLONE_PIDFD to the clone flags. After
	// forkAndExecInChild1 returns without error, forkAndExecInChild stores
	// the reported pidfd value in *PidFD; that value stays -1 if the kernel
	// wrote nothing. forkAndExecFailureCleanup closes it and resets it to -1.
	PidFD *int
}
114
var (
	// none and slash are the NUL-terminated byte strings "none" and "/".
	// forkAndExecInChild1 passes their addresses as the source and target
	// arguments of the mount syscall.
	none  = [...]byte{'n', 'o', 'n', 'e', 0}
	slash = [...]byte{'/', 0}

	// forceClone3 makes forkAndExecInChild1 take the clone3 path even when
	// neither UseCgroupFD nor CLONE_NEWTIME requires it.
	// NOTE(review): nothing in this file sets it; presumably a test hook —
	// confirm.
	forceClone3 = false
)
121
122
// The three functions below are declared without a Go body in this file.
// NOTE(review): presumably they are provided by the runtime package — confirm
// where they are implemented.

// runtime_BeforeFork is called by forkAndExecInChild1 immediately before the
// clone/clone3 call.
func runtime_BeforeFork()

// runtime_AfterFork is called by forkAndExecInChild, in the parent, whenever
// forkAndExecInChild1 reports that runtime_BeforeFork was called.
func runtime_AfterFork()

// runtime_AfterForkInChild is called by forkAndExecInChild1, in the child,
// after the session, process-group and foreground setup.
func runtime_AfterForkInChild()
126
127
128
129
130
131
132
133
134
135
136
137
// forkAndExecInChild creates a child process that execs argv0 with argv and
// envv. The fork and all child-side work are delegated to
// forkAndExecInChild1; this function performs the parent-side follow-up.
//
// It returns (0, err) when forkAndExecInChild1 reports a non-zero Errno, and
// (pid, 0) otherwise. A failure to write the uid/gid mappings is not returned
// here: it is sent to the child over the mapping pipe, and the child then
// reports it on pipe.
//
// NOTE(review): this code runs in the parent right after a clone that may
// share memory with the child (CLONE_VFORK|CLONE_VM); presumably its shape is
// deliberate — confirm before restructuring.
func forkAndExecInChild(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *ProcAttr, sys *SysProcAttr, pipe int) (pid int, err Errno) {
	// Fork. This returns in the parent (or on error); on the child path
	// forkAndExecInChild1 ends in execve or exit and does not return.
	upid, pidfd, err, mapPipe, locked := forkAndExecInChild1(argv0, argv, envv, chroot, dir, attr, sys, pipe)
	if locked {
		// runtime_BeforeFork was called; pair it with runtime_AfterFork.
		runtime_AfterFork()
	}
	if err != 0 {
		return 0, err
	}

	// Parent: publish the pid and, if requested, the pidfd.
	pid = int(upid)
	if sys.PidFD != nil {
		*sys.PidFD = int(pidfd)
	}

	if sys.UidMappings != nil || sys.GidMappings != nil {
		// The parent only writes to the mapping pipe; drop its read end.
		Close(mapPipe[0])
		var err2 Errno
		// When CLONE_NEWUSER is in Unshareflags the child writes its own
		// mappings after unshare, so the parent only writes them otherwise.
		if sys.Unshareflags&CLONE_NEWUSER == 0 {
			if err := writeUidGidMappings(pid, sys); err != nil {
				err2 = err.(Errno)
			}
		}
		// Always send err2 (zero on success): the child blocks reading it
		// before it continues.
		RawSyscall(SYS_WRITE, uintptr(mapPipe[1]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2))
		Close(mapPipe[1])
	}

	return pid, 0
}
171
// _LINUX_CAPABILITY_VERSION_3 is the value forkAndExecInChild1 stores in
// capHeader.version before calling capget and capset.
const _LINUX_CAPABILITY_VERSION_3 = 0x20080522
173
// capHeader is the header argument passed by address to the capget and
// capset syscalls.
type capHeader struct {
	// version is set to _LINUX_CAPABILITY_VERSION_3 by forkAndExecInChild1.
	version uint32
	// pid is left at zero by the code in this file.
	// NOTE(review): presumably zero selects the calling process — confirm
	// against capget(2).
	pid int32
}
178
// capData is one 32-bit word of each capability set, as read by capget and
// written by capset. forkAndExecInChild1 modifies only permitted and
// inheritable.
type capData struct {
	effective   uint32
	permitted   uint32
	inheritable uint32
}
// caps bundles the header and the two capData words whose addresses
// forkAndExecInChild1 passes to capget and capset. data is indexed by
// capToIndex, so it covers capability numbers 0 through 63.
type caps struct {
	hdr  capHeader
	data [2]capData
}
188
189
// capToIndex returns the index of the 32-bit capability word that holds
// capability number cap, i.e. cap divided by 32.
func capToIndex(cap uintptr) uintptr {
	const wordShift = 5 // log2(32): each capability word holds 32 bits
	word := cap >> wordShift
	return word
}
191
192
// capToMask returns the single-bit mask for capability number cap within its
// 32-bit capability word, i.e. bit (cap mod 32).
func capToMask(cap uintptr) uint32 {
	const bitMask = 31 // low five bits select the bit inside the word
	bit := uint(cap & bitMask)
	return uint32(1) << bit
}
194
195
// cloneArgs is the argument block for the clone3 syscall.
// forkAndExecInChild1 passes its address together with unsafe.Sizeof of the
// struct, so the field order and sizes are part of the syscall contract.
// NOTE(review): the layout presumably mirrors the kernel's struct clone_args
// — confirm against clone3(2) before changing it.
//
// Only flags, pidFD, exitSignal and cgroup are set by the code in this file;
// the remaining fields are left at zero.
type cloneArgs struct {
	flags      uint64 // clone flag bits (Cloneflags plus any flags added by forkAndExecInChild1)
	pidFD      uint64 // address of the int32 that receives the pidfd, when SysProcAttr.PidFD is set
	childTID   uint64
	parentTID  uint64
	exitSignal uint64 // set to SIGCHLD
	stack      uint64
	stackSize  uint64
	tls        uint64
	setTID     uint64
	setTIDSize uint64
	cgroup     uint64 // SysProcAttr.CgroupFD, when UseCgroupFD is set
}
209
210
211
212
213
214
215
216
217
218
219
220
// forkAndExecInChild1 prepares everything the child will need, performs the
// clone (or clone3) call, and then:
//
//   - in the parent, or if the clone fails, returns straight away with the
//     child's pid, the pidfd slot (-1 unless the kernel filled it in), the
//     clone Errno, the uid/gid mapping pipe, and locked == true once
//     runtime_BeforeFork has been called;
//   - in the child, applies the settings from sys and attr and calls execve.
//     If any step fails, the child writes the Errno to pipe and exits with
//     status 253. The child path never returns.
//
// NOTE(review): once the clone has succeeded, the child code uses only raw
// syscalls and variables declared up front. Presumably this is because
// allocating, taking locks or calling ordinary Go functions is unsafe in the
// forked child (which may share memory with the parent under
// CLONE_VFORK|CLONE_VM) — confirm before adding anything to the child path.
func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *ProcAttr, sys *SysProcAttr, pipe int) (pid uintptr, pidfd int32, err1 Errno, mapPipe [2]int, locked bool) {
	// prctl options used below to raise ambient capabilities.
	const (
		PR_CAP_AMBIENT       = 0x2f
		PR_CAP_AMBIENT_RAISE = 0x2
	)

	// Every variable the child needs is declared here, before the fork;
	// the child path below introduces no new variables.
	var (
		err2                      Errno
		nextfd                    int
		i                         int
		caps                      caps
		fd1, flags                uintptr
		puid, psetgroups, pgid    []byte
		uidmap, setgroups, gidmap []byte
		clone3                    *cloneArgs
		pgrp                      int32
		dirfd                     int
		cred                      *Credential
		ngroups, groups           uintptr
		c                         uintptr
		rlim                      *Rlimit
		lim                       Rlimit
	)
	// -1 marks "no pidfd"; the kernel overwrites it when CLONE_PIDFD works.
	pidfd = -1

	// Saved RLIMIT_NOFILE value (may be nil); restored in the child below.
	rlim = origRlimitNofile.Load()

	// Pre-build the NUL-terminated paths and file contents the child writes
	// when it sets up its own user-namespace mappings after unshare.
	// NOTE(review): if a mapping slice is non-nil but empty, the matching
	// uidmap/gidmap stays empty and the &uidmap[0] / &gidmap[0] expressions in
	// the child would index out of range — confirm callers never pass an
	// empty non-nil slice together with CLONE_NEWUSER in Unshareflags.
	if sys.UidMappings != nil {
		puid = []byte("/proc/self/uid_map\000")
		uidmap = formatIDMappings(sys.UidMappings)
	}

	if sys.GidMappings != nil {
		psetgroups = []byte("/proc/self/setgroups\000")
		pgid = []byte("/proc/self/gid_map\000")

		if sys.GidMappingsEnableSetgroups {
			setgroups = []byte("allow\000")
		} else {
			setgroups = []byte("deny\000")
		}
		gidmap = formatIDMappings(sys.GidMappings)
	}

	// Record the parent's pid so the child can detect, in the Pdeathsig
	// step, that its parent has changed.
	ppid, _ := rawSyscallNoError(SYS_GETPID, 0, 0, 0)

	// Copy the requested descriptors into fd and make nextfd larger than
	// both len(attr.Files) and every descriptor listed in it, so scratch
	// descriptors allocated from nextfd cannot collide with any of those.
	fd := make([]int, len(attr.Files))
	nextfd = len(attr.Files)
	for i, ufd := range attr.Files {
		if nextfd < int(ufd) {
			nextfd = int(ufd)
		}
		fd[i] = int(ufd)
	}
	nextfd++

	// Create the pipe over which the parent reports to the child whether
	// writing the uid/gid mappings succeeded.
	if sys.UidMappings != nil || sys.GidMappings != nil {
		if err := forkExecPipe(mapPipe[:]); err != nil {
			err1 = err.(Errno)
			return
		}
	}

	// Assemble the clone flags. CLONE_VFORK|CLONE_VM is added only when no
	// new user namespace is requested via Cloneflags or Unshareflags.
	flags = sys.Cloneflags
	if sys.Cloneflags&CLONE_NEWUSER == 0 && sys.Unshareflags&CLONE_NEWUSER == 0 {
		flags |= CLONE_VFORK | CLONE_VM
	}
	if sys.PidFD != nil {
		flags |= CLONE_PIDFD
	}
	// Decide whether to use clone3: needed for a cgroup fd and for
	// CLONE_NEWTIME, or forced by forceClone3.
	if sys.UseCgroupFD || flags&CLONE_NEWTIME != 0 || forceClone3 {
		clone3 = &cloneArgs{
			flags:      uint64(flags),
			exitSignal: uint64(SIGCHLD),
		}
		if sys.UseCgroupFD {
			clone3.flags |= CLONE_INTO_CGROUP
			clone3.cgroup = uint64(sys.CgroupFD)
		}
		if sys.PidFD != nil {
			clone3.pidFD = uint64(uintptr(unsafe.Pointer(&pidfd)))
		}
	}

	// About to fork. locked tells the caller that runtime_BeforeFork ran and
	// must be paired with runtime_AfterFork in the parent.
	runtime_BeforeFork()
	locked = true
	if clone3 != nil {
		pid, err1 = rawVforkSyscall(_SYS_clone3, uintptr(unsafe.Pointer(clone3)), unsafe.Sizeof(*clone3), 0)
	} else {
		// Legacy clone: the exit signal is OR'ed into the flag word.
		// doCheckClonePidfd issues the same call shape; keep the two in sync.
		flags |= uintptr(SIGCHLD)
		if runtime.GOARCH == "s390x" {
			// On s390x the flags go in the second argument slot.
			pid, err1 = rawVforkSyscall(SYS_CLONE, 0, flags, uintptr(unsafe.Pointer(&pidfd)))
		} else {
			pid, err1 = rawVforkSyscall(SYS_CLONE, flags, 0, uintptr(unsafe.Pointer(&pidfd)))
		}
	}
	if err1 != 0 || pid != 0 {
		// Clone failed, or we are the parent (pid is the child's pid).
		return
	}

	// From here on we are in the child (clone returned 0).

	// Ask to keep capabilities across the later setuid so that ambient
	// capabilities can still be raised afterwards.
	if len(sys.AmbientCaps) > 0 {
		_, _, err1 = RawSyscall6(SYS_PRCTL, PR_SET_KEEPCAPS, 1, 0, 0, 0, 0)
		if err1 != 0 {
			goto childerror
		}
	}

	// Wait for the parent to finish with the uid/gid mappings: close our
	// copy of the write end, then read the parent's Errno from the pipe.
	if sys.UidMappings != nil || sys.GidMappings != nil {
		if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(mapPipe[1]), 0, 0); err1 != 0 {
			goto childerror
		}
		// pid is reused here as scratch for the byte count read.
		pid, _, err1 = RawSyscall(SYS_READ, uintptr(mapPipe[0]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2))
		if err1 != 0 {
			goto childerror
		}
		if pid != unsafe.Sizeof(err2) {
			// Short read: treat as an invalid report.
			err1 = EINVAL
			goto childerror
		}
		if err2 != 0 {
			// The parent failed to write the mappings; propagate its error.
			err1 = err2
			goto childerror
		}
	}

	// New session.
	if sys.Setsid {
		_, _, err1 = RawSyscall(SYS_SETSID, 0, 0, 0)
		if err1 != 0 {
			goto childerror
		}
	}

	// Process group (Foreground also triggers this step).
	if sys.Setpgid || sys.Foreground {
		// Put the child into process group Pgid.
		_, _, err1 = RawSyscall(SYS_SETPGID, 0, uintptr(sys.Pgid), 0)
		if err1 != 0 {
			goto childerror
		}
	}

	if sys.Foreground {
		pgrp = int32(sys.Pgid)
		if pgrp == 0 {
			// Pgid 0: use the child's own pid as the process group ID.
			pid, _ = rawSyscallNoError(SYS_GETPID, 0, 0, 0)

			pgrp = int32(pid)
		}

		// Make that process group the foreground group of the terminal on
		// descriptor Ctty (still in the inherited descriptor numbering).
		_, _, err1 = RawSyscall(SYS_IOCTL, uintptr(sys.Ctty), uintptr(TIOCSPGRP), uintptr(unsafe.Pointer(&pgrp)))
		if err1 != 0 {
			goto childerror
		}
	}

	// Child-side counterpart of runtime_BeforeFork; deliberately placed
	// after the TIOCSPGRP ioctl above.
	// NOTE(review): presumably this restores the signal mask and the ordering
	// avoids a terminal-related signal during TIOCSPGRP — confirm in the
	// runtime.
	runtime_AfterForkInChild()

	// Unshare namespaces.
	if sys.Unshareflags != 0 {
		_, _, err1 = RawSyscall(SYS_UNSHARE, sys.Unshareflags, 0, 0)
		if err1 != 0 {
			goto childerror
		}

		// With a freshly unshared user namespace the child writes its own
		// mappings: first the setgroups policy, then gid_map.
		if sys.Unshareflags&CLONE_NEWUSER != 0 && sys.GidMappings != nil {
			dirfd = int(_AT_FDCWD)
			if fd1, _, err1 = RawSyscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&psetgroups[0])), uintptr(O_WRONLY), 0, 0, 0); err1 != 0 {
				goto childerror
			}
			// Only the Errno is checked; the byte count lands in pid and is
			// not inspected.
			pid, _, err1 = RawSyscall(SYS_WRITE, fd1, uintptr(unsafe.Pointer(&setgroups[0])), uintptr(len(setgroups)))
			if err1 != 0 {
				goto childerror
			}
			if _, _, err1 = RawSyscall(SYS_CLOSE, fd1, 0, 0); err1 != 0 {
				goto childerror
			}

			if fd1, _, err1 = RawSyscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&pgid[0])), uintptr(O_WRONLY), 0, 0, 0); err1 != 0 {
				goto childerror
			}
			pid, _, err1 = RawSyscall(SYS_WRITE, fd1, uintptr(unsafe.Pointer(&gidmap[0])), uintptr(len(gidmap)))
			if err1 != 0 {
				goto childerror
			}
			if _, _, err1 = RawSyscall(SYS_CLOSE, fd1, 0, 0); err1 != 0 {
				goto childerror
			}
		}

		// Then uid_map.
		if sys.Unshareflags&CLONE_NEWUSER != 0 && sys.UidMappings != nil {
			dirfd = int(_AT_FDCWD)
			if fd1, _, err1 = RawSyscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&puid[0])), uintptr(O_WRONLY), 0, 0, 0); err1 != 0 {
				goto childerror
			}
			pid, _, err1 = RawSyscall(SYS_WRITE, fd1, uintptr(unsafe.Pointer(&uidmap[0])), uintptr(len(uidmap)))
			if err1 != 0 {
				goto childerror
			}
			if _, _, err1 = RawSyscall(SYS_CLOSE, fd1, 0, 0); err1 != 0 {
				goto childerror
			}
		}

		// After unsharing the mount namespace, remount "/" recursively as
		// private (source "none", flags MS_REC|MS_PRIVATE).
		// NOTE(review): presumably this stops mount events propagating
		// to or from shared mounts — confirm against mount(2).
		if sys.Unshareflags&CLONE_NEWNS == CLONE_NEWNS {
			_, _, err1 = RawSyscall6(SYS_MOUNT, uintptr(unsafe.Pointer(&none[0])), uintptr(unsafe.Pointer(&slash[0])), 0, MS_REC|MS_PRIVATE, 0, 0)
			if err1 != 0 {
				goto childerror
			}
		}
	}

	// Chroot.
	if chroot != nil {
		_, _, err1 = RawSyscall(SYS_CHROOT, uintptr(unsafe.Pointer(chroot)), 0, 0)
		if err1 != 0 {
			goto childerror
		}
	}

	// User and groups.
	if cred = sys.Credential; cred != nil {
		ngroups = uintptr(len(cred.Groups))
		groups = uintptr(0)
		if ngroups > 0 {
			groups = uintptr(unsafe.Pointer(&cred.Groups[0]))
		}
		// Skip setgroups when the credential opts out (NoSetGroups), or when
		// gid mappings are in use with setgroups denied and there are no
		// groups to set.
		if !(sys.GidMappings != nil && !sys.GidMappingsEnableSetgroups && ngroups == 0) && !cred.NoSetGroups {
			_, _, err1 = RawSyscall(_SYS_setgroups, ngroups, groups, 0)
			if err1 != 0 {
				goto childerror
			}
		}
		// Set the gid before the uid.
		_, _, err1 = RawSyscall(sys_SETGID, uintptr(cred.Gid), 0, 0)
		if err1 != 0 {
			goto childerror
		}
		_, _, err1 = RawSyscall(sys_SETUID, uintptr(cred.Uid), 0, 0)
		if err1 != 0 {
			goto childerror
		}
	}

	if len(sys.AmbientCaps) != 0 {
		// Read the current capability sets using the version-3 header.
		caps.hdr.version = _LINUX_CAPABILITY_VERSION_3

		if _, _, err1 = RawSyscall(SYS_CAPGET, uintptr(unsafe.Pointer(&caps.hdr)), uintptr(unsafe.Pointer(&caps.data[0])), 0); err1 != 0 {
			goto childerror
		}

		for _, c = range sys.AmbientCaps {
			// Add each requested capability to both the permitted and the
			// inheritable set before raising it as ambient below.
			caps.data[capToIndex(c)].permitted |= capToMask(c)
			caps.data[capToIndex(c)].inheritable |= capToMask(c)
		}

		if _, _, err1 = RawSyscall(SYS_CAPSET, uintptr(unsafe.Pointer(&caps.hdr)), uintptr(unsafe.Pointer(&caps.data[0])), 0); err1 != 0 {
			goto childerror
		}

		// Raise each capability in the ambient set.
		for _, c = range sys.AmbientCaps {
			_, _, err1 = RawSyscall6(SYS_PRCTL, PR_CAP_AMBIENT, uintptr(PR_CAP_AMBIENT_RAISE), c, 0, 0, 0)
			if err1 != 0 {
				goto childerror
			}
		}
	}

	// Chdir.
	if dir != nil {
		_, _, err1 = RawSyscall(SYS_CHDIR, uintptr(unsafe.Pointer(dir)), 0, 0)
		if err1 != 0 {
			goto childerror
		}
	}

	// Parent death signal.
	if sys.Pdeathsig != 0 {
		_, _, err1 = RawSyscall6(SYS_PRCTL, PR_SET_PDEATHSIG, uintptr(sys.Pdeathsig), 0, 0, 0, 0)
		if err1 != 0 {
			goto childerror
		}

		// If our parent pid no longer matches the pid recorded before the
		// fork, the parent went away before the prctl took effect; deliver
		// the signal to ourselves.
		pid, _ = rawSyscallNoError(SYS_GETPPID, 0, 0, 0)
		if pid != ppid {
			pid, _ = rawSyscallNoError(SYS_GETPID, 0, 0, 0)
			_, _, err1 = RawSyscall(SYS_KILL, pid, uintptr(sys.Pdeathsig), 0)
			if err1 != 0 {
				goto childerror
			}
		}
	}

	// Pass 1: move the error pipe, and every fd[i] that is lower than its
	// target slot i, up to scratch descriptors at or above nextfd so that
	// pass 2 cannot overwrite a descriptor it still needs. The copies are
	// created close-on-exec.
	if pipe < nextfd {
		_, _, err1 = RawSyscall(SYS_DUP3, uintptr(pipe), uintptr(nextfd), O_CLOEXEC)
		if err1 != 0 {
			goto childerror
		}
		pipe = nextfd
		nextfd++
	}
	for i = 0; i < len(fd); i++ {
		if fd[i] >= 0 && fd[i] < i {
			if nextfd == pipe {
				// Do not overwrite the error pipe.
				nextfd++
			}
			_, _, err1 = RawSyscall(SYS_DUP3, uintptr(fd[i]), uintptr(nextfd), O_CLOEXEC)
			if err1 != 0 {
				goto childerror
			}
			fd[i] = nextfd
			nextfd++
		}
	}

	// Pass 2: install fd[i] as descriptor i.
	for i = 0; i < len(fd); i++ {
		if fd[i] == -1 {
			// No descriptor requested for this slot: close it, ignoring
			// any error.
			RawSyscall(SYS_CLOSE, uintptr(i), 0, 0)
			continue
		}
		if fd[i] == i {
			// Already in place; just clear its descriptor flags
			// (F_SETFD with 0) so it survives execve.
			_, _, err1 = RawSyscall(fcntl64Syscall, uintptr(fd[i]), F_SETFD, 0)
			if err1 != 0 {
				goto childerror
			}
			continue
		}

		// dup3 with flags 0: the new descriptor is not close-on-exec.
		_, _, err1 = RawSyscall(SYS_DUP3, uintptr(fd[i]), uintptr(i), 0)
		if err1 != 0 {
			goto childerror
		}
	}

	// If fewer than three descriptors were requested, close the remaining
	// ones among 0, 1 and 2 (errors ignored).
	for i = len(fd); i < 3; i++ {
		RawSyscall(SYS_CLOSE, uintptr(i), 0, 0)
	}

	// Detach descriptor 0 from its controlling terminal.
	if sys.Noctty {
		_, _, err1 = RawSyscall(SYS_IOCTL, 0, uintptr(TIOCNOTTY), 0)
		if err1 != 0 {
			goto childerror
		}
	}

	// Make descriptor Ctty (in the child's final numbering) the controlling
	// terminal.
	if sys.Setctty {
		_, _, err1 = RawSyscall(SYS_IOCTL, uintptr(sys.Ctty), uintptr(TIOCSCTTY), 1)
		if err1 != 0 {
			goto childerror
		}
	}

	// Restore the saved RLIMIT_NOFILE value, if there is one.
	if rlim != nil {
		// Read the current limit into lim. Restore rlim when that read
		// fails or when the current limit is exactly (rlim.Max-1, rlim.Max).
		// NOTE(review): presumably (Max-1, Max) is the value this package
		// itself installs at startup, so any other value means someone else
		// changed the limit and it should be left alone — confirm against
		// the code that populates origRlimitNofile.
		// The result of the restoring call is ignored, and err1 from the
		// read is overwritten by the syscalls that follow.
		_, _, err1 = RawSyscall6(SYS_PRLIMIT64, 0, RLIMIT_NOFILE, 0, uintptr(unsafe.Pointer(&lim)), 0, 0)
		if err1 != 0 || (lim.Cur == rlim.Max-1 && lim.Max == rlim.Max) {
			RawSyscall6(SYS_PRLIMIT64, 0, RLIMIT_NOFILE, uintptr(unsafe.Pointer(rlim)), 0, 0, 0)
		}
	}

	// Enable tracing last, right before execve, so the setup above is not
	// itself traced.
	if sys.Ptrace {
		_, _, err1 = RawSyscall(SYS_PTRACE, uintptr(PTRACE_TRACEME), 0, 0)
		if err1 != 0 {
			goto childerror
		}
	}

	// Exec. On success this does not return; on failure err1 holds the
	// Errno and control falls through to childerror.
	_, _, err1 = RawSyscall(SYS_EXECVE,
		uintptr(unsafe.Pointer(argv0)),
		uintptr(unsafe.Pointer(&argv[0])),
		uintptr(unsafe.Pointer(&envv[0])))

childerror:
	// Report the Errno to the parent over pipe, then exit with status 253.
	// The loop guards against the exit syscall ever returning.
	RawSyscall(SYS_WRITE, uintptr(pipe), uintptr(unsafe.Pointer(&err1)), unsafe.Sizeof(err1))
	for {
		RawSyscall(SYS_EXIT, 253, 0, 0)
	}
}
678
679 func formatIDMappings(idMap []SysProcIDMap) []byte {
680 var data []byte
681 for _, im := range idMap {
682 data = append(data, itoa.Itoa(im.ContainerID)+" "+itoa.Itoa(im.HostID)+" "+itoa.Itoa(im.Size)+"\n"...)
683 }
684 return data
685 }
686
687
688 func writeIDMappings(path string, idMap []SysProcIDMap) error {
689 fd, err := Open(path, O_RDWR, 0)
690 if err != nil {
691 return err
692 }
693
694 if _, err := Write(fd, formatIDMappings(idMap)); err != nil {
695 Close(fd)
696 return err
697 }
698
699 if err := Close(fd); err != nil {
700 return err
701 }
702
703 return nil
704 }
705
706
707
708
709
710 func writeSetgroups(pid int, enable bool) error {
711 sgf := "/proc/" + itoa.Itoa(pid) + "/setgroups"
712 fd, err := Open(sgf, O_RDWR, 0)
713 if err != nil {
714 return err
715 }
716
717 var data []byte
718 if enable {
719 data = []byte("allow")
720 } else {
721 data = []byte("deny")
722 }
723
724 if _, err := Write(fd, data); err != nil {
725 Close(fd)
726 return err
727 }
728
729 return Close(fd)
730 }
731
732
733
734 func writeUidGidMappings(pid int, sys *SysProcAttr) error {
735 if sys.UidMappings != nil {
736 uidf := "/proc/" + itoa.Itoa(pid) + "/uid_map"
737 if err := writeIDMappings(uidf, sys.UidMappings); err != nil {
738 return err
739 }
740 }
741
742 if sys.GidMappings != nil {
743
744 if err := writeSetgroups(pid, sys.GidMappingsEnableSetgroups); err != nil && err != ENOENT {
745 return err
746 }
747 gidf := "/proc/" + itoa.Itoa(pid) + "/gid_map"
748 if err := writeIDMappings(gidf, sys.GidMappings); err != nil {
749 return err
750 }
751 }
752
753 return nil
754 }
755
756
757 func forkAndExecFailureCleanup(attr *ProcAttr, sys *SysProcAttr) {
758 if sys.PidFD != nil && *sys.PidFD != -1 {
759 Close(*sys.PidFD)
760 *sys.PidFD = -1
761 }
762 }
763
764
765
766
767
768 func os_checkClonePidfd() error {
769 pidfd := int32(-1)
770 pid, errno := doCheckClonePidfd(&pidfd)
771 if errno != 0 {
772 return errno
773 }
774
775 if pidfd == -1 {
776
777
778
779 var err error
780 for {
781 var status WaitStatus
782 _, err = Wait4(int(pid), &status, 0, nil)
783 if err != EINTR {
784 break
785 }
786 }
787 if err != nil {
788 return err
789 }
790
791 return errpkg.New("clone(CLONE_PIDFD) failed to return pidfd")
792 }
793
794
795
796 defer Close(int(pidfd))
797
798 for {
799 const _P_PIDFD = 3
800 _, _, errno = Syscall6(SYS_WAITID, _P_PIDFD, uintptr(pidfd), 0, WEXITED, 0, 0)
801 if errno != EINTR {
802 break
803 }
804 }
805 if errno != 0 {
806 return errno
807 }
808
809 return nil
810 }
811
812
813
814
815
816
817
818
819
// doCheckClonePidfd performs a clone with
// CLONE_VFORK|CLONE_VM|CLONE_PIDFD|SIGCHLD, asking the kernel to store the
// new pidfd in *pidfd. In the parent it returns the child's pid and the clone
// Errno; the child exits immediately with status 0 and never returns.
//
// The call shape matches the legacy clone path in forkAndExecInChild1; keep
// the two in sync.
//
// NOTE(review): the child may share memory with the parent
// (CLONE_VFORK|CLONE_VM), so presumably nothing but the raw exit syscall may
// run on the child path — confirm before adding code there.
func doCheckClonePidfd(pidfd *int32) (pid uintptr, errno Errno) {
	flags := uintptr(CLONE_VFORK | CLONE_VM | CLONE_PIDFD | SIGCHLD)
	if runtime.GOARCH == "s390x" {
		// On s390x the flags go in the second argument slot.
		pid, errno = rawVforkSyscall(SYS_CLONE, 0, flags, uintptr(unsafe.Pointer(pidfd)))
	} else {
		pid, errno = rawVforkSyscall(SYS_CLONE, flags, 0, uintptr(unsafe.Pointer(pidfd)))
	}
	if errno != 0 || pid != 0 {
		// Clone failed, or we are the parent (pid is the child's pid).
		return
	}

	// Child: exit at once. The loop guards against the exit syscall ever
	// returning.
	for {
		RawSyscall(SYS_EXIT_GROUP, 0, 0, 0)
	}
}
842
View as plain text