Source file
src/syscall/exec_linux.go
Documentation: syscall
1
2
3
4
5
6
7
8 package syscall
9
10 import (
11 "internal/itoa"
12 "runtime"
13 "unsafe"
14 )
15
16
17
// SysProcIDMap describes one ID range of a user-namespace mapping. Each value
// is rendered by formatIDMappings as a "ContainerID HostID Size" line and
// written to a process's /proc/<pid>/uid_map or gid_map file.
type SysProcIDMap struct {
	ContainerID int // First column of the mapping line.
	HostID      int // Second column of the mapping line.
	Size        int // Third column of the mapping line.
}
23
// SysProcAttr holds the Linux-specific settings that forkAndExecInChild1
// applies in the child process between fork and exec.
type SysProcAttr struct {
	// Chroot names the directory to chroot(2) into.
	// NOTE(review): the fork code in this file receives the path through a
	// separate chroot argument; presumably the caller derives it from this
	// field — confirm against the caller.
	Chroot string
	// Credential, when non-nil, supplies the supplementary groups, GID and
	// UID that the child sets with setgroups/setgid/setuid.
	Credential *Credential
	// Ptrace makes the child issue ptrace(PTRACE_TRACEME) right before exec.
	Ptrace bool
	// Setsid makes the child call setsid(2).
	Setsid bool
	// Setpgid makes the child call setpgid(0, Pgid).
	Setpgid bool
	// Setctty makes the child issue the TIOCSCTTY ioctl on descriptor Ctty.
	// The ioctl runs after the child's descriptors have been rearranged, so
	// Ctty is interpreted as a descriptor number in the child.
	Setctty bool
	// Noctty makes the child issue the TIOCNOTTY ioctl on descriptor 0.
	Noctty bool
	// Ctty is the terminal descriptor used by Setctty and Foreground.
	Ctty int
	// Foreground makes the child issue the TIOCSPGRP ioctl on Ctty, using
	// Pgid (or the child's own PID when Pgid is 0) as the process group.
	// It also triggers the same setpgid call as Setpgid. The ioctl runs
	// before the child's descriptors are rearranged, so here Ctty is the
	// descriptor number inherited from the parent.
	Foreground bool
	// Pgid is the process group ID passed to setpgid and TIOCSPGRP.
	Pgid int
	// Pdeathsig, when non-zero, is installed with prctl(PR_SET_PDEATHSIG).
	Pdeathsig Signal
	// Cloneflags is OR'ed into the flags passed to clone(2).
	Cloneflags uintptr
	// Unshareflags, when non-zero, is passed to unshare(2) in the child.
	Unshareflags uintptr
	// UidMappings, when non-nil, is written to the child's uid_map file.
	UidMappings []SysProcIDMap
	// GidMappings, when non-nil, is written to the child's gid_map file.
	GidMappings []SysProcIDMap
	// GidMappingsEnableSetgroups selects what is written to the child's
	// setgroups file before gid_map: "allow" when true, "deny" when false.
	// It is only consulted when GidMappings is non-nil. When it is false
	// and Credential lists no groups, the setgroups call in the child is
	// skipped.
	GidMappingsEnableSetgroups bool
	// AmbientCaps lists capability numbers that the child adds to its
	// permitted and inheritable sets and then raises into the ambient set.
	AmbientCaps []uintptr
}
61
// NUL-terminated byte strings handed by pointer to mount(2) in the child:
// none is the source argument and slash the target when "/" is remounted
// with MS_REC|MS_PRIVATE after unsharing the mount namespace.
var (
	none  = [...]byte{'n', 'o', 'n', 'e', 0}
	slash = [...]byte{'/', 0}
)
66
67
// Fork hooks, declared here without a body.
// NOTE(review): presumably implemented in the runtime package and linked in
// from there — confirm.
//
// runtime_BeforeFork is called immediately before clone(2),
// runtime_AfterFork in the parent once forkAndExecInChild1 has returned, and
// runtime_AfterForkInChild in the child before it continues its setup.
func runtime_BeforeFork()
func runtime_AfterFork()
func runtime_AfterForkInChild()
71
72
73
74
75
76
77
78
79
80
81
82 func forkAndExecInChild(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *ProcAttr, sys *SysProcAttr, pipe int) (pid int, err Errno) {
83
84
85 r1, err1, p, locked := forkAndExecInChild1(argv0, argv, envv, chroot, dir, attr, sys, pipe)
86 if locked {
87 runtime_AfterFork()
88 }
89 if err1 != 0 {
90 return 0, err1
91 }
92
93
94 pid = int(r1)
95
96 if sys.UidMappings != nil || sys.GidMappings != nil {
97 Close(p[0])
98 var err2 Errno
99
100
101 if sys.Unshareflags&CLONE_NEWUSER == 0 {
102 if err := writeUidGidMappings(pid, sys); err != nil {
103 err2 = err.(Errno)
104 }
105 }
106 RawSyscall(SYS_WRITE, uintptr(p[1]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2))
107 Close(p[1])
108 }
109
110 return pid, 0
111 }
112
// _LINUX_CAPABILITY_VERSION_3 is the value stored in capHeader.version before
// the capget and capset system calls are issued.
const _LINUX_CAPABILITY_VERSION_3 = 0x20080522
114
// capHeader is the header argument passed by pointer to the capget and capset
// system calls. forkAndExecInChild1 sets only version and leaves pid at zero.
type capHeader struct {
	version uint32
	pid     int32
}
119
// capData holds one 32-bit word of each capability set exchanged with the
// capget and capset system calls.
type capData struct {
	effective   uint32
	permitted   uint32
	inheritable uint32
}
// caps bundles the header and the two data words used with capget/capset.
// The data element for a capability is selected with capToIndex and the bit
// inside it with capToMask.
type caps struct {
	hdr  capHeader
	data [2]capData
}
129
130
// capToIndex returns the index of the 32-bit capability word that holds the
// bit for capability number cap.
func capToIndex(cap uintptr) uintptr {
	const bitsPerWord = 32
	return cap / bitsPerWord
}
132
133
// capToMask returns the single-bit mask for capability number cap within its
// 32-bit capability word.
func capToMask(cap uintptr) uint32 {
	const bitsPerWord = 32
	bit := cap % bitsPerWord
	return uint32(1) << bit
}
135
136
137
138
139
140
141
142
143
144
145
146 func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *ProcAttr, sys *SysProcAttr, pipe int) (r1 uintptr, err1 Errno, p [2]int, locked bool) {
147
148 const (
149 PR_CAP_AMBIENT = 0x2f
150 PR_CAP_AMBIENT_RAISE = 0x2
151 )
152
153
154
155
156
157
158
159
160 var (
161 err2 Errno
162 nextfd int
163 i int
164 caps caps
165 fd1 uintptr
166 puid, psetgroups, pgid []byte
167 uidmap, setgroups, gidmap []byte
168 )
169
170 if sys.UidMappings != nil {
171 puid = []byte("/proc/self/uid_map\000")
172 uidmap = formatIDMappings(sys.UidMappings)
173 }
174
175 if sys.GidMappings != nil {
176 psetgroups = []byte("/proc/self/setgroups\000")
177 pgid = []byte("/proc/self/gid_map\000")
178
179 if sys.GidMappingsEnableSetgroups {
180 setgroups = []byte("allow\000")
181 } else {
182 setgroups = []byte("deny\000")
183 }
184 gidmap = formatIDMappings(sys.GidMappings)
185 }
186
187
188 ppid, _ := rawSyscallNoError(SYS_GETPID, 0, 0, 0)
189
190
191
192
193 fd := make([]int, len(attr.Files))
194 nextfd = len(attr.Files)
195 for i, ufd := range attr.Files {
196 if nextfd < int(ufd) {
197 nextfd = int(ufd)
198 }
199 fd[i] = int(ufd)
200 }
201 nextfd++
202
203
204
205 if sys.UidMappings != nil || sys.GidMappings != nil {
206 if err := forkExecPipe(p[:]); err != nil {
207 err1 = err.(Errno)
208 return
209 }
210 }
211
212
213
214 runtime_BeforeFork()
215 locked = true
216 switch {
217 case sys.Cloneflags&CLONE_NEWUSER == 0 && sys.Unshareflags&CLONE_NEWUSER == 0:
218 r1, err1 = rawVforkSyscall(SYS_CLONE, uintptr(SIGCHLD|CLONE_VFORK|CLONE_VM)|sys.Cloneflags)
219 case runtime.GOARCH == "s390x":
220 r1, _, err1 = RawSyscall6(SYS_CLONE, 0, uintptr(SIGCHLD)|sys.Cloneflags, 0, 0, 0, 0)
221 default:
222 r1, _, err1 = RawSyscall6(SYS_CLONE, uintptr(SIGCHLD)|sys.Cloneflags, 0, 0, 0, 0, 0)
223 }
224 if err1 != 0 || r1 != 0 {
225
226
227
228
229
230
231 return
232 }
233
234
235
236
237 if len(sys.AmbientCaps) > 0 {
238 _, _, err1 = RawSyscall6(SYS_PRCTL, PR_SET_KEEPCAPS, 1, 0, 0, 0, 0)
239 if err1 != 0 {
240 goto childerror
241 }
242 }
243
244
245 if sys.UidMappings != nil || sys.GidMappings != nil {
246 if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(p[1]), 0, 0); err1 != 0 {
247 goto childerror
248 }
249 r1, _, err1 = RawSyscall(SYS_READ, uintptr(p[0]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2))
250 if err1 != 0 {
251 goto childerror
252 }
253 if r1 != unsafe.Sizeof(err2) {
254 err1 = EINVAL
255 goto childerror
256 }
257 if err2 != 0 {
258 err1 = err2
259 goto childerror
260 }
261 }
262
263
264 if sys.Setsid {
265 _, _, err1 = RawSyscall(SYS_SETSID, 0, 0, 0)
266 if err1 != 0 {
267 goto childerror
268 }
269 }
270
271
272 if sys.Setpgid || sys.Foreground {
273
274 _, _, err1 = RawSyscall(SYS_SETPGID, 0, uintptr(sys.Pgid), 0)
275 if err1 != 0 {
276 goto childerror
277 }
278 }
279
280 if sys.Foreground {
281 pgrp := int32(sys.Pgid)
282 if pgrp == 0 {
283 r1, _ = rawSyscallNoError(SYS_GETPID, 0, 0, 0)
284
285 pgrp = int32(r1)
286 }
287
288
289 _, _, err1 = RawSyscall(SYS_IOCTL, uintptr(sys.Ctty), uintptr(TIOCSPGRP), uintptr(unsafe.Pointer(&pgrp)))
290 if err1 != 0 {
291 goto childerror
292 }
293 }
294
295
296
297 runtime_AfterForkInChild()
298
299
300 if sys.Unshareflags != 0 {
301 _, _, err1 = RawSyscall(SYS_UNSHARE, sys.Unshareflags, 0, 0)
302 if err1 != 0 {
303 goto childerror
304 }
305
306 if sys.Unshareflags&CLONE_NEWUSER != 0 && sys.GidMappings != nil {
307 dirfd := int(_AT_FDCWD)
308 if fd1, _, err1 = RawSyscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&psetgroups[0])), uintptr(O_WRONLY), 0, 0, 0); err1 != 0 {
309 goto childerror
310 }
311 r1, _, err1 = RawSyscall(SYS_WRITE, uintptr(fd1), uintptr(unsafe.Pointer(&setgroups[0])), uintptr(len(setgroups)))
312 if err1 != 0 {
313 goto childerror
314 }
315 if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(fd1), 0, 0); err1 != 0 {
316 goto childerror
317 }
318
319 if fd1, _, err1 = RawSyscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&pgid[0])), uintptr(O_WRONLY), 0, 0, 0); err1 != 0 {
320 goto childerror
321 }
322 r1, _, err1 = RawSyscall(SYS_WRITE, uintptr(fd1), uintptr(unsafe.Pointer(&gidmap[0])), uintptr(len(gidmap)))
323 if err1 != 0 {
324 goto childerror
325 }
326 if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(fd1), 0, 0); err1 != 0 {
327 goto childerror
328 }
329 }
330
331 if sys.Unshareflags&CLONE_NEWUSER != 0 && sys.UidMappings != nil {
332 dirfd := int(_AT_FDCWD)
333 if fd1, _, err1 = RawSyscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&puid[0])), uintptr(O_WRONLY), 0, 0, 0); err1 != 0 {
334 goto childerror
335 }
336 r1, _, err1 = RawSyscall(SYS_WRITE, uintptr(fd1), uintptr(unsafe.Pointer(&uidmap[0])), uintptr(len(uidmap)))
337 if err1 != 0 {
338 goto childerror
339 }
340 if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(fd1), 0, 0); err1 != 0 {
341 goto childerror
342 }
343 }
344
345
346
347
348
349
350
351
352 if sys.Unshareflags&CLONE_NEWNS == CLONE_NEWNS {
353 _, _, err1 = RawSyscall6(SYS_MOUNT, uintptr(unsafe.Pointer(&none[0])), uintptr(unsafe.Pointer(&slash[0])), 0, MS_REC|MS_PRIVATE, 0, 0)
354 if err1 != 0 {
355 goto childerror
356 }
357 }
358 }
359
360
361 if chroot != nil {
362 _, _, err1 = RawSyscall(SYS_CHROOT, uintptr(unsafe.Pointer(chroot)), 0, 0)
363 if err1 != 0 {
364 goto childerror
365 }
366 }
367
368
369 if cred := sys.Credential; cred != nil {
370 ngroups := uintptr(len(cred.Groups))
371 groups := uintptr(0)
372 if ngroups > 0 {
373 groups = uintptr(unsafe.Pointer(&cred.Groups[0]))
374 }
375 if !(sys.GidMappings != nil && !sys.GidMappingsEnableSetgroups && ngroups == 0) && !cred.NoSetGroups {
376 _, _, err1 = RawSyscall(_SYS_setgroups, ngroups, groups, 0)
377 if err1 != 0 {
378 goto childerror
379 }
380 }
381 _, _, err1 = RawSyscall(sys_SETGID, uintptr(cred.Gid), 0, 0)
382 if err1 != 0 {
383 goto childerror
384 }
385 _, _, err1 = RawSyscall(sys_SETUID, uintptr(cred.Uid), 0, 0)
386 if err1 != 0 {
387 goto childerror
388 }
389 }
390
391 if len(sys.AmbientCaps) != 0 {
392
393
394 caps.hdr.version = _LINUX_CAPABILITY_VERSION_3
395
396 if _, _, err1 := RawSyscall(SYS_CAPGET, uintptr(unsafe.Pointer(&caps.hdr)), uintptr(unsafe.Pointer(&caps.data[0])), 0); err1 != 0 {
397 goto childerror
398 }
399
400 for _, c := range sys.AmbientCaps {
401
402
403 caps.data[capToIndex(c)].permitted |= capToMask(c)
404 caps.data[capToIndex(c)].inheritable |= capToMask(c)
405 }
406
407 if _, _, err1 := RawSyscall(SYS_CAPSET, uintptr(unsafe.Pointer(&caps.hdr)), uintptr(unsafe.Pointer(&caps.data[0])), 0); err1 != 0 {
408 goto childerror
409 }
410
411 for _, c := range sys.AmbientCaps {
412 _, _, err1 = RawSyscall6(SYS_PRCTL, PR_CAP_AMBIENT, uintptr(PR_CAP_AMBIENT_RAISE), c, 0, 0, 0)
413 if err1 != 0 {
414 goto childerror
415 }
416 }
417 }
418
419
420 if dir != nil {
421 _, _, err1 = RawSyscall(SYS_CHDIR, uintptr(unsafe.Pointer(dir)), 0, 0)
422 if err1 != 0 {
423 goto childerror
424 }
425 }
426
427
428 if sys.Pdeathsig != 0 {
429 _, _, err1 = RawSyscall6(SYS_PRCTL, PR_SET_PDEATHSIG, uintptr(sys.Pdeathsig), 0, 0, 0, 0)
430 if err1 != 0 {
431 goto childerror
432 }
433
434
435
436
437 r1, _ = rawSyscallNoError(SYS_GETPPID, 0, 0, 0)
438 if r1 != ppid {
439 pid, _ := rawSyscallNoError(SYS_GETPID, 0, 0, 0)
440 _, _, err1 := RawSyscall(SYS_KILL, pid, uintptr(sys.Pdeathsig), 0)
441 if err1 != 0 {
442 goto childerror
443 }
444 }
445 }
446
447
448
449 if pipe < nextfd {
450 _, _, err1 = RawSyscall(SYS_DUP3, uintptr(pipe), uintptr(nextfd), O_CLOEXEC)
451 if _SYS_dup != SYS_DUP3 && err1 == ENOSYS {
452 _, _, err1 = RawSyscall(_SYS_dup, uintptr(pipe), uintptr(nextfd), 0)
453 if err1 != 0 {
454 goto childerror
455 }
456 RawSyscall(fcntl64Syscall, uintptr(nextfd), F_SETFD, FD_CLOEXEC)
457 } else if err1 != 0 {
458 goto childerror
459 }
460 pipe = nextfd
461 nextfd++
462 }
463 for i = 0; i < len(fd); i++ {
464 if fd[i] >= 0 && fd[i] < int(i) {
465 if nextfd == pipe {
466 nextfd++
467 }
468 _, _, err1 = RawSyscall(SYS_DUP3, uintptr(fd[i]), uintptr(nextfd), O_CLOEXEC)
469 if _SYS_dup != SYS_DUP3 && err1 == ENOSYS {
470 _, _, err1 = RawSyscall(_SYS_dup, uintptr(fd[i]), uintptr(nextfd), 0)
471 if err1 != 0 {
472 goto childerror
473 }
474 RawSyscall(fcntl64Syscall, uintptr(nextfd), F_SETFD, FD_CLOEXEC)
475 } else if err1 != 0 {
476 goto childerror
477 }
478 fd[i] = nextfd
479 nextfd++
480 }
481 }
482
483
484 for i = 0; i < len(fd); i++ {
485 if fd[i] == -1 {
486 RawSyscall(SYS_CLOSE, uintptr(i), 0, 0)
487 continue
488 }
489 if fd[i] == int(i) {
490
491
492 _, _, err1 = RawSyscall(fcntl64Syscall, uintptr(fd[i]), F_SETFD, 0)
493 if err1 != 0 {
494 goto childerror
495 }
496 continue
497 }
498
499
500 _, _, err1 = RawSyscall(_SYS_dup, uintptr(fd[i]), uintptr(i), 0)
501 if err1 != 0 {
502 goto childerror
503 }
504 }
505
506
507
508
509
510 for i = len(fd); i < 3; i++ {
511 RawSyscall(SYS_CLOSE, uintptr(i), 0, 0)
512 }
513
514
515 if sys.Noctty {
516 _, _, err1 = RawSyscall(SYS_IOCTL, 0, uintptr(TIOCNOTTY), 0)
517 if err1 != 0 {
518 goto childerror
519 }
520 }
521
522
523 if sys.Setctty {
524 _, _, err1 = RawSyscall(SYS_IOCTL, uintptr(sys.Ctty), uintptr(TIOCSCTTY), 1)
525 if err1 != 0 {
526 goto childerror
527 }
528 }
529
530
531
532
533 if sys.Ptrace {
534 _, _, err1 = RawSyscall(SYS_PTRACE, uintptr(PTRACE_TRACEME), 0, 0)
535 if err1 != 0 {
536 goto childerror
537 }
538 }
539
540
541 _, _, err1 = RawSyscall(SYS_EXECVE,
542 uintptr(unsafe.Pointer(argv0)),
543 uintptr(unsafe.Pointer(&argv[0])),
544 uintptr(unsafe.Pointer(&envv[0])))
545
546 childerror:
547
548 RawSyscall(SYS_WRITE, uintptr(pipe), uintptr(unsafe.Pointer(&err1)), unsafe.Sizeof(err1))
549 for {
550 RawSyscall(SYS_EXIT, 253, 0, 0)
551 }
552 }
553
554
555 func forkExecPipe(p []int) (err error) {
556 err = Pipe2(p, O_CLOEXEC)
557
558
559 if err == ENOSYS {
560 if err = Pipe(p); err != nil {
561 return
562 }
563 if _, err = fcntl(p[0], F_SETFD, FD_CLOEXEC); err != nil {
564 return
565 }
566 _, err = fcntl(p[1], F_SETFD, FD_CLOEXEC)
567 }
568 return
569 }
570
571 func formatIDMappings(idMap []SysProcIDMap) []byte {
572 var data []byte
573 for _, im := range idMap {
574 data = append(data, []byte(itoa.Itoa(im.ContainerID)+" "+itoa.Itoa(im.HostID)+" "+itoa.Itoa(im.Size)+"\n")...)
575 }
576 return data
577 }
578
579
580 func writeIDMappings(path string, idMap []SysProcIDMap) error {
581 fd, err := Open(path, O_RDWR, 0)
582 if err != nil {
583 return err
584 }
585
586 if _, err := Write(fd, formatIDMappings(idMap)); err != nil {
587 Close(fd)
588 return err
589 }
590
591 if err := Close(fd); err != nil {
592 return err
593 }
594
595 return nil
596 }
597
598
599
600
601
602 func writeSetgroups(pid int, enable bool) error {
603 sgf := "/proc/" + itoa.Itoa(pid) + "/setgroups"
604 fd, err := Open(sgf, O_RDWR, 0)
605 if err != nil {
606 return err
607 }
608
609 var data []byte
610 if enable {
611 data = []byte("allow")
612 } else {
613 data = []byte("deny")
614 }
615
616 if _, err := Write(fd, data); err != nil {
617 Close(fd)
618 return err
619 }
620
621 return Close(fd)
622 }
623
624
625
626 func writeUidGidMappings(pid int, sys *SysProcAttr) error {
627 if sys.UidMappings != nil {
628 uidf := "/proc/" + itoa.Itoa(pid) + "/uid_map"
629 if err := writeIDMappings(uidf, sys.UidMappings); err != nil {
630 return err
631 }
632 }
633
634 if sys.GidMappings != nil {
635
636 if err := writeSetgroups(pid, sys.GidMappingsEnableSetgroups); err != nil && err != ENOENT {
637 return err
638 }
639 gidf := "/proc/" + itoa.Itoa(pid) + "/gid_map"
640 if err := writeIDMappings(gidf, sys.GidMappings); err != nil {
641 return err
642 }
643 }
644
645 return nil
646 }
647
View as plain text