#with-exec + services + non-nil meta bug
1 messages ยท Page 1 of 1 (latest)
I was able to replicate the issue using:
package main
import (
"context"
"fmt"
"time"
"dagger.io/dagger/dag"
)
const alpineImage = "alpine:latest"
func main() {
defer dag.Close()
ctx := context.Background()
salt := fmt.Sprintf("%v", time.Now().UnixNano())
script := dag.Container().
From("alpine:3.20.2").
WithNewFile("script", "#!/bin/sh\nwhile true; do nc -l -p 8080; done").
WithExec([]string{"/bin/sh", "-c", "chmod +x script"}).
File("script")
srv := dag.Container().
From("alpine:3.20.2").
WithNewFile("salty", salt).
WithFile("/bin/app", script).
WithEntrypoint([]string{"/bin/app", "via-entrypoint"}).
WithDefaultArgs([]string{"/bin/app", "via-default-args"}).
WithExposedPort(8080)
s, err := dag.Container().
From("alpine:3.20.2").
WithServiceBinding("dagger-engine", srv.AsService()).
WithoutDirectory("/doesnt-matter").
WithExec([]string{"sh", "-c", "echo hello"}).
Stdout(ctx)
if err != nil {
panic(err)
}
fmt.Printf("got %s\n", s)
}
The issue itself feels like a deadlock of sorts
one thing that jumps out is I was seeing this failure in the client logs:
11 : remotes.docker.resolver.HTTPRequest
12 : โ HTTP HEAD
12 : โ HTTP HEAD ERROR [0.3s]
11 : remotes.docker.resolver.HTTPRequest ERROR [0.3s]
but it doesn't actually cause the client to quit, nor do any retries occur
it seems like
func (s *Server) Resolve(ctx context.Context, self Object, sels ...Selection) (map[string]any, error) {
never gets the error, so it just deadlocks in this function on the pool.Wait call
so I put in the following in the buildkit scheduler to confirm there's no edges that are getting stuck in it:
go func() {
for {
time.Sleep(time.Second * 30)
s.mu.Lock()
fmt.Printf("ACB scheduler debug %v; %d edges in waitq\n", time.Now(), len(s.waitq))
for e := range s.waitq {
fmt.Printf("ACB scheduler debug waiting on %+v %s\n", e, e.edge.Vertex.Name())
}
s.mu.Unlock()
}
}()
but there's a call to file.Stat that's getting stuck:
func (file *File) Stat(ctx context.Context) (*fstypes.Stat, error) {
...
fmt.Printf("ACB Stat called on %s LLB %+v\n", file.File, file.LLB)
return ref.StatFile(ctx, bkgw.StatRequest{
Path: file.File,
})
I also instrumented the wrapped call the scheduler triggers:
@@ -75,11 +76,14 @@ func NewWithFunction[Payload, Value any](f func(context.Context) (Value, error))
}
return p, func() {
+ fmt.Printf("ACB pipe %v called\n", f)
res, err := f(ctx)
if err != nil {
+ fmt.Printf("ACB pipe %v err=%v\n", f, err)
p.Sender.Finalize(*new(Value), err)
return
}
+ fmt.Printf("ACB pipe %v ok\n", f)
p.Sender.Finalize(res, nil)
}
}
the err is never hit, and I have 31 calls, and 30 ok -> so clearly there's one that's blocking.
I then did a docker exec -ti dagger-engine.dev sh -c 'kill -s SIGQUIT 1' and I can see the blocked call:
goroutine 14004 gp=0xc001c3c380 m=nil [select, 4 minutes]:
runtime.gopark(0xc001741b40?, 0x2?, 0x4b?, 0xf9?, 0xc001741aec?)
/usr/lib/go/src/runtime/proc.go:435 +0xce fp=0xc001741968 sp=0xc001741948 pc=0x479c0e
runtime.selectgo(0xc001741b40, 0xc001741ae8, 0x0?, 0x0, 0x2ddf260?, 0x1)
/usr/lib/go/src/runtime/select.go:351 +0x837 fp=0xc001741aa0 sp=0xc001741968 pc=0x4574d7
github.com/moby/buildkit/util/flightcontrol.(*call[...]).wait(0x3a3ef20, {0x3a13f70, 0xc0003c9630})
/app/buildkit/util/flightcontrol/flightcontrol.go:170 +0x5c7 fp=0xc001741be8 sp=0xc001741aa0 pc=0xf02ac7
github.com/moby/buildkit/util/flightcontrol.(*Group[...]).do(0x3a3c3c0, {0x3a13f70, 0xc0003c9630}, {0x32cdec5, 0x4}, 0xc0007220c0)
/app/buildkit/util/flightcontrol/flightcontrol.go:80 +0x237 fp=0xc001741c28 sp=0xc001741be8 pc=0xf01a57
github.com/moby/buildkit/util/flightcontrol.(*Group[...]).Do(0x3a3c3c0, {0x3a13f70, 0xc0003c9630}, {0x32cdec5, 0x4}, 0xc0007220c0)
/app/buildkit/util/flightcontrol/flightcontrol.go:38 +0x93 fp=0xc001741c90 sp=0xc001741c28 pc=0xf01ef3
...
...
github.com/moby/buildkit/solver.(*sharedOp).Exec(0xc00165c700, {0x3a13f70, 0xc0003c9630}, {0xc0002721c0, 0x2, 0x2})
/app/buildkit/solver/jobs.go:1095 +0x1b0 fp=0xc001741d78 sp=0xc001741c90 pc=0xef36d0
github.com/moby/buildkit/solver.(*edge).execOp(0xc000a38c80, {0x3a13f70, 0xc0003c9630})
/app/buildkit/solver/edge.go:910 +0x183 fp=0xc001741ee8 sp=0xc001741d78 pc=0xee3563
github.com/moby/buildkit/solver.(*edge).execOp-fm({0x3a13f70?, 0xc0003c9630?})
<autogenerated>:1 +0x33 fp=0xc001741f10 sp=0xc001741ee8 pc=0xf09b93
github.com/moby/buildkit/solver/internal/pipe.NewWithFunction[...].func2()
/app/buildkit/solver/internal/pipe/pipe.go:80 +0xa9 fp=0xc001741fe0 sp=0xc001741f10 pc=0xf06029
runtime.goexit({})
/usr/lib/go/src/runtime/asm_amd64.s:1700 +0x1 fp=0xc001741fe8 sp=0xc001741fe0 pc=0x481c81
created by github.com/moby/buildkit/solver.(*scheduler).newRequestWithFunc in goroutine 25
/app/buildkit/solver/scheduler.go:300 +0x18e
and if i'm not mistaken, I end up under /app/engine/buildkit/executor_spec.go
goroutine 14008 gp=0xc001348e00 m=nil [chan receive, 4 minutes]:
runtime.gopark(0xc001186818?, 0x41101e?, 0x32?, 0xfe?, 0xc001186860?)
/usr/lib/go/src/runtime/proc.go:435 +0xce fp=0xc001186800 sp=0xc0011867e0 pc=0x479c0e
runtime.chanrecv(0xc000034380, 0xc0011868d8, 0x1)
/usr/lib/go/src/runtime/chan.go:664 +0x445 fp=0xc001186878 sp=0xc001186800 pc=0x411d25
runtime.chanrecv1(0x0?, 0xf94f7a?)
/usr/lib/go/src/runtime/chan.go:506 +0x12 fp=0xc0011868a0 sp=0xc001186878 pc=0x4118b2
github.com/containerd/go-runc.(*defaultMonitor).Wait(0xc0015265a0?, 0xc0006dcc00?, 0xc0013c4be0?)
/go/pkg/mod/github.com/containerd/go-runc@v1.1.0/monitor.go:116 +0x2d fp=0xc001186910 sp=0xc0011868a0 pc=0xf9516d
github.com/containerd/go-runc.(*Runc).Run(0xc000051c20, {0x3a13f70, 0xc0013c4be0}, {0xc0014ed281, 0x19}, {0xc001a87ac0?, 0x761ddaff4048?}, 0x761e21cf1a78?)
/go/pkg/mod/github.com/containerd/go-runc@v1.1.0/runc.go:343 +0x438 fp=0xc001186aa8 sp=0xc001186910 pc=0xf97778
github.com/dagger/dagger/engine/buildkit.(*Worker).runContainer.func7({0x3a13f70, 0xc0013c4be0}, 0xc000051c20?, {0x3a1ac00?, 0xc0015265a0?}, {0x0?, 0x0?})
/app/engine/buildkit/executor_spec.go:1186 +0x109 fp=0xc001186b88 sp=0xc001186aa8 pc=0x13afa69
...
github.com/dagger/dagger/engine/buildkit.(*Worker).callWithIO(0x2b73a00?, {0x3a13f38?, 0xc001a80db0?}, 0xc00145d680, 0xc0004fa820, {0xc000051c20, {0xc0014ed281, 0x19}, {0x0, 0x0}, ...}, ...)
/app/engine/buildkit/executor.go:748 +0x97e fp=0xc001186db0 sp=0xc001186b88 pc=0x13a32fe
github.com/dagger/dagger/engine/buildkit.(*Worker).runContainer(0xc0013c4320, {0x3a13f38, 0xc001a80db0}, 0xc000367040)
/app/engine/buildkit/executor_spec.go:1194 +0xb8b fp=0xc001187178 sp=0xc001186db0 pc=0x13aef8b
github.com/dagger/dagger/engine/buildkit.(*Worker).runContainer-fm({0x3a13f38?, 0xc001a80db0?}, 0xc000367040?)
<autogenerated>:1 +0x36 fp=0xc0011871a8 sp=0xc001187178 pc=0x13e29b6
github.com/dagger/dagger/engine/buildkit.(*Worker).run(0xc0013c4320, {0x3a13f38, 0xc001a80db0}, 0xc000367040, {0xc001187488, 0x10, 0x5?})
/app/engine/buildkit/executor.go:225 +0x18d fp=0xc001187250 sp=0xc0011871a8 pc=0x139e6cd
github.com/dagger/dagger/engine/buildkit.(*Worker).Run(_, {_, _}, {_, _}, {{0x39e6280, 0xc000272580}, {0x0, 0x0}, {0x0, ...}, ...}, ...)
/app/engine/buildkit/executor.go:177 +0x617 fp=0xc001187518 sp=0xc001187250 pc=0x139e477
github.com/moby/buildkit/solver/llbsolver/ops.(*ExecOp).Exec(0xc001f33100, {0x3a13f38, 0xc001a80db0}, {0x39e6300, 0xc0002723e0}, {0xc0002721c0, 0x2, 0x2})
/app/buildkit/solver/llbsolver/ops/exec.go:493 +0x12ad fp=0xc001187c68 sp=0xc001187518 pc=0x12d8fad
github.com/moby/buildkit/solver.(*sharedOp).Exec.func2({0x3a13f70, 0xc0003c9720})
/app/buildkit/solver/jobs.go:1122 +0x632 fp=0xc001187e78 sp=0xc001187c68 pc=0xef3fd2
...
github.com/moby/buildkit/util/flightcontrol.(*call[...]).run(0x3a3ef20)
/app/buildkit/util/flightcontrol/flightcontrol.go:123 +0x1fe fp=0xc001187f48 sp=0xc001187e78 pc=0xf02fbe
github.com/moby/buildkit/util/flightcontrol.(*call[...]).wait.func1()
/app/buildkit/util/flightcontrol/flightcontrol.go:168 +0x25 fp=0xc001187f68 sp=0xc001187f48 pc=0xf02d25
sync.(*Once).doSlow(0xc001a80180?, 0xc001a80198?)
/usr/lib/go/src/sync/once.go:78 +0xab fp=0xc001187fc0 sp=0xc001187f68 pc=0x48c46b
sync.(*Once).Do(...)
/usr/lib/go/src/sync/once.go:69
github.com/moby/buildkit/util/flightcontrol.(*call[...]).wait.gowrap2()
/app/buildkit/util/flightcontrol/flightcontrol.go:168 +0x2a fp=0xc001187fe0 sp=0xc001187fc0 pc=0xf02cca
runtime.goexit({})
/usr/lib/go/src/runtime/asm_amd64.s:1700 +0x1 fp=0xc001187fe8 sp=0xc001187fe0 pc=0x481c81
created by github.com/moby/buildkit/util/flightcontrol.(*call[...]).wait in goroutine 14004
/app/buildkit/util/flightcontrol/flightcontrol.go:168 +0x565
ok so let's figure out what's blocking under runc:
runcCall := func(ctx context.Context, started chan<- int, io runc.IO, pidfile string) error {
+ fmt.Printf("ACB runc.Run id=%s bundle=%s\n", state.id, bundle)
_, err := w.runc.Run(ctx, state.id, bundle, &runc.CreateOpts{
Started: started,
IO: io,
ExtraArgs: []string{"--keep"},
})
+ fmt.Printf("ACB runc.Run id=%s bundle=%s err=%v\n", state.id, bundle, err)
return err
}
and here it is: we are stalled on
== pid: 3203575; bundle: /var/lib/dagger/worker/executor/osogryikvx45ln1wk8y5hdz2g ==
['/.init', '/bin/app', 'via-default-args']
so my best guess is there's something in dagops that isn't closing the bound services.
so I added a printf under:
stopSvc := func(ctx context.Context, force bool) error {
fmt.Printf("ACB stopSvc called %s from %s\n", execOp.Meta.Args, debug.Stack())
and it's not being called.
@inland sleet does anything obvious jump out at you regarding this blocking call? I think it's related to bound services not shutting down, but I'm not certain that's the actual cause (or just a side effect)
Which PR is this? Just so I can look at the code too
My immediate impression is that this smells similar to a deadlock I hit the other day involving dagop where I managed to get circular LLB generated which utterly deadlocked the bk scheduler, but in such a way that it was very hard to tell just from the goroutine stack traces
Like the file.Stat could be triggering an evaluate and hitting that deadlock
Could be 100% wrong but for some reason it feels similar
The service container being around might be a red herring since those will keeping running while the session is still up
ah, yeah I was wondering about that case.
in my repro case (at the top of this thread), as soon as I comment out WithoutDirectory("/doesnt-matter")., the only wrapped dagop, it works fine.
in this stack trace was anything stuck at sharedOp.Exec (here-ish https://github.com/sipsma/buildkit/blob/c234dd97ba77033c55b844e59b4ab1122e7977f1/solver/jobs.go#L1095-L1095) ?
...
github.com/moby/buildkit/util/flightcontrol.(*Group[...]).Do(0x3a3db20, {0x3a156d0, 0xc000d18c30}, {0x32ceee5, 0x4}, 0xc000305bc0)
/app/buildkit/util/flightcontrol/flightcontrol.go:38 +0x93 fp=0xc001418c90 sp=0xc001418c28 pc=0xf01ef3
github.com/moby/buildkit/solver.(*sharedOp).Exec(0xc000f3e460, {0x3a156d0, 0xc000d18c30}, {0xc00038e4c0, 0x2, 0x2}) <-------------------- yup!
/app/buildkit/solver/jobs.go:1095 +0x1b0 fp=0xc001418d78 sp=0xc001418c90 pc=0xef36d0
github.com/moby/buildkit/solver.(*edge).execOp(0xc000724b40, {0x3a156d0, 0xc000d18c30})
/app/buildkit/solver/edge.go:910 +0x183 fp=0xc001418ee8 sp=0xc001418d78 pc=0xee3563
github.com/moby/buildkit/solver.(*edge).execOp-fm({0x3a156d0?, 0xc000d18c30?})
<autogenerated>:1 +0x33 fp=0xc001418f10 sp=0xc001418ee8 pc=0xf09b93
github.com/moby/buildkit/solver/internal/pipe.NewWithFunction[...].func2()
/app/buildkit/solver/internal/pipe/pipe.go:80 +0xa9 fp=0xc001418fe0 sp=0xc001418f10 pc=0xf06029
runtime.goexit({})
/usr/lib/go/src/runtime/asm_amd64.s:1700 +0x1 fp=0xc001418fe8 sp=0xc001418fe0 pc=0x481c81
created by github.com/moby/buildkit/solver.(*scheduler).newRequestWithFunc in goroutine 42
/app/buildkit/solver/scheduler.go:300 +0x18e
side note, argggg discord for not letting me paste the whole stack trace due to length ๐ฆ
ah yeah, I use gist.github.com when needed
also have a second one under:
github.com/moby/buildkit/solver/llbsolver/ops.(*ExecOp).Exec(0xc00134da80, {0x3a15698, 0xc0010eb4a0}, {0x39e7a60, 0xc00038e6e0}, {0xc00038e4c0, 0x2, 0x2})
/app/buildkit/solver/llbsolver/ops/exec.go:493 +0x12ad fp=0xc001bb5c68 sp=0xc001bb5518 pc=0x12d8fad
github.com/moby/buildkit/solver.(*sharedOp).Exec.func2({0x3a156d0, 0xc000d18d20})
/app/buildkit/solver/jobs.go:1122 +0x632 fp=0xc001bb5e78 sp=0xc001bb5c68 pc=0xef3fd2
github.com/moby/buildkit/util/flightcontrol.(*call[...]).run(0x3a40680)
/app/buildkit/util/flightcontrol/flightcontrol.go:123 +0x1fe fp=0xc001bb5f48 sp=0xc001bb5e78 pc=0xf02fbe
github.com/moby/buildkit/util/flightcontrol.(*call[...]).wait.func1()
/app/buildkit/util/flightcontrol/flightcontrol.go:168 +0x25 fp=0xc001bb5f68 sp=0xc001bb5f48 pc=0xf02d25
sync.(*Once).doSlow(0x3a156d0?, 0xc001913110?)
/usr/lib/go/src/sync/once.go:78 +0xab fp=0xc001bb5fc0 sp=0xc001bb5f68 pc=0x48c46b
sync.(*Once).Do(...)
/usr/lib/go/src/sync/once.go:69
github.com/moby/buildkit/util/flightcontrol.(*call[...]).wait.gowrap2()
/app/buildkit/util/flightcontrol/flightcontrol.go:168 +0x2a fp=0xc001bb5fe0 sp=0xc001bb5fc0 pc=0xf02cca
runtime.goexit({})
/usr/lib/go/src/runtime/asm_amd64.s:1700 +0x1 fp=0xc001bb5fe8 sp=0xc001bb5fe0 pc=0x481c81
created by github.com/moby/buildkit/util/flightcontrol.(*call[...]).wait in goroutine 13006
/app/buildkit/util/flightcontrol/flightcontrol.go:168 +0x565
I just tried a gist, and github gave me a 500.... double sigh.
No worries I'll pull your PR down and try to the repro myself
Hm yeah, I also saw that there's seemingly two of the service containers running.
What's also very odd is that it seems like the solver is blocked on an ExecOp, but service containers aren't even supposed to be exec-ops, they should just be gateway containers...
when I printf'ed it here, I only saw one instance running, but running ps does show two on my system as well.
Yeah if you grep for e.g. Runc in the sigquit stack trace you'll see two stacks: one from a gateway container call (expected) and one from the exec op that's getting blocked on (not expected)
ohhhh, that makes a bit more sense.
So the deadlock is because it's trying to run a long-running service as an exec-op and waiting for it, but obviously the question is how it could ever be doing that...
It's somehow finding the Container underlying the Service that's bound and running it as an exec-op, but really struggling to think how that could happen
@craggy heath does that ring any bells, anything you can think of in your changes that could somehow result in that sort of confusion?
(i'm looking too, just checking)
honestly I have no idea ๐
is this the location that's sort of unexpected? https://github.com/dagger/dagger/blob/54e5848c733125f93612f2a450cce4ba10205a91/engine/buildkit/executor_spec.go#L1186-L1189
No that would be expected for any container that needs to run.
The unexpected is:
- That there two containers for the service running
- That one of the codepaths for starting the container is going through an execop vs. just a gateway container, e.g. this stack trace:
goroutine 13656 gp=0x4001adf340 m=nil [chan receive, 1 minutes]:
runtime.gopark(0x40016e47d8?, 0x1f344?, 0xd8?, 0x47?, 0x1f3c8?)
/usr/lib/go/src/runtime/proc.go:435 +0xc8 fp=0x40016e47b0 sp=0x40016e4790 pc=0x88858
runtime.chanrecv(0x4000565b20, 0x40016e48a0, 0x1)
/usr/lib/go/src/runtime/chan.go:664 +0x42c fp=0x40016e4830 sp=0x40016e47b0 pc=0x1ff9c
runtime.chanrecv1(0x0?, 0x1?)
/usr/lib/go/src/runtime/chan.go:506 +0x14 fp=0x40016e4860 sp=0x40016e4830 pc=0x1fb34
github.com/containerd/go-runc.(*defaultMonitor).Wait(0x4001a0fa70?, 0x40014b0480?, 0x40011185a0?)
/go/pkg/mod/github.com/containerd/go-runc@v1.1.0/monitor.go:116 +0x30 fp=0x40016e48d0 sp=0x40016e4860 pc=0xac3160
github.com/containerd/go-runc.(*Runc).Run(0x400062ecf0, {0x320e830, 0x40011185a0}, {0x4001b7cce1, 0x19}, {0x4001300840?, 0x4002700008?},
0x4001b41de8?)
/go/pkg/mod/github.com/containerd/go-runc@v1.1.0/runc.go:343 +0x354 fp=0x40016e4a70 sp=0x40016e48d0 pc=0xac5194
github.com/dagger/dagger/engine/buildkit.(*Worker).runContainer.func7({0x320e830, 0x40011185a0}, 0x400062ecf0?, {0x3216500?, 0x4001a0fa7
0?}, {0x0?, 0x0?})
/app/engine/buildkit/executor_spec.go:1186 +0xd0 fp=0x40016e4b50 sp=0x40016e4a70 pc=0xe72c50
github.com/dagger/dagger/engine/buildkit.(*Worker).callWithIO(0x4000378200?, {0x320e7f8?, 0x4000f8afc0?}, 0x4001002900, 0x40002910c0, {0
x400062ecf0, {0x4001b7cce1, 0x19}, {0x0, 0x0}, ...}, ...)
/app/engine/buildkit/executor.go:748 +0x738 fp=0x40016e4d80 sp=0x40016e4b50 pc=0xe685c8
github.com/dagger/dagger/engine/buildkit.(*Worker).runContainer(0x4002485b30, {0x320e7f8, 0x4000f8afc0}, 0x40007e4780)
/app/engine/buildkit/executor_spec.go:1194 +0x890 fp=0x40016e5140 sp=0x40016e4d80 pc=0xe72470
github.com/dagger/dagger/engine/buildkit.(*Worker).runContainer-fm({0x320e7f8?, 0x4000f8afc0?}, 0x40007e4780?)
<autogenerated>:1 +0x40 fp=0x40016e5170 sp=0x40016e5140 pc=0xe9fbe0
github.com/dagger/dagger/engine/buildkit.(*Worker).run(0x4002485b30, {0x320e7f8, 0x4000f8afc0}, 0x40007e4780, {0x40016e5468, 0x10, 0x0?}
)
/app/engine/buildkit/executor.go:225 +0x144 fp=0x40016e5220 sp=0x40016e5170 pc=0xe648f4
github.com/dagger/dagger/engine/buildkit.(*Worker).Run(_, {_, _}, {_, _}, {{_, _}, {_, _}, {_, ...}, ...}, ...)
/app/engine/buildkit/executor.go:177 +0x484 fp=0x40016e54f0 sp=0x40016e5220 pc=0xe64724
github.com/moby/buildkit/solver/llbsolver/ops.(*ExecOp).Exec(0x4000a29e80, {0x320e7f8, 0x4000f8afc0}, {0x31e0c00, 0x40002b0340}, {0x4000
2b01c0, 0x2, 0x2})
/go/pkg/mod/github.com/dagger/buildkit@v0.0.0-20250519134855-c1734e1f6155/solver/llbsolver/ops/exec.go:493 +0xd60 fp=0x40016e5c5
0 sp=0x40016e54f0 pc=0xdbb850
github.com/moby/buildkit/solver.(*sharedOp).Exec.func2({0x320e830, 0x40012f0370})
/go/pkg/mod/github.com/dagger/buildkit@v0.0.0-20250519134855-c1734e1f6155/solver/jobs.go:1122 +0x4c0 fp=0x40016e5e60 sp=0x40016e
5c50 pc=0xa2f430
...
The fact that github.com/moby/buildkit/solver/llbsolver/ops.(*ExecOp).Exec is in the stack there means it's an actual LLB exec op
even though it appears to be a service container
ahhhh! thanks for pointing the llbsolver part out
Yeah there's only like 10 things named Exec(Op) in the (llb)solver packages, so easy to keep track of which is which! ๐
Ran w/ this change:
diff --git a/core/container_exec.go b/core/container_exec.go
index b16fe5a9f..1dafce470 100644
--- a/core/container_exec.go
+++ b/core/container_exec.go
@@ -386,6 +386,12 @@ func (container *Container) metaFileContents(ctx context.Context, filePath strin
return "", fmt.Errorf("%w: %s requires an exec", ErrNoCommand, filePath)
}
+ opdag, err := buildkit.DefToDAG(container.Meta)
+ if err != nil {
+ return "", fmt.Errorf("convert meta to dag: %w", err)
+ }
+ fmt.Println("META OPDAG", opdag.String())
+
file := NewFile(
container.Meta,
path.Join(buildkit.MetaMountDestPath, filePath),
and can see that the stdout file where things start getting blocked has LLB pb:
META OPDAG 0 <nil>
1 &{Exec:meta:<args:"sh" args:"-c" args:"echo hello" env:"_DAGGER_HOSTNAME_ALIASES=c2otd7ldheh4o=dagger-engine" env:"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin" cwd:"/" removeMountStubsRecursive:true > mounts:<dest:"/" > mounts:<input:1 selector:"/.dagger_meta_mount" dest:"/.dagger_meta_mount" output:1 > }
0 &{File:actions:<input:-1 secondaryInput:-1 output:-1 rm:<path:"/b1e9b1944265e99ebf2837a03a8943fd9b5880f807408b9dc003ff9afd70b341" > > actions:<input:2 output:-1 copy:<src:"/" dest:"/" mode:-1 timestamp:-1 > > actions:<input:3 output:-1 copy:<src:"/" dest:"/" mode:-1 timestamp:-1 > > actions:<input:4 secondaryInput:1 copy:<src:"/" dest:"/" mode:-1 timestamp:-1 > > }
0 &{Source:identifier:"docker-image://docker.io/library/alpine:3.20.2@sha256:0a4eaa0eecf5f8c050e5bba433f58c052be7587ee8af3e8b3910ef9ab5fbe9f5" attrs:<key:"image.resolvemode" value:"local" > }
0 &{Exec:meta:<args:"/bin/app" args:"via-default-args" env:"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin" cwd:"/" removeMountStubsRecursive:true > mounts:<dest:"/" > mounts:<input:1 selector:"/.dagger_meta_mount" dest:"/.dagger_meta_mount" output:1 > }
0 &{File:actions:<input:-1 secondaryInput:-1 output:-1 rm:<path:"/044ed1657f8023356acec570c55e2194bde33adc250cb3c41b82b8e43bdca1de" > > actions:<input:1 output:-1 copy:<src:"/" dest:"/" mode:-1 timestamp:-1 > > actions:<input:2 copy:<src:"/" dest:"/" mode:-1 timestamp:-1 > > }
0 &{File:actions:<secondaryInput:-1 mkfile:<path:"/salty" mode:420 data:"1750723506734207360" timestamp:-1 > > }
0 &{File:actions:<secondaryInput:-1 mkdir:<path:"/" mode:493 makeParents:true timestamp:-1 > > }
0 &{Source:identifier:"docker-image://docker.io/library/alpine:3.20.2@sha256:0a4eaa0eecf5f8c050e5bba433f58c052be7587ee8af3e8b3910ef9ab5fbe9f5" attrs:<key:"image.resolvemode" value:"local" > }
0 &{File:actions:<input:-1 secondaryInput:-1 mkdir:<path:"/.dagger_meta_mount" mode:511 timestamp:-1 > > }
0 &{File:actions:<input:-1 secondaryInput:-1 mkdir:<path:"/.dagger_meta_mount" mode:511 timestamp:-1 > > }
Hard to read obviously, but that does indeed show the LLB for that stdout file includes an ExecOp for the service
Trying to see how that could happen
Have a feeling we are using PB in a place where we should be using a Result instead, but we'll see
Maybe? But can't connect all the dots really yet
The 0 &{File:actions:<input:-1 secondaryInput:-1 output:-1 rm:<path:"/b1e9b1944265e99ebf2837a03a8943fd9b5880f807408b9dc003ff9afd70b341" is our special custom dagop: https://github.com/sipsma/dagger/blob/279aa9667ed1bfe2714e776272e966c43ef9b12c/engine/buildkit/op.go#L72-L72
But the inputs it's being given are wrong, they are the inputs for the service exec rather than just the base alpine container
Checking to see if it's something involving caching, like if cache keys are somehow equal for different containers
I'm looking still, but honestly suspect it may make sense for you to rebase on https://github.com/dagger/dagger/pull/10457 since Justin has a bunch of fixes/improvements to Container.withExec in the context of dag-op there. Wouldn't shock me if there's a fix for this lying in there
Yeah, Justin can confirm tomorrow but I think rebasing on his PR is probably the best next step. Something is going wrong in withExec specifically for you, so I'd be afraid that any fixes you make to this would end up just replicating what he had already done
I'll try another rebase against this to see if it works with the more recent commits.
Thanks for taking a look at things today, I was starting to spin my wheels a bit.
mm i have a suspicion that the work in my PR to rework how services are called (to now not depend on llb execs might potentially sort things through)
a couple obvious things to make sure of:
- make sure
.Cloneis called in all the right places (and that it's definitely doing the right thing) - this can cause some wild errors when it's not. looks good to me, but maybe worth checking. - maybe confirm that
Resultdoesn't escape the dagop context. it's only valid there, if we're getting confused, might be worth adding a hard check for that somewhere.
generally, the way that services are so entangled in withExec on main is insane, and so i broke that down a lot - if it doesn't go away on top of 10457, i can dive in a bit more
Thanks for those tips.
I was able to repliace it on your PR when I introduced a call to WithSymlink (which triggers the dagop wrapper path):
script := dag.Container().
From("alpine:3.20.2").
WithNewFile("script", "#!/bin/sh\nwhile true; do nc -l -p 8080; done").
WithExec([]string{"/bin/sh", "-c", "chmod +x script"}).
File("script")
srv := dag.Container().
From("alpine:3.20.2").
WithNewFile("salty", salt).
WithFile("/bin/app", script).
WithSymlink("doesnt", "matter"). // enabling this breaks things
WithEntrypoint([]string{"/bin/app", "via-entrypoint"}).
WithDefaultArgs([]string{"/bin/app", "via-default-args"}).
WithExposedPort(8080)
s, err := dag.Container().
From("alpine:3.20.2").
WithServiceBinding("dagger-engine", srv.AsService()).
WithExec([]string{"sh", "-c", "echo hello"}).
Stdout(ctx)
However in this case, the error is slightly different:
panic: start l1v3s5i4k0g76 (aliased as dagger-engine): open /.dagger_meta_mount: no such file or directory
However it feels like the underlying issue of the Meta field containing unexpected LLB.
I can make this issue go away by setting Meta = nil in container.AsService, since that then
causes service.startContainer to recreate the propper LLB when it loops over pbmounts:
if pbmount.Input != pb.Empty {
st = &states[pbmount.Input]
} else if pbmount.Dest == buildkit.MetaMountDestPath {
v := MetaMountState(ctx, "")
st = &v
}
So my best guess is the order of the mounts is incorrect, and something somewhere
is not following the required mount[0] = root, mount[1] = meta ordering.
I couldn't find any missing clones, nor were there cases where Result or MetaResult persisted
My best guess is it's happening under https://github.com/jedevc/dagger/blob/fae6ff1aff62c32779a73ea781e6ce186a8c4330/core/dagop.go#L733
which gets set via https://github.com/jedevc/dagger/blob/fae6ff1aff62c32779a73ea781e6ce186a8c4330/core/dagop.go#L364
which pulls in all the LLB from https://github.com/jedevc/dagger/blob/fae6ff1aff62c32779a73ea781e6ce186a8c4330/core/dagop.go#L354
The particular output index is set via https://github.com/jedevc/dagger/blob/fae6ff1aff62c32779a73ea781e6ce186a8c4330/engine/buildkit/output.go#L32 but I'm not sure how (if) it's used when the final call to toPB() gets made.
๐ค ohhhh okay nice to know it goes away when meta gets explicitly set to nil
actuallllyyyy, i've seen something like this yesterday! just put 2 and 2 together
sometimes the output of a dagop can't be chained into a terminal with this exact error
i will have a dig! since i was encountering it somewhere completely different as well ๐
AHA
yeah, on my branch, i can replicate this perfectly using dagger-dev core container from --address=alpine with-symlink --link-name=/foo --target=/bin terminal
โผ .terminal: Container! 0.3s ERROR
! open /.dagger_meta_mount: no such file or directory
โฐโโ exec sh 0.2s ERROR
! open /.dagger_meta_mount: no such file or directory
Container@xxh3:7d37ede75b093e09
sweet! that's a much smaller repro case!
okay, sorry got pulled into a couple regressions in v0.18.11
Signed-off-by: Justin Chadwell me@jedevc.com
i did end up hacking this together between bits today, which clears up a lot of the Meta things. this seems to fix the repro i had above?
it's all just an elaborate bit of dancing, but the main bit is - just set the Meta input to nil.
i'll be honest. i don't really understand why this fixes the issue.
ah i know what it was. sending in a nil input to meta was always invalid.
i set it to nil in withexec to make it happen everytime there
this part makes me feel better ๐
the problem really is using MetaMountDestPath in the SourcePath - with llb.Scratch that doesn't work, because there's no /.dagger_meta_mount
so buildkit errors (kinda reasonable honestly)
so to fix, we can remove the SourcePath selector - then we can have a meta=nil input, which fixes it for withExec, and also every other case.
okay! yeah that makes sense to me ๐
yeah having it set to nil forces this part to run: https://github.com/alexcb/dagger/blob/aceead963010dcc167e738878777b862d21fef0f/core/service.go#L366-L367
I don't really understand why you would want to ever mount a different directory. wouldn't one always want a new clean directory to output meta files to?
oh
how interesting
i think that's a hack as well that i added in
i have a suspicion we can get rid of that? lemme quickly test
i think that may do it
yeah, we should only ever set Meta in withExec - i think the reason i added it in service there was because of this exact error
but, the real fix is to just make sure that we can handle a nil/empty source for the meta mount
otherwise, we'd have to add this little weird edge case to every place we call ContainerDagOp
i'm about to sign off - hopefully the commit i pushed fixes things, otherwsie, feel free to keep digging ๐
one thing that would definitely be useful is some integration test for this behavior (chaining a from -> withSymlink and starting a service/terminal from it)
thanks for the help with this, i did a rebase against your latest branch. I'll add a test for this too.
I need to rerun some flaky tests, to figure out if it's a flake or not, and will keep on digging. have a good night ๐
lol
no it does not fix it
\
๐ฉธ ๐ฅ ๐ท
think i fixed it - if not, i'm gone now
good luck 
@craggy heath any chance you can rename this thread ๐ฅบ I keep losing it sorry ๐
discord only allows the creator to name it
with-exec + services + nil meta bug
I've been wanting to rename it too, but only figured out how to right now ๐
with-exec + services + non-nil meta bug
Changing the startContainer function to always create a new scratch+mkdir mount (see https://github.com/alexcb/dagger/commit/4836261d0711bf5b94e55d724091da5faae355b4 ) fixes the failing tests.
I tried your latest change -- it passes my repro test, but unfortunately some of the other existing tests are failing, e.g.
--- FAIL: TestContainer/TestExecStdoutStderr/stderr_without_exec (0.01s)
testctx.go:173:
Error Trace: /app/core/integration/suite_test.go:347
/app/core/integration/container_test.go:523
/go/pkg/mod/github.com/dagger/testctx@v0.0.4/testctx.go:295
/go/pkg/mod/github.com/dagger/testctx/oteltest@v0.0.3/log.go:37
/go/pkg/mod/github.com/dagger/testctx/oteltest@v0.0.3/trace.go:83
/go/pkg/mod/github.com/dagger/testctx@v0.0.4/middleware.go:25
/go/pkg/mod/github.com/dagger/testctx@v0.0.4/testctx.go:149
Error: Error "empty reference" does not contain "no command has been set"
Test: TestContainer/TestExecStdoutStderr/stderr_without_exec
turns out we can just check for a nil meta and return the previously expected (and more clear) error message: https://github.com/alexcb/dagger/commit/cf87763de5e7ba4ba67ad1367f3ab481d7ad6f19
so I'm seeing a different failure:
--- FAIL: TestTelemetry (214.36s)
--- FAIL: TestTelemetry/TestGolden (214.36s)
--- FAIL: TestTelemetry/TestGolden/fail-log (3.86s)
...
testctx.go:197: assertion failed:
--- expected
+++ actual
@@ -16,5 +16,5 @@
โ viztest: Viztest! X.Xs
โผ .failLog: Void X.Xs ERROR
-โโโ container: Container! X.Xs
+โโโ container: Container! X.Xs
โโ$ .from(address: "alpine"): Container! X.Xs CACHED
โโโ .withEnvVariable(name: "NOW", value: "20XX-XX-XX XX:XX:XX.XXXX +XXXX UTC m=+X.X"): Container! X.Xs
see https://dagger.cloud/dagger/traces/c815ed9905b90a9522bc825c9deedc55#720d82a24374dd26:L75 for full logs
I've retried it 3 times now; however, I can't reproduce it locally ๐
Not sure why that diff is happening in the output but I wouldn't worry about it, looks likely to just be a case where we might need to regen the golden output
cc @naive gale โ -> โ is not really meaningful right?
I'm seeing what happens when I try to regen it locally; although I'm still confused about the fact that it doesn't happen locally, so if I regen it (and switch โ -> โ), won't it start passing in CI but failing locally?
if they aren't meaningful, maybe we need to add them to the scrubber?
I suspect so, but will let Alex confirm, I'm not 100% what the actual difference is between those two symbols in this case
Huh that's weird, I don't think check marks should be showing up at all. Will take a closer look after dinner but pretty sure I swapped those out entirely for circles since they look weird in a graph view
so regenerating it locally hasn't switched anything related to icons.
it has changed this:
โผ connect X.Xs
-โโโ starting engine X.Xs
+โโโผ starting engine X.Xs
+โ โฐโโผ create X.Xs
+โ โโโผ exec docker ps -a --no-trunc --filter name=^dagger-engine-|^dagger-engine-v0\.18\.11$ --format {{.Names}} X.Xs
+โ โ โ dagger-engine-v0.18.11
+โ โฐโโผ exec docker start dagger-engine-v0.18.11 X.Xs
+โ โ dagger-engine-v0.18.11
+โ
but I think this is a red herring to the current CI issue we're dealing with under https://github.com/dagger/dagger/pull/10569
confirmed this - we still have the const but it's unreferenced
I'd suggest maybe a rebase is needed, but CI already runs against the merged result 
$ grep -RI IconSuccess
dagql/idtui/frontend.go:// // sym = out.String(IconSuccess)
dagql/idtui/symbols.go: IconSuccess = "โ"
haha yeah it's even more mysterious now ๐ซ
we'll have to try a rebase
ohhhh
dagql/idtui/golden_test.go
450: "โ container: Container! X.Xs",
it's ze scrubbing
sweet! I'll see if I can change that.
@craggy heath pulled across this patch into my dagop-exec branch
oh? did your fix present other issues in the end? I took care of a few trivial tests failures under https://github.com/dagger/dagger/pull/10569/commits/49df79cf9712827525e2343eaf90513303037978 and https://github.com/dagger/dagger/pull/10569/commits/cf87763de5e7ba4ba67ad1367f3ab481d7ad6f19 which made your original fix work.