#with-exec + services + non-nil meta bug

1 messages ยท Page 1 of 1 (latest)

craggy heath
#

I was able to replicate the issue using:

package main

import (
    "context"
    "fmt"
    "time"

    "dagger.io/dagger/dag"
)

const alpineImage = "alpine:latest"

func main() {
    defer dag.Close()
    ctx := context.Background()
    salt := fmt.Sprintf("%v", time.Now().UnixNano())

    script := dag.Container().
        From("alpine:3.20.2").
        WithNewFile("script", "#!/bin/sh\nwhile true; do nc -l -p 8080; done").
        WithExec([]string{"/bin/sh", "-c", "chmod +x script"}).
        File("script")

    srv := dag.Container().
        From("alpine:3.20.2").
        WithNewFile("salty", salt).
        WithFile("/bin/app", script).
        WithEntrypoint([]string{"/bin/app", "via-entrypoint"}).
        WithDefaultArgs([]string{"/bin/app", "via-default-args"}).
        WithExposedPort(8080)

    s, err := dag.Container().
        From("alpine:3.20.2").
        WithServiceBinding("dagger-engine", srv.AsService()).
        WithoutDirectory("/doesnt-matter").
        WithExec([]string{"sh", "-c", "echo hello"}).
        Stdout(ctx)

    if err != nil {
        panic(err)
    }
    fmt.Printf("got %s\n", s)
}
#

The issue itself feels like a deadlock of sorts

#

one thing that jumps out is I was seeing this failure in the client logs:

11  : remotes.docker.resolver.HTTPRequest
12  : โ”† HTTP HEAD
12  : โ”† HTTP HEAD ERROR [0.3s]
11  : remotes.docker.resolver.HTTPRequest ERROR [0.3s]
#

but it doesn't actually cause the client to quit, nor do any retries occur

#

it seems like

func (s *Server) Resolve(ctx context.Context, self Object, sels ...Selection) (map[string]any, error) {

never gets the error, so it just deadlocks in this function on the pool.Wait call

craggy heath
#

so I put in the following in the buildkit scheduler to confirm there's no edges that are getting stuck in it:

    go func() {
        for {
            time.Sleep(time.Second * 30)
            s.mu.Lock()
            fmt.Printf("ACB scheduler debug %v; %d edges in waitq\n", time.Now(), len(s.waitq))
            for e := range s.waitq {
                fmt.Printf("ACB scheduler debug waiting on %+v %s\n", e, e.edge.Vertex.Name())
            }
            s.mu.Unlock()
        }
    }()
#

but there's a call to file.Stat that's getting stuck:

    func (file *File) Stat(ctx context.Context) (*fstypes.Stat, error) {
                ...
        fmt.Printf("ACB Stat called on %s LLB %+v\n", file.File, file.LLB)
        return ref.StatFile(ctx, bkgw.StatRequest{
            Path: file.File,
        })
craggy heath
#

I also instrumented the wrapped call the scheduler triggers:

@@ -75,11 +76,14 @@ func NewWithFunction[Payload, Value any](f func(context.Context) (Value, error))
        }
 
        return p, func() {
+               fmt.Printf("ACB pipe %v called\n", f)
                res, err := f(ctx)
                if err != nil {
+                       fmt.Printf("ACB pipe %v err=%v\n", f, err)
                        p.Sender.Finalize(*new(Value), err)
                        return
                }
+               fmt.Printf("ACB pipe %v ok\n", f)
                p.Sender.Finalize(res, nil)
        }
 }

the err is never hit, and I have 31 calls, and 30 ok -> so clearly there's one that's blocking.

#

I then did a docker exec -ti dagger-engine.dev sh -c 'kill -s SIGQUIT 1' and I can see the blocked call:

goroutine 14004 gp=0xc001c3c380 m=nil [select, 4 minutes]:
runtime.gopark(0xc001741b40?, 0x2?, 0x4b?, 0xf9?, 0xc001741aec?)
        /usr/lib/go/src/runtime/proc.go:435 +0xce fp=0xc001741968 sp=0xc001741948 pc=0x479c0e
runtime.selectgo(0xc001741b40, 0xc001741ae8, 0x0?, 0x0, 0x2ddf260?, 0x1)
        /usr/lib/go/src/runtime/select.go:351 +0x837 fp=0xc001741aa0 sp=0xc001741968 pc=0x4574d7
github.com/moby/buildkit/util/flightcontrol.(*call[...]).wait(0x3a3ef20, {0x3a13f70, 0xc0003c9630})
        /app/buildkit/util/flightcontrol/flightcontrol.go:170 +0x5c7 fp=0xc001741be8 sp=0xc001741aa0 pc=0xf02ac7
github.com/moby/buildkit/util/flightcontrol.(*Group[...]).do(0x3a3c3c0, {0x3a13f70, 0xc0003c9630}, {0x32cdec5, 0x4}, 0xc0007220c0)
        /app/buildkit/util/flightcontrol/flightcontrol.go:80 +0x237 fp=0xc001741c28 sp=0xc001741be8 pc=0xf01a57
github.com/moby/buildkit/util/flightcontrol.(*Group[...]).Do(0x3a3c3c0, {0x3a13f70, 0xc0003c9630}, {0x32cdec5, 0x4}, 0xc0007220c0)
        /app/buildkit/util/flightcontrol/flightcontrol.go:38 +0x93 fp=0xc001741c90 sp=0xc001741c28 pc=0xf01ef3
...
#
...
github.com/moby/buildkit/solver.(*sharedOp).Exec(0xc00165c700, {0x3a13f70, 0xc0003c9630}, {0xc0002721c0, 0x2, 0x2})
        /app/buildkit/solver/jobs.go:1095 +0x1b0 fp=0xc001741d78 sp=0xc001741c90 pc=0xef36d0
github.com/moby/buildkit/solver.(*edge).execOp(0xc000a38c80, {0x3a13f70, 0xc0003c9630})
        /app/buildkit/solver/edge.go:910 +0x183 fp=0xc001741ee8 sp=0xc001741d78 pc=0xee3563
github.com/moby/buildkit/solver.(*edge).execOp-fm({0x3a13f70?, 0xc0003c9630?})
        <autogenerated>:1 +0x33 fp=0xc001741f10 sp=0xc001741ee8 pc=0xf09b93
github.com/moby/buildkit/solver/internal/pipe.NewWithFunction[...].func2()
        /app/buildkit/solver/internal/pipe/pipe.go:80 +0xa9 fp=0xc001741fe0 sp=0xc001741f10 pc=0xf06029
runtime.goexit({})
        /usr/lib/go/src/runtime/asm_amd64.s:1700 +0x1 fp=0xc001741fe8 sp=0xc001741fe0 pc=0x481c81
created by github.com/moby/buildkit/solver.(*scheduler).newRequestWithFunc in goroutine 25
        /app/buildkit/solver/scheduler.go:300 +0x18e
#

and if i'm not mistaken, I end up under /app/engine/buildkit/executor_spec.go

#
goroutine 14008 gp=0xc001348e00 m=nil [chan receive, 4 minutes]:
runtime.gopark(0xc001186818?, 0x41101e?, 0x32?, 0xfe?, 0xc001186860?)
        /usr/lib/go/src/runtime/proc.go:435 +0xce fp=0xc001186800 sp=0xc0011867e0 pc=0x479c0e
runtime.chanrecv(0xc000034380, 0xc0011868d8, 0x1)
        /usr/lib/go/src/runtime/chan.go:664 +0x445 fp=0xc001186878 sp=0xc001186800 pc=0x411d25
runtime.chanrecv1(0x0?, 0xf94f7a?)
        /usr/lib/go/src/runtime/chan.go:506 +0x12 fp=0xc0011868a0 sp=0xc001186878 pc=0x4118b2
github.com/containerd/go-runc.(*defaultMonitor).Wait(0xc0015265a0?, 0xc0006dcc00?, 0xc0013c4be0?)
        /go/pkg/mod/github.com/containerd/go-runc@v1.1.0/monitor.go:116 +0x2d fp=0xc001186910 sp=0xc0011868a0 pc=0xf9516d
github.com/containerd/go-runc.(*Runc).Run(0xc000051c20, {0x3a13f70, 0xc0013c4be0}, {0xc0014ed281, 0x19}, {0xc001a87ac0?, 0x761ddaff4048?}, 0x761e21cf1a78?)
        /go/pkg/mod/github.com/containerd/go-runc@v1.1.0/runc.go:343 +0x438 fp=0xc001186aa8 sp=0xc001186910 pc=0xf97778
github.com/dagger/dagger/engine/buildkit.(*Worker).runContainer.func7({0x3a13f70, 0xc0013c4be0}, 0xc000051c20?, {0x3a1ac00?, 0xc0015265a0?}, {0x0?, 0x0?})
        /app/engine/buildkit/executor_spec.go:1186 +0x109 fp=0xc001186b88 sp=0xc001186aa8 pc=0x13afa69
...
#
github.com/dagger/dagger/engine/buildkit.(*Worker).callWithIO(0x2b73a00?, {0x3a13f38?, 0xc001a80db0?}, 0xc00145d680, 0xc0004fa820, {0xc000051c20, {0xc0014ed281, 0x19}, {0x0, 0x0}, ...}, ...)
        /app/engine/buildkit/executor.go:748 +0x97e fp=0xc001186db0 sp=0xc001186b88 pc=0x13a32fe
github.com/dagger/dagger/engine/buildkit.(*Worker).runContainer(0xc0013c4320, {0x3a13f38, 0xc001a80db0}, 0xc000367040)
        /app/engine/buildkit/executor_spec.go:1194 +0xb8b fp=0xc001187178 sp=0xc001186db0 pc=0x13aef8b
github.com/dagger/dagger/engine/buildkit.(*Worker).runContainer-fm({0x3a13f38?, 0xc001a80db0?}, 0xc000367040?)
        <autogenerated>:1 +0x36 fp=0xc0011871a8 sp=0xc001187178 pc=0x13e29b6
github.com/dagger/dagger/engine/buildkit.(*Worker).run(0xc0013c4320, {0x3a13f38, 0xc001a80db0}, 0xc000367040, {0xc001187488, 0x10, 0x5?})
        /app/engine/buildkit/executor.go:225 +0x18d fp=0xc001187250 sp=0xc0011871a8 pc=0x139e6cd
github.com/dagger/dagger/engine/buildkit.(*Worker).Run(_, {_, _}, {_, _}, {{0x39e6280, 0xc000272580}, {0x0, 0x0}, {0x0, ...}, ...}, ...)
        /app/engine/buildkit/executor.go:177 +0x617 fp=0xc001187518 sp=0xc001187250 pc=0x139e477
github.com/moby/buildkit/solver/llbsolver/ops.(*ExecOp).Exec(0xc001f33100, {0x3a13f38, 0xc001a80db0}, {0x39e6300, 0xc0002723e0}, {0xc0002721c0, 0x2, 0x2})
        /app/buildkit/solver/llbsolver/ops/exec.go:493 +0x12ad fp=0xc001187c68 sp=0xc001187518 pc=0x12d8fad
github.com/moby/buildkit/solver.(*sharedOp).Exec.func2({0x3a13f70, 0xc0003c9720})
        /app/buildkit/solver/jobs.go:1122 +0x632 fp=0xc001187e78 sp=0xc001187c68 pc=0xef3fd2
...
#
github.com/moby/buildkit/util/flightcontrol.(*call[...]).run(0x3a3ef20)
        /app/buildkit/util/flightcontrol/flightcontrol.go:123 +0x1fe fp=0xc001187f48 sp=0xc001187e78 pc=0xf02fbe
github.com/moby/buildkit/util/flightcontrol.(*call[...]).wait.func1()
        /app/buildkit/util/flightcontrol/flightcontrol.go:168 +0x25 fp=0xc001187f68 sp=0xc001187f48 pc=0xf02d25
sync.(*Once).doSlow(0xc001a80180?, 0xc001a80198?)
        /usr/lib/go/src/sync/once.go:78 +0xab fp=0xc001187fc0 sp=0xc001187f68 pc=0x48c46b
sync.(*Once).Do(...)
        /usr/lib/go/src/sync/once.go:69
github.com/moby/buildkit/util/flightcontrol.(*call[...]).wait.gowrap2()
        /app/buildkit/util/flightcontrol/flightcontrol.go:168 +0x2a fp=0xc001187fe0 sp=0xc001187fc0 pc=0xf02cca
runtime.goexit({})
        /usr/lib/go/src/runtime/asm_amd64.s:1700 +0x1 fp=0xc001187fe8 sp=0xc001187fe0 pc=0x481c81
created by github.com/moby/buildkit/util/flightcontrol.(*call[...]).wait in goroutine 14004
        /app/buildkit/util/flightcontrol/flightcontrol.go:168 +0x565
#

ok so let's figure out what's blocking under runc:

        runcCall := func(ctx context.Context, started chan<- int, io runc.IO, pidfile string) error {
+               fmt.Printf("ACB runc.Run id=%s bundle=%s\n", state.id, bundle)
                _, err := w.runc.Run(ctx, state.id, bundle, &runc.CreateOpts{
                        Started:   started,
                        IO:        io,
                        ExtraArgs: []string{"--keep"},
                })
+               fmt.Printf("ACB runc.Run id=%s bundle=%s err=%v\n", state.id, bundle, err)
                return err
        }

and here it is: we are stalled on

== pid: 3203575; bundle: /var/lib/dagger/worker/executor/osogryikvx45ln1wk8y5hdz2g ==
['/.init', '/bin/app', 'via-default-args']
#

so my best guess is there's something in dagops that isn't closing the bound services.

craggy heath
#

so I added a printf under:

    stopSvc := func(ctx context.Context, force bool) error {
        fmt.Printf("ACB stopSvc called %s from %s\n", execOp.Meta.Args, debug.Stack())

and it's not being called.

#

@inland sleet does anything obvious jump out at you regarding this blocking call? I think it's related to bound services not shutting down, but I'm not certain that's the actual cause (or just a side effect)

inland sleet
#

Like the file.Stat could be triggering an evaluate and hitting that deadlock

inland sleet
#

Could be 100% wrong but for some reason it feels similar

#

The service container being around might be a red herring since those will keeping running while the session is still up

craggy heath
#

ah, yeah I was wondering about that case.

#

in my repro case (at the top of this thread), as soon as I comment out WithoutDirectory("/doesnt-matter")., the only wrapped dagop, it works fine.

craggy heath
#
...
github.com/moby/buildkit/util/flightcontrol.(*Group[...]).Do(0x3a3db20, {0x3a156d0, 0xc000d18c30}, {0x32ceee5, 0x4}, 0xc000305bc0)
    /app/buildkit/util/flightcontrol/flightcontrol.go:38 +0x93 fp=0xc001418c90 sp=0xc001418c28 pc=0xf01ef3
github.com/moby/buildkit/solver.(*sharedOp).Exec(0xc000f3e460, {0x3a156d0, 0xc000d18c30}, {0xc00038e4c0, 0x2, 0x2})    <-------------------- yup!
    /app/buildkit/solver/jobs.go:1095 +0x1b0 fp=0xc001418d78 sp=0xc001418c90 pc=0xef36d0
github.com/moby/buildkit/solver.(*edge).execOp(0xc000724b40, {0x3a156d0, 0xc000d18c30})
    /app/buildkit/solver/edge.go:910 +0x183 fp=0xc001418ee8 sp=0xc001418d78 pc=0xee3563
github.com/moby/buildkit/solver.(*edge).execOp-fm({0x3a156d0?, 0xc000d18c30?})
    <autogenerated>:1 +0x33 fp=0xc001418f10 sp=0xc001418ee8 pc=0xf09b93
github.com/moby/buildkit/solver/internal/pipe.NewWithFunction[...].func2()
    /app/buildkit/solver/internal/pipe/pipe.go:80 +0xa9 fp=0xc001418fe0 sp=0xc001418f10 pc=0xf06029
runtime.goexit({})
    /usr/lib/go/src/runtime/asm_amd64.s:1700 +0x1 fp=0xc001418fe8 sp=0xc001418fe0 pc=0x481c81
created by github.com/moby/buildkit/solver.(*scheduler).newRequestWithFunc in goroutine 42
    /app/buildkit/solver/scheduler.go:300 +0x18e
#

side note, argggg discord for not letting me paste the whole stack trace due to length ๐Ÿ˜ฆ

craggy heath
#

also have a second one under:

github.com/moby/buildkit/solver/llbsolver/ops.(*ExecOp).Exec(0xc00134da80, {0x3a15698, 0xc0010eb4a0}, {0x39e7a60, 0xc00038e6e0}, {0xc00038e4c0, 0x2, 0x2})
    /app/buildkit/solver/llbsolver/ops/exec.go:493 +0x12ad fp=0xc001bb5c68 sp=0xc001bb5518 pc=0x12d8fad
github.com/moby/buildkit/solver.(*sharedOp).Exec.func2({0x3a156d0, 0xc000d18d20})
    /app/buildkit/solver/jobs.go:1122 +0x632 fp=0xc001bb5e78 sp=0xc001bb5c68 pc=0xef3fd2
github.com/moby/buildkit/util/flightcontrol.(*call[...]).run(0x3a40680)
    /app/buildkit/util/flightcontrol/flightcontrol.go:123 +0x1fe fp=0xc001bb5f48 sp=0xc001bb5e78 pc=0xf02fbe
github.com/moby/buildkit/util/flightcontrol.(*call[...]).wait.func1()
    /app/buildkit/util/flightcontrol/flightcontrol.go:168 +0x25 fp=0xc001bb5f68 sp=0xc001bb5f48 pc=0xf02d25
sync.(*Once).doSlow(0x3a156d0?, 0xc001913110?)
    /usr/lib/go/src/sync/once.go:78 +0xab fp=0xc001bb5fc0 sp=0xc001bb5f68 pc=0x48c46b
sync.(*Once).Do(...)
    /usr/lib/go/src/sync/once.go:69
github.com/moby/buildkit/util/flightcontrol.(*call[...]).wait.gowrap2()
    /app/buildkit/util/flightcontrol/flightcontrol.go:168 +0x2a fp=0xc001bb5fe0 sp=0xc001bb5fc0 pc=0xf02cca
runtime.goexit({})
    /usr/lib/go/src/runtime/asm_amd64.s:1700 +0x1 fp=0xc001bb5fe8 sp=0xc001bb5fe0 pc=0x481c81
created by github.com/moby/buildkit/util/flightcontrol.(*call[...]).wait in goroutine 13006
    /app/buildkit/util/flightcontrol/flightcontrol.go:168 +0x565
#

I just tried a gist, and github gave me a 500.... double sigh.

inland sleet
#

No worries I'll pull your PR down and try to the repro myself

inland sleet
#

Hm yeah, I also saw that there's seemingly two of the service containers running.

What's also very odd is that it seems like the solver is blocked on an ExecOp, but service containers aren't even supposed to be exec-ops, they should just be gateway containers...

craggy heath
inland sleet
craggy heath
#

ohhhh, that makes a bit more sense.

inland sleet
#

So the deadlock is because it's trying to run a long-running service as an exec-op and waiting for it, but obviously the question is how it could ever be doing that...

#

It's somehow finding the Container underlying the Service that's bound and running it as an exec-op, but really struggling to think how that could happen

#

@craggy heath does that ring any bells, anything you can think of in your changes that could somehow result in that sort of confusion?

#

(i'm looking too, just checking)

craggy heath
#

honestly I have no idea ๐Ÿ˜…

inland sleet
#

yep no worries, I'll keep poking

#

good mystery we have here ๐Ÿ˜„

craggy heath
inland sleet
# craggy heath is this the location that's sort of unexpected? https://github.com/dagger/dagger...

No that would be expected for any container that needs to run.

The unexpected is:

  1. That there two containers for the service running
  2. That one of the codepaths for starting the container is going through an execop vs. just a gateway container, e.g. this stack trace:
goroutine 13656 gp=0x4001adf340 m=nil [chan receive, 1 minutes]:
runtime.gopark(0x40016e47d8?, 0x1f344?, 0xd8?, 0x47?, 0x1f3c8?)
        /usr/lib/go/src/runtime/proc.go:435 +0xc8 fp=0x40016e47b0 sp=0x40016e4790 pc=0x88858
runtime.chanrecv(0x4000565b20, 0x40016e48a0, 0x1)
        /usr/lib/go/src/runtime/chan.go:664 +0x42c fp=0x40016e4830 sp=0x40016e47b0 pc=0x1ff9c
runtime.chanrecv1(0x0?, 0x1?)
        /usr/lib/go/src/runtime/chan.go:506 +0x14 fp=0x40016e4860 sp=0x40016e4830 pc=0x1fb34
github.com/containerd/go-runc.(*defaultMonitor).Wait(0x4001a0fa70?, 0x40014b0480?, 0x40011185a0?)
        /go/pkg/mod/github.com/containerd/go-runc@v1.1.0/monitor.go:116 +0x30 fp=0x40016e48d0 sp=0x40016e4860 pc=0xac3160
github.com/containerd/go-runc.(*Runc).Run(0x400062ecf0, {0x320e830, 0x40011185a0}, {0x4001b7cce1, 0x19}, {0x4001300840?, 0x4002700008?},
 0x4001b41de8?)
        /go/pkg/mod/github.com/containerd/go-runc@v1.1.0/runc.go:343 +0x354 fp=0x40016e4a70 sp=0x40016e48d0 pc=0xac5194
github.com/dagger/dagger/engine/buildkit.(*Worker).runContainer.func7({0x320e830, 0x40011185a0}, 0x400062ecf0?, {0x3216500?, 0x4001a0fa7
0?}, {0x0?, 0x0?})
        /app/engine/buildkit/executor_spec.go:1186 +0xd0 fp=0x40016e4b50 sp=0x40016e4a70 pc=0xe72c50
github.com/dagger/dagger/engine/buildkit.(*Worker).callWithIO(0x4000378200?, {0x320e7f8?, 0x4000f8afc0?}, 0x4001002900, 0x40002910c0, {0
x400062ecf0, {0x4001b7cce1, 0x19}, {0x0, 0x0}, ...}, ...)
        /app/engine/buildkit/executor.go:748 +0x738 fp=0x40016e4d80 sp=0x40016e4b50 pc=0xe685c8
github.com/dagger/dagger/engine/buildkit.(*Worker).runContainer(0x4002485b30, {0x320e7f8, 0x4000f8afc0}, 0x40007e4780)
        /app/engine/buildkit/executor_spec.go:1194 +0x890 fp=0x40016e5140 sp=0x40016e4d80 pc=0xe72470
github.com/dagger/dagger/engine/buildkit.(*Worker).runContainer-fm({0x320e7f8?, 0x4000f8afc0?}, 0x40007e4780?)
        <autogenerated>:1 +0x40 fp=0x40016e5170 sp=0x40016e5140 pc=0xe9fbe0
github.com/dagger/dagger/engine/buildkit.(*Worker).run(0x4002485b30, {0x320e7f8, 0x4000f8afc0}, 0x40007e4780, {0x40016e5468, 0x10, 0x0?}
)
        /app/engine/buildkit/executor.go:225 +0x144 fp=0x40016e5220 sp=0x40016e5170 pc=0xe648f4
github.com/dagger/dagger/engine/buildkit.(*Worker).Run(_, {_, _}, {_, _}, {{_, _}, {_, _}, {_, ...}, ...}, ...)
        /app/engine/buildkit/executor.go:177 +0x484 fp=0x40016e54f0 sp=0x40016e5220 pc=0xe64724
github.com/moby/buildkit/solver/llbsolver/ops.(*ExecOp).Exec(0x4000a29e80, {0x320e7f8, 0x4000f8afc0}, {0x31e0c00, 0x40002b0340}, {0x4000
2b01c0, 0x2, 0x2})
        /go/pkg/mod/github.com/dagger/buildkit@v0.0.0-20250519134855-c1734e1f6155/solver/llbsolver/ops/exec.go:493 +0xd60 fp=0x40016e5c5
0 sp=0x40016e54f0 pc=0xdbb850
github.com/moby/buildkit/solver.(*sharedOp).Exec.func2({0x320e830, 0x40012f0370})
        /go/pkg/mod/github.com/dagger/buildkit@v0.0.0-20250519134855-c1734e1f6155/solver/jobs.go:1122 +0x4c0 fp=0x40016e5e60 sp=0x40016e
5c50 pc=0xa2f430
...
#

The fact that github.com/moby/buildkit/solver/llbsolver/ops.(*ExecOp).Exec is in the stack there means it's an actual LLB exec op

#

even though it appears to be a service container

craggy heath
#

ahhhh! thanks for pointing the llbsolver part out

inland sleet
#

Yeah there's only like 10 things named Exec(Op) in the (llb)solver packages, so easy to keep track of which is which! ๐Ÿ˜›

inland sleet
#

Ran w/ this change:

diff --git a/core/container_exec.go b/core/container_exec.go
index b16fe5a9f..1dafce470 100644
--- a/core/container_exec.go
+++ b/core/container_exec.go
@@ -386,6 +386,12 @@ func (container *Container) metaFileContents(ctx context.Context, filePath strin
                return "", fmt.Errorf("%w: %s requires an exec", ErrNoCommand, filePath)
        }
 
+       opdag, err := buildkit.DefToDAG(container.Meta)
+       if err != nil {
+               return "", fmt.Errorf("convert meta to dag: %w", err)
+       }
+       fmt.Println("META OPDAG", opdag.String())
+
        file := NewFile(
                container.Meta,
                path.Join(buildkit.MetaMountDestPath, filePath),

and can see that the stdout file where things start getting blocked has LLB pb:

META OPDAG 0 <nil>
  1 &{Exec:meta:<args:"sh" args:"-c" args:"echo hello" env:"_DAGGER_HOSTNAME_ALIASES=c2otd7ldheh4o=dagger-engine" env:"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin" cwd:"/" removeMountStubsRecursive:true > mounts:<dest:"/" > mounts:<input:1 selector:"/.dagger_meta_mount" dest:"/.dagger_meta_mount" output:1 > }
    0 &{File:actions:<input:-1 secondaryInput:-1 output:-1 rm:<path:"/b1e9b1944265e99ebf2837a03a8943fd9b5880f807408b9dc003ff9afd70b341" > > actions:<input:2 output:-1 copy:<src:"/" dest:"/" mode:-1 timestamp:-1 > > actions:<input:3 output:-1 copy:<src:"/" dest:"/" mode:-1 timestamp:-1 > > actions:<input:4 secondaryInput:1 copy:<src:"/" dest:"/" mode:-1 timestamp:-1 > > }
      0 &{Source:identifier:"docker-image://docker.io/library/alpine:3.20.2@sha256:0a4eaa0eecf5f8c050e5bba433f58c052be7587ee8af3e8b3910ef9ab5fbe9f5" attrs:<key:"image.resolvemode" value:"local" > }
      0 &{Exec:meta:<args:"/bin/app" args:"via-default-args" env:"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin" cwd:"/" removeMountStubsRecursive:true > mounts:<dest:"/" > mounts:<input:1 selector:"/.dagger_meta_mount" dest:"/.dagger_meta_mount" output:1 > }
        0 &{File:actions:<input:-1 secondaryInput:-1 output:-1 rm:<path:"/044ed1657f8023356acec570c55e2194bde33adc250cb3c41b82b8e43bdca1de" > > actions:<input:1 output:-1 copy:<src:"/" dest:"/" mode:-1 timestamp:-1 > > actions:<input:2 copy:<src:"/" dest:"/" mode:-1 timestamp:-1 > > }
          0 &{File:actions:<secondaryInput:-1 mkfile:<path:"/salty" mode:420 data:"1750723506734207360" timestamp:-1 > > }
            0 &{File:actions:<secondaryInput:-1 mkdir:<path:"/" mode:493 makeParents:true timestamp:-1 > > }
              0 &{Source:identifier:"docker-image://docker.io/library/alpine:3.20.2@sha256:0a4eaa0eecf5f8c050e5bba433f58c052be7587ee8af3e8b3910ef9ab5fbe9f5" attrs:<key:"image.resolvemode" value:"local" > }
        0 &{File:actions:<input:-1 secondaryInput:-1 mkdir:<path:"/.dagger_meta_mount" mode:511 timestamp:-1 > > }
    0 &{File:actions:<input:-1 secondaryInput:-1 mkdir:<path:"/.dagger_meta_mount" mode:511 timestamp:-1 > > }
#

Hard to read obviously, but that does indeed show the LLB for that stdout file includes an ExecOp for the service

#

Trying to see how that could happen

#

Have a feeling we are using PB in a place where we should be using a Result instead, but we'll see

inland sleet
#

Maybe? But can't connect all the dots really yet

#

Checking to see if it's something involving caching, like if cache keys are somehow equal for different containers

#

I'm looking still, but honestly suspect it may make sense for you to rebase on https://github.com/dagger/dagger/pull/10457 since Justin has a bunch of fixes/improvements to Container.withExec in the context of dag-op there. Wouldn't shock me if there's a fix for this lying in there

GitHub

The continuation of #10367, converts withExec to completely avoid generating LLB, and instead lifting all the relevant functionality into dagger.
TODO:

Something about the RawDigest functionality...

#

Yeah, Justin can confirm tomorrow but I think rebasing on his PR is probably the best next step. Something is going wrong in withExec specifically for you, so I'd be afraid that any fixes you make to this would end up just replicating what he had already done

craggy heath
#

Thanks for taking a look at things today, I was starting to spin my wheels a bit.

dry wasp
#

mm i have a suspicion that the work in my PR to rework how services are called (to now not depend on llb execs might potentially sort things through)

#

a couple obvious things to make sure of:

  • make sure .Clone is called in all the right places (and that it's definitely doing the right thing) - this can cause some wild errors when it's not. looks good to me, but maybe worth checking.
  • maybe confirm that Result doesn't escape the dagop context. it's only valid there, if we're getting confused, might be worth adding a hard check for that somewhere.
#

generally, the way that services are so entangled in withExec on main is insane, and so i broke that down a lot - if it doesn't go away on top of 10457, i can dive in a bit more

craggy heath
#

Thanks for those tips.
I was able to repliace it on your PR when I introduced a call to WithSymlink (which triggers the dagop wrapper path):

    script := dag.Container().
        From("alpine:3.20.2").
        WithNewFile("script", "#!/bin/sh\nwhile true; do nc -l -p 8080; done").
        WithExec([]string{"/bin/sh", "-c", "chmod +x script"}).
        File("script")

    srv := dag.Container().
        From("alpine:3.20.2").
        WithNewFile("salty", salt).
        WithFile("/bin/app", script).
        WithSymlink("doesnt", "matter"). // enabling this breaks things
        WithEntrypoint([]string{"/bin/app", "via-entrypoint"}).
        WithDefaultArgs([]string{"/bin/app", "via-default-args"}).
        WithExposedPort(8080)

    s, err := dag.Container().
        From("alpine:3.20.2").
        WithServiceBinding("dagger-engine", srv.AsService()).
        WithExec([]string{"sh", "-c", "echo hello"}).
        Stdout(ctx)
#

However in this case, the error is slightly different:

panic: start l1v3s5i4k0g76 (aliased as dagger-engine): open /.dagger_meta_mount: no such file or directory

However it feels like the underlying issue of the Meta field containing unexpected LLB.

I can make this issue go away by setting Meta = nil in container.AsService, since that then
causes service.startContainer to recreate the propper LLB when it loops over pbmounts:

        if pbmount.Input != pb.Empty {
            st = &states[pbmount.Input]
        } else if pbmount.Dest == buildkit.MetaMountDestPath {
            v := MetaMountState(ctx, "")
            st = &v
        }

So my best guess is the order of the mounts is incorrect, and something somewhere
is not following the required mount[0] = root, mount[1] = meta ordering.

#

I couldn't find any missing clones, nor were there cases where Result or MetaResult persisted

dry wasp
#

๐Ÿค” ohhhh okay nice to know it goes away when meta gets explicitly set to nil

#

actuallllyyyy, i've seen something like this yesterday! just put 2 and 2 together

#

sometimes the output of a dagop can't be chained into a terminal with this exact error

#

i will have a dig! since i was encountering it somewhere completely different as well ๐Ÿ˜„

dry wasp
#

AHA

#

yeah, on my branch, i can replicate this perfectly using dagger-dev core container from --address=alpine with-symlink --link-name=/foo --target=/bin terminal

#
โ–ผ .terminal: Container! 0.3s ERROR
! open /.dagger_meta_mount: no such file or directory
โ•ฐโ”€โœ˜ exec sh 0.2s ERROR
  ! open /.dagger_meta_mount: no such file or directory

Container@xxh3:7d37ede75b093e09
craggy heath
#

sweet! that's a much smaller repro case!

dry wasp
#

okay, sorry got pulled into a couple regressions in v0.18.11

#

i did end up hacking this together between bits today, which clears up a lot of the Meta things. this seems to fix the repro i had above?

#

it's all just an elaborate bit of dancing, but the main bit is - just set the Meta input to nil.

#

i'll be honest. i don't really understand why this fixes the issue.

#

ah i know what it was. sending in a nil input to meta was always invalid.

#

i set it to nil in withexec to make it happen everytime there

craggy heath
dry wasp
#

the problem really is using MetaMountDestPath in the SourcePath - with llb.Scratch that doesn't work, because there's no /.dagger_meta_mount

#

so buildkit errors (kinda reasonable honestly)

#

so to fix, we can remove the SourcePath selector - then we can have a meta=nil input, which fixes it for withExec, and also every other case.

#

okay! yeah that makes sense to me ๐Ÿ˜›

craggy heath
#

I don't really understand why you would want to ever mount a different directory. wouldn't one always want a new clean directory to output meta files to?

dry wasp
#

oh

#

how interesting

#

i think that's a hack as well that i added in

#

i have a suspicion we can get rid of that? lemme quickly test

#

i think that may do it

dry wasp
#

but, the real fix is to just make sure that we can handle a nil/empty source for the meta mount

#

otherwise, we'd have to add this little weird edge case to every place we call ContainerDagOp

#

i'm about to sign off - hopefully the commit i pushed fixes things, otherwsie, feel free to keep digging ๐Ÿ™
one thing that would definitely be useful is some integration test for this behavior (chaining a from -> withSymlink and starting a service/terminal from it)

craggy heath
#

thanks for the help with this, i did a rebase against your latest branch. I'll add a test for this too.

#

I need to rerun some flaky tests, to figure out if it's a flake or not, and will keep on digging. have a good night ๐Ÿ‘‹

craggy heath
#

๐Ÿฉธ ๐ŸŸฅ ๐Ÿท

dry wasp
#

think i fixed it - if not, i'm gone now

#

good luck salute

#

@craggy heath any chance you can rename this thread ๐Ÿฅบ I keep losing it sorry ๐Ÿ˜

#

discord only allows the creator to name it

craggy heath
#

with-exec + services + nil meta bug

#

I've been wanting to rename it too, but only figured out how to right now ๐Ÿ˜…

#

with-exec + services + non-nil meta bug

craggy heath
craggy heath
#

I tried your latest change -- it passes my repro test, but unfortunately some of the other existing tests are failing, e.g.

        --- FAIL: TestContainer/TestExecStdoutStderr/stderr_without_exec (0.01s)
            testctx.go:173: 
                        Error Trace:    /app/core/integration/suite_test.go:347
                                                                /app/core/integration/container_test.go:523
                                                                /go/pkg/mod/github.com/dagger/testctx@v0.0.4/testctx.go:295
                                                                /go/pkg/mod/github.com/dagger/testctx/oteltest@v0.0.3/log.go:37
                                                                /go/pkg/mod/github.com/dagger/testctx/oteltest@v0.0.3/trace.go:83
                                                                /go/pkg/mod/github.com/dagger/testctx@v0.0.4/middleware.go:25
                                                                /go/pkg/mod/github.com/dagger/testctx@v0.0.4/testctx.go:149
                        Error:          Error "empty reference" does not contain "no command has been set"
                        Test:           TestContainer/TestExecStdoutStderr/stderr_without_exec
craggy heath
craggy heath
#

so I'm seeing a different failure:

--- FAIL: TestTelemetry (214.36s)
    --- FAIL: TestTelemetry/TestGolden (214.36s)
        --- FAIL: TestTelemetry/TestGolden/fail-log (3.86s)
...
            testctx.go:197: assertion failed: 
                --- expected
                +++ actual
                @@ -16,5 +16,5 @@
                 โ— viztest: Viztest! X.Xs
                 โ–ผ .failLog: Void X.Xs ERROR
                -โ”œโ”€โ— container: Container! X.Xs
                +โ”œโ”€โœ” container: Container! X.Xs
                 โ”œโ”€$ .from(address: "alpine"): Container! X.Xs CACHED
                 โ”œโ”€โ— .withEnvVariable(name: "NOW", value: "20XX-XX-XX XX:XX:XX.XXXX +XXXX UTC m=+X.X"): Container! X.Xs

see https://dagger.cloud/dagger/traces/c815ed9905b90a9522bc825c9deedc55#720d82a24374dd26:L75 for full logs

I've retried it 3 times now; however, I can't reproduce it locally ๐Ÿ˜ž

inland sleet
#

cc @naive gale โ— -> โœ” is not really meaningful right?

craggy heath
#

I'm seeing what happens when I try to regen it locally; although I'm still confused about the fact that it doesn't happen locally, so if I regen it (and switch โ— -> โœ”), won't it start passing in CI but failing locally?

#

if they aren't meaningful, maybe we need to add them to the scrubber?

inland sleet
naive gale
#

Huh that's weird, I don't think check marks should be showing up at all. Will take a closer look after dinner but pretty sure I swapped those out entirely for circles since they look weird in a graph view

craggy heath
#

so regenerating it locally hasn't switched anything related to icons.

it has changed this:

 โ–ผ connect X.Xs
-โ”œโ”€โ— starting engine X.Xs
+โ”œโ”€โ–ผ starting engine X.Xs
+โ”‚ โ•ฐโ”€โ–ผ create X.Xs
+โ”‚   โ”œโ”€โ–ผ exec docker ps -a --no-trunc --filter name=^dagger-engine-|^dagger-engine-v0\.18\.11$ --format {{.Names}} X.Xs
+โ”‚   โ”‚ โ”ƒ dagger-engine-v0.18.11
+โ”‚   โ•ฐโ”€โ–ผ exec docker start dagger-engine-v0.18.11 X.Xs
+โ”‚     โ”ƒ dagger-engine-v0.18.11
+โ”‚    

but I think this is a red herring to the current CI issue we're dealing with under https://github.com/dagger/dagger/pull/10569

naive gale
#

I'd suggest maybe a rebase is needed, but CI already runs against the merged result thinkspin

craggy heath
#
$ grep -RI IconSuccess
dagql/idtui/frontend.go://             // sym = out.String(IconSuccess)
dagql/idtui/symbols.go:    IconSuccess         = "โœ”"

haha yeah it's even more mysterious now ๐Ÿซ 

#

we'll have to try a rebase

naive gale
#

ohhhh

#
dagql/idtui/golden_test.go
450:        "โœ” container: Container! X.Xs",

it's ze scrubbing

craggy heath
#

sweet! I'll see if I can change that.

dry wasp
dry wasp
#

ooh, thanks for getting the telemetry one in

#

i pulled that in too (co-authored-you on the latest patch, with the idea that each patch is buildable)

#

thanks thanks โค๏ธ

craggy heath
#

nice! glad you found it ๐Ÿ™‚

#

today I'm working on other pipeline-related tasks ๐Ÿ˜