Zig: Assertion failed in codegen.cpp:2030

Created on 29 Jun 2019  ·  5 Comments  ·  Source: ziglang/zig

The following build.zig fails an assertion at codegen.cpp:2030.

const builtin = @import("builtin");
const std = @import("std");
const mem = std.mem;
const Builder = @import("std").build.Builder;

pub fn build(b: *Builder) void {
    const target = b.option([]const u8, "target", "target to build/run for") orelse "x86";
    const builtin_target = if (mem.eql(u8, target, "x86")) builtin.Arch.i386 else unreachable;
}
#0  src_assert (ok=false, source_node=0x5555632546d0) at /home/sam/repos/zig/src/analyze.cpp:7275
#1  0x0000555556454303 in ir_llvm_value (g=0x55555c5f27c0, instruction=0x555563933810)
    at /home/sam/repos/zig/src/codegen.cpp:2030
#2  0x000055555645a9e9 in ir_render_store_ptr (g=0x55555c5f27c0, executable=0x5555632553c8, 
    instruction=0x555563933a10) at /home/sam/repos/zig/src/codegen.cpp:3574
#3  0x0000555556463586 in ir_render_instruction (g=0x55555c5f27c0, executable=0x5555632553c8, 
    instruction=0x555563933a10) at /home/sam/repos/zig/src/codegen.cpp:5658
#4  0x0000555556463f60 in ir_render (g=0x55555c5f27c0, fn_entry=0x555563255270)
    at /home/sam/repos/zig/src/codegen.cpp:5834
#5  0x00005555564690ea in do_code_gen (g=0x55555c5f27c0) at /home/sam/repos/zig/src/codegen.cpp:7011
#6  0x000055555647273a in codegen_build_and_link (g=0x55555c5f27c0)
    at /home/sam/repos/zig/src/codegen.cpp:9615
#7  0x0000555556447dd9 in main (argc=2, argv=0x7fffffffe708) at /home/sam/repos/zig/src/main.cpp:581

The assertion is src_assert(instruction->value.special != ConstValSpecialRuntime, instruction->source_node)

bug

Most helpful comment

@SamTebbs33 asked for debugging tips, so on this issue I'm documenting the process I'm going through to solve it.

The first thing I did to troubleshoot this is to make an even more minimal test case which has no dependency on the standard library, and test it with build-obj:

pub fn eql(comptime T: type, a: []const T, b: []const T) bool {
    if (a.len != b.len) return false;
    for (a) |item, index| {
        if (b[index] != item) return false;
    }
    return true;
}

export fn entry() void {
    const target: []const u8 = "whatever";
    const builtin_target = if (eql(u8, target, "x86")) builtin.Arch.i386 else unreachable;
}

const builtin = @import("builtin");
pub fn panic(msg: []const u8, error_return_trace: ?*builtin.StackTrace) noreturn {
    while (true) {}
}

The part at the end with panic overrides the default panic handler, to make it trivial while(true){}. The default panic handler dumps a stack trace, which brings in quite a bit of code from the standard library.

Activate the assertion with

[nix-shell:~/dev/zig/build]$ ./zig build-obj test.zig 
when analyzing /home/andy/dev/zig/build/test.zig:11:5: assertion failed. This is a bug in the Zig compiler.
/home/andy/dev/zig/src-self-hosted/stage1.zig:37:5: 0xc04219 in stage2_panic (userland)
    @panic(ptr[0..len]);
    ^
/home/andy/dev/zig/src/analyze.cpp:8496:0: 0xbe62f5 in src_assert (/home/andy/dev/zig/src/analyze.cpp)
    stage2_panic(msg, strlen(msg));

/home/andy/dev/zig/src/codegen.cpp:1659:0: 0xb0b1cf in ir_llvm_value (/home/andy/dev/zig/src/codegen.cpp)
        src_assert(instruction->value.special != ConstValSpecialRuntime, instruction->source_node);

/home/andy/dev/zig/src/codegen.cpp:3528:0: 0xb12d81 in ir_render_store_ptr (/home/andy/dev/zig/src/codegen.cpp)
        LLVMValueRef ptr = ir_llvm_value(g, instruction->ptr);

/home/andy/dev/zig/src/codegen.cpp:5821:0: 0xb1bc36 in ir_render_instruction (/home/andy/dev/zig/src/codegen.cpp)
            return ir_render_store_ptr(g, executable, (IrInstructionStorePtr *)instruction);

/home/andy/dev/zig/src/codegen.cpp:5985:0: 0xb1c55c in ir_render (/home/andy/dev/zig/src/codegen.cpp)
            instruction->llvm_value = ir_render_instruction(g, executable, instruction);

/home/andy/dev/zig/src/codegen.cpp:7267:0: 0xb21cc1 in do_code_gen (/home/andy/dev/zig/src/codegen.cpp)
        ir_render(g, fn_table_entry);

/home/andy/dev/zig/src/codegen.cpp:9965:0: 0xb2aef7 in codegen_build_and_link (/home/andy/dev/zig/src/codegen.cpp)
            do_code_gen(g);

/home/andy/dev/zig/src/main.cpp:1216:0: 0xb03c76 in main (/home/andy/dev/zig/src/main.cpp)
                codegen_build_and_link(g);

???:?:?: 0x7f56eaebab8d in ??? (???)


???:?:?: 0x2cf6258d4c544154 in ??? (???)


Segmentation fault

The segfault at the end is a flaw in the stack trace dumping code; unrelated to the actual bug.

Next I will try to further simplify the code until I have a truly minimal case:

const Num = enum {
    One,
    Two,
};

export fn entry() void {
    var t = true;
    const x = if (t) Num.Two else unreachable;
}

const builtin = @import("builtin");
pub fn panic(msg: []const u8, error_return_trace: ?*builtin.StackTrace) noreturn {
    while (true) {}
}

Observations:

  • Changing it from const x to var x or to _ avoids the assert.
  • Changing var t to const t avoids the assert.
  • Changing unreachable to Num.One avoids the assert.

Now this is small enough to enable --verbose-ir. This is still quite a bit of output, but we can find the Zig IR for the analyzed entry function:

fn entry() { // (analyzed)
Entry_0:
    #10 | StorePtr              | void        | - | *#6 = true
    :6  | AllocaGen             | *bool       | 2 | Alloca(align=0,name=t)
    #11 | DeclVarGen            | void        | - | var t: bool align(1) = #6 // comptime = false
    #16 | VarPtr                | *bool       | 1 | &t
    #17 | LoadPtrGen            | bool        | 1 | loadptr(#16)result=(null)
    #23 | CondBr                | noreturn    | - | if (#17) $Then_21 else $Else_22
Else_22:
    #28 | Unreachable           | noreturn    | - | unreachable
Then_21:
    #25 | VarPtr                | *const type | 0 | &Num
    #32 | StorePtr              | void        | - | *#31 = Num.Two
    :31 | AllocaGen             | *Num        | 0 | Alloca(align=0,name=x)
    #34 | Br                    | noreturn    | - | goto $EndIf_29
EndIf_29:
    #37 | DeclVarGen            | void        | - | const x: Num align(1) = *Num.Two // comptime = false
    #41 | Return                | noreturn    | - | return {}
}

Now I'm going to inspect state in gdb. In this case, I choose to use gdb on zig0, because it won't bother trying to print a stack trace, and I'm using gdb anyway. Plus if I make changes, I can just do make zig0 and not have to wait for zig to be additionally linked. However it works just as well to use gdb on zig.

gdb ./zig0
(gdb) run build-obj test.zig --verbose-ir

Going up the stack from the assertion, I see that the IR instruction we were rendering at the time was in

static LLVMValueRef ir_render_store_ptr(CodeGen *g, IrExecutable *executable, IrInstructionStorePtr *instruction) {

And indeed if I print the debug id...

(gdb) p instruction.base.debug_id
$1 = 32
(gdb) p instruction.base.source_node->src()
/home/andy/dev/zig/build/test.zig:8:25

You can see that the "debug id" is 32. That's #32 in the above Zig IR dump. Indeed you can see it is a StorePtr instruction.

Unrelated, but while at it I showed how to get from an IrInstruction to print the actual zig source location using gdb. That often comes in handy.

Anyway, the problem here is that we were expecting code generation of a previous instruction to populate the llvm_value of the ptr field of the StorePtr instruction:

(gdb) p instruction.ptr.debug_id
$5 = 31
(gdb) p instruction.ptr.llvm_value
$6 = (LLVMValueRef) 0x0

So the next task is to figure out, why is the llvm_value of #31 not getting populated?

    #32 | StorePtr              | void        | - | *#31 = Num.Two
    :31 | AllocaGen             | *Num        | 0 | Alloca(align=0,name=x)

Thanks to @mikdusan's recent patch, we can see how #31 comes to be - it's an AllocaGen. These are special - the :31 rather than #31 indicates that the previous instruction references it, but it is not code-generated right there in that position. Rather, all the AllocaGen instructions are code-generated at the very beginning of a function before anything else.

So our job is to look at that code and find out why the llvm_value is not getting populated during the AllocaGen phase of code generation. I put a breakpoint here:

        if (!is_async) {
            // allocate temporary stack data
            for (size_t alloca_i = 0; alloca_i < fn_table_entry->alloca_gen_list.length; alloca_i += 1) {
                IrInstructionAllocaGen *instruction = fn_table_entry->alloca_gen_list.at(alloca_i);
                ZigType *ptr_type = instruction->base.value.type;
                assert(ptr_type->id == ZigTypeIdPointer);
                ZigType *child_type = ptr_type->data.pointer.child_type;
                if (type_resolve(g, child_type, ResolveStatusSizeKnown))
                    zig_unreachable();
                if (!type_has_bits(child_type))
                    continue;
                if (instruction->base.ref_count == 0)
                    continue;
                if (instruction->base.value.special != ConstValSpecialRuntime) {
                    if (const_ptr_pointee(nullptr, g, &instruction->base.value, nullptr)->special !=
                            ConstValSpecialRuntime)
                    {
                        continue;
                    }
                }
                if (type_resolve(g, child_type, ResolveStatusLLVMFull))
                    zig_unreachable();
                instruction->base.llvm_value = build_alloca(g, child_type, instruction->name_hint,
                        get_ptr_align(g, ptr_type));
            }
(gdb) break codegen.cpp:7085
Breakpoint 1 at 0xb1ff0b: file /home/andy/dev/zig/src/codegen.cpp, line 7085.
(gdb) run
The program being debugged has been started already.
Start it from the beginning? (y or n) y
(gdb) p fn_table_entry.symbol_name 
$2 = {
  list = {
    items = 0x65b4ae0 "panic", 
    length = 6, 
    capacity = 8
  }
}
(gdb) continue
(gdb) p fn_table_entry.symbol_name 
$3 = {
  list = {
    items = 0x65bed30 "entry", 
    length = 6, 
    capacity = 8
  }
}

I skipped over the panic function, found the entry function. I see the 2 AllocaGen instructions, and I see that the debug_id matches expectations:

(gdb) p fn_table_entry.alloca_gen_list.items[0].name_hint 
$7 = 0x65b1560 "t"
(gdb) p fn_table_entry.alloca_gen_list.items[1].name_hint 
$8 = 0x65b15c0 "x"
(gdb) p fn_table_entry.alloca_gen_list.items[1].base.debug_id
$9 = 31

So now I step through to the 2nd iteration of this loop, and try to learn why the llvm_value assignment line never gets run. (This line)

                instruction->base.llvm_value = build_alloca(g, child_type, instruction->name_hint,
                        get_ptr_align(g, ptr_type));

In the first iteration of the for loop, I see that the llvm_value of t gets assigned. So far so good. Now for x:

7086                    IrInstructionAllocaGen *instruction = fn_table_entry->alloca_gen_list.at(alloca_i);
(gdb) next
7087                    ZigType *ptr_type = instruction->base.value.type;
(gdb) 
7088                    assert(ptr_type->id == ZigTypeIdPointer);
(gdb) 
7089                    ZigType *child_type = ptr_type->data.pointer.child_type;
(gdb) 
7090                    if (type_resolve(g, child_type, ResolveStatusSizeKnown))
(gdb) 
7092                    if (!type_has_bits(child_type))
(gdb) 
7094                    if (instruction->base.ref_count == 0)
(gdb) 
7095                        continue;
(gdb) p instruction.base.ref_count 
$10 = 0

OK now we're on a hot trail. It appears that the ref_count is incorrectly zero.

Looking back at the Zig IR dump from above, we can now see the ref_count column is indeed 0 for x while it is 2 for t:

...
    :6  | AllocaGen             | *bool       | 2 | Alloca(align=0,name=t)
...
    #32 | StorePtr              | void        | - | *#31 = Num.Two
    :31 | AllocaGen             | *Num        | 0 | Alloca(align=0,name=x)
...

That's simply incorrect. Let's make sure that the StorePtr instruction is referencing the pointer:

static IrInstructionStorePtr *ir_build_store_ptr(IrBuilder *irb, Scope *scope, AstNode *source_node,
        IrInstruction *ptr, IrInstruction *value)
{
    IrInstructionStorePtr *instruction = ir_build_instruction<IrInstructionStorePtr>(irb, scope, source_node);
    instruction->base.value.special = ConstValSpecialStatic;
    instruction->base.value.type = irb->codegen->builtin_types.entry_void;
    instruction->ptr = ptr;
    instruction->value = value;

    ir_ref_instruction(ptr, irb->current_basic_block);
    ir_ref_instruction(value, irb->current_basic_block);

    return instruction;
}

Hmm. That looks fine. It is in fact referencing ptr. Let's find out if that value is getting set back to zero incorrectly. We'll set up a memory watch point on the field. So now the trick is to put a breakpoint somewhere when ir_build_store_ptr gets called for our case. This function is called frequently, so a naive breakpoint wouldn't be useful. Let's gamble on the debug_id being somewhat unique:

(gdb) break ir.cpp:1583 if instruction->base.debug_id == 32
Breakpoint 2 at 0xb34eb6: file /home/andy/dev/zig/src/ir.cpp, line 1583.
(gdb) run
The program being debugged has been started already.
Start it from the beginning? (y or n) y
Breakpoint 2, ir_build_store_ptr (irb=0x65c5a18, scope=0x65c4bc0, source_node=0x65b3660, 
    ptr=0x65c9280, value=0x65c8ed0) at /home/andy/dev/zig/src/ir.cpp:1583
1583        ir_ref_instruction(value, irb->current_basic_block);
(gdb) 

The breakpoint triggered. Let's make sure we are where we expect to be:

(gdb) p instruction->base.source_node->src()
/home/andy/dev/zig/build/test.zig:8:25

That's good, this is the breakpoint we wanted. So now, we make sure the ref count is nonzero, and then tell gdb to notify us if it changes:

(gdb) p ptr.ref_count 
$14 = 1
(gdb) p &ptr->ref_count
$15 = (size_t *) 0x65c92f0
(gdb) watch *0x65c92f0
Hardware watchpoint 3: *0x65c92f0
(gdb) continue
Continuing.
analyze #28
append new bb Then_21
resume (3,0) EndIf_20 #30
analyze #30
analyze #31

Hardware watchpoint 3: *0x65c92f0

Old value = 1
New value = 0
ir_resolve_result_raw (ira=0x65c59f0, suspend_source_instr=0x65c53d0, result_loc=0x65c4280, 
    value_type=0x65c79a0, value=0x65c8ed0, force_runtime=false, non_null_comptime=false)
    at /home/andy/dev/zig/src/ir.cpp:14982
14982                   alloca_src->base.child = alloca_gen;
(gdb) 

Aha, we found a culprit. When resolving the result location, the logic looks like this:

            ResultLocVar *result_loc_var = reinterpret_cast<ResultLocVar *>(result_loc);
            assert(result_loc->source_instruction->id == IrInstructionIdAllocaSrc);

            if (value_type->id == ZigTypeIdUnreachable || value_type->id == ZigTypeIdOpaque) {
                ir_add_error(ira, result_loc->source_instruction,
                    buf_sprintf("variable of type '%s' not allowed", buf_ptr(&value_type->name)));
                return ira->codegen->invalid_instruction;
            }

            IrInstructionAllocaSrc *alloca_src =
                reinterpret_cast<IrInstructionAllocaSrc *>(result_loc->source_instruction);
            bool force_comptime;
            if (!ir_resolve_comptime(ira, alloca_src->is_comptime->child, &force_comptime))
                return ira->codegen->invalid_instruction;
            bool is_comptime = force_comptime || (value != nullptr &&
                    value->value.special != ConstValSpecialRuntime && result_loc_var->var->gen_is_const);

            if (alloca_src->base.child == nullptr || is_comptime) {
                uint32_t align = 0;
                if (alloca_src->align != nullptr && !ir_resolve_align(ira, alloca_src->align->child, nullptr, &align)) {
                    return ira->codegen->invalid_instruction;
                }
                IrInstruction *alloca_gen;
                if (is_comptime && value != nullptr) {
                    if (align > value->value.global_refs->align) {
                        value->value.global_refs->align = align;
                    }
                    alloca_gen = ir_get_ref(ira, result_loc->source_instruction, value, true, false);
                } else {
                    alloca_gen = ir_analyze_alloca(ira, result_loc->source_instruction, value_type, align,
                            alloca_src->name_hint, force_comptime);
                }
                if (alloca_src->base.child != nullptr) {
                    alloca_src->base.child->ref_count = 0;
                }
                alloca_src->base.child = alloca_gen;
            }
            result_loc->written = true;
            result_loc->resolved_loc = is_comptime ? nullptr : alloca_src->base.child;
            return result_loc->resolved_loc;

The line alloca_src->base.child->ref_count = 0; was hit. So now the question becomes: why does analysis think that the ref_count should be set to zero? It looks like analysis here incorrectly thinks this is a constant, even though the if condition is a runtime value (because it's var t):

(gdb) p is_comptime 
$27 = true

This logic looks suspect:

            bool is_comptime = force_comptime || (value != nullptr &&
                    value->value.special != ConstValSpecialRuntime && result_loc_var->var->gen_is_const);

Now the task is to figure out how this logic can be improved to detect that the condition is runtime, not comptime.

That's what I've got so far, will make a second post with further progress.

All 5 comments

Replacing unreachable with e.g. builtin.Arch.avr prevents the failure but isn't a workaround of course.

@SamTebbs33 asked for debugging tips, so on this issue I'm documenting the process I'm going through to solve it.

The first thing I did to troubleshoot this is to make an even more minimal test case which has no dependency on the standard library, and test it with build-obj:

pub fn eql(comptime T: type, a: []const T, b: []const T) bool {
    if (a.len != b.len) return false;
    for (a) |item, index| {
        if (b[index] != item) return false;
    }
    return true;
}

export fn entry() void {
    const target: []const u8 = "whatever";
    const builtin_target = if (eql(u8, target, "x86")) builtin.Arch.i386 else unreachable;
}

const builtin = @import("builtin");
pub fn panic(msg: []const u8, error_return_trace: ?*builtin.StackTrace) noreturn {
    while (true) {}
}

The part at the end with panic overrides the default panic handler, to make it trivial while(true){}. The default panic handler dumps a stack trace, which brings in quite a bit of code from the standard library.

Activate the assertion with

[nix-shell:~/dev/zig/build]$ ./zig build-obj test.zig 
when analyzing /home/andy/dev/zig/build/test.zig:11:5: assertion failed. This is a bug in the Zig compiler.
/home/andy/dev/zig/src-self-hosted/stage1.zig:37:5: 0xc04219 in stage2_panic (userland)
    @panic(ptr[0..len]);
    ^
/home/andy/dev/zig/src/analyze.cpp:8496:0: 0xbe62f5 in src_assert (/home/andy/dev/zig/src/analyze.cpp)
    stage2_panic(msg, strlen(msg));

/home/andy/dev/zig/src/codegen.cpp:1659:0: 0xb0b1cf in ir_llvm_value (/home/andy/dev/zig/src/codegen.cpp)
        src_assert(instruction->value.special != ConstValSpecialRuntime, instruction->source_node);

/home/andy/dev/zig/src/codegen.cpp:3528:0: 0xb12d81 in ir_render_store_ptr (/home/andy/dev/zig/src/codegen.cpp)
        LLVMValueRef ptr = ir_llvm_value(g, instruction->ptr);

/home/andy/dev/zig/src/codegen.cpp:5821:0: 0xb1bc36 in ir_render_instruction (/home/andy/dev/zig/src/codegen.cpp)
            return ir_render_store_ptr(g, executable, (IrInstructionStorePtr *)instruction);

/home/andy/dev/zig/src/codegen.cpp:5985:0: 0xb1c55c in ir_render (/home/andy/dev/zig/src/codegen.cpp)
            instruction->llvm_value = ir_render_instruction(g, executable, instruction);

/home/andy/dev/zig/src/codegen.cpp:7267:0: 0xb21cc1 in do_code_gen (/home/andy/dev/zig/src/codegen.cpp)
        ir_render(g, fn_table_entry);

/home/andy/dev/zig/src/codegen.cpp:9965:0: 0xb2aef7 in codegen_build_and_link (/home/andy/dev/zig/src/codegen.cpp)
            do_code_gen(g);

/home/andy/dev/zig/src/main.cpp:1216:0: 0xb03c76 in main (/home/andy/dev/zig/src/main.cpp)
                codegen_build_and_link(g);

???:?:?: 0x7f56eaebab8d in ??? (???)


???:?:?: 0x2cf6258d4c544154 in ??? (???)


Segmentation fault

The segfault at the end is a flaw in the stack trace dumping code; unrelated to the actual bug.

Next I will try to further simplify the code until I have a truly minimal case:

const Num = enum {
    One,
    Two,
};

export fn entry() void {
    var t = true;
    const x = if (t) Num.Two else unreachable;
}

const builtin = @import("builtin");
pub fn panic(msg: []const u8, error_return_trace: ?*builtin.StackTrace) noreturn {
    while (true) {}
}

Observations:

  • Changing it from const x to var x or to _ avoids the assert.
  • Changing var t to const t avoids the assert.
  • Changing unreachable to Num.One avoids the assert.

Now this is small enough to enable --verbose-ir. This is still quite a bit of output, but we can find the Zig IR for the analyzed entry function:

fn entry() { // (analyzed)
Entry_0:
    #10 | StorePtr              | void        | - | *#6 = true
    :6  | AllocaGen             | *bool       | 2 | Alloca(align=0,name=t)
    #11 | DeclVarGen            | void        | - | var t: bool align(1) = #6 // comptime = false
    #16 | VarPtr                | *bool       | 1 | &t
    #17 | LoadPtrGen            | bool        | 1 | loadptr(#16)result=(null)
    #23 | CondBr                | noreturn    | - | if (#17) $Then_21 else $Else_22
Else_22:
    #28 | Unreachable           | noreturn    | - | unreachable
Then_21:
    #25 | VarPtr                | *const type | 0 | &Num
    #32 | StorePtr              | void        | - | *#31 = Num.Two
    :31 | AllocaGen             | *Num        | 0 | Alloca(align=0,name=x)
    #34 | Br                    | noreturn    | - | goto $EndIf_29
EndIf_29:
    #37 | DeclVarGen            | void        | - | const x: Num align(1) = *Num.Two // comptime = false
    #41 | Return                | noreturn    | - | return {}
}

Now I'm going to inspect state in gdb. In this case, I choose to use gdb on zig0, because it won't bother trying to print a stack trace, and I'm using gdb anyway. Plus if I make changes, I can just do make zig0 and not have to wait for zig to be additionally linked. However it works just as well to use gdb on zig.

gdb ./zig0
(gdb) run build-obj test.zig --verbose-ir

Going up the stack from the assertion, I see that the IR instruction we were rendering at the time was in

static LLVMValueRef ir_render_store_ptr(CodeGen *g, IrExecutable *executable, IrInstructionStorePtr *instruction) {

And indeed if I print the debug id...

(gdb) p instruction.base.debug_id
$1 = 32
(gdb) p instruction.base.source_node->src()
/home/andy/dev/zig/build/test.zig:8:25

You can see that the "debug id" is 32. That's #32 in the above Zig IR dump. Indeed you can see it is a StorePtr instruction.

Unrelated, but while at it I showed how to get from an IrInstruction to print the actual zig source location using gdb. That often comes in handy.

Anyway, the problem here is that we were expecting code generation of a previous instruction to populate the llvm_value of the ptr field of the StorePtr instruction:

(gdb) p instruction.ptr.debug_id
$5 = 31
(gdb) p instruction.ptr.llvm_value
$6 = (LLVMValueRef) 0x0

So the next task is to figure out, why is the llvm_value of #31 not getting populated?

    #32 | StorePtr              | void        | - | *#31 = Num.Two
    :31 | AllocaGen             | *Num        | 0 | Alloca(align=0,name=x)

Thanks to @mikdusan's recent patch, we can see how #31 comes to be - it's an AllocaGen. These are special - the :31 rather than #31 indicates that the previous instruction references it, but it is not code-generated right there in that position. Rather, all the AllocaGen instructions are code-generated at the very beginning of a function before anything else.

So our job is to look at that code and find out why the llvm_value is not getting populated during the AllocaGen phase of code generation. I put a breakpoint here:

        if (!is_async) {
            // allocate temporary stack data
            for (size_t alloca_i = 0; alloca_i < fn_table_entry->alloca_gen_list.length; alloca_i += 1) {
                IrInstructionAllocaGen *instruction = fn_table_entry->alloca_gen_list.at(alloca_i);
                ZigType *ptr_type = instruction->base.value.type;
                assert(ptr_type->id == ZigTypeIdPointer);
                ZigType *child_type = ptr_type->data.pointer.child_type;
                if (type_resolve(g, child_type, ResolveStatusSizeKnown))
                    zig_unreachable();
                if (!type_has_bits(child_type))
                    continue;
                if (instruction->base.ref_count == 0)
                    continue;
                if (instruction->base.value.special != ConstValSpecialRuntime) {
                    if (const_ptr_pointee(nullptr, g, &instruction->base.value, nullptr)->special !=
                            ConstValSpecialRuntime)
                    {
                        continue;
                    }
                }
                if (type_resolve(g, child_type, ResolveStatusLLVMFull))
                    zig_unreachable();
                instruction->base.llvm_value = build_alloca(g, child_type, instruction->name_hint,
                        get_ptr_align(g, ptr_type));
            }
(gdb) break codegen.cpp:7085
Breakpoint 1 at 0xb1ff0b: file /home/andy/dev/zig/src/codegen.cpp, line 7085.
(gdb) run
The program being debugged has been started already.
Start it from the beginning? (y or n) y
(gdb) p fn_table_entry.symbol_name 
$2 = {
  list = {
    items = 0x65b4ae0 "panic", 
    length = 6, 
    capacity = 8
  }
}
(gdb) continue
(gdb) p fn_table_entry.symbol_name 
$3 = {
  list = {
    items = 0x65bed30 "entry", 
    length = 6, 
    capacity = 8
  }
}

I skipped over the panic function, found the entry function. I see the 2 AllocaGen instructions, and I see that the debug_id matches expectations:

(gdb) p fn_table_entry.alloca_gen_list.items[0].name_hint 
$7 = 0x65b1560 "t"
(gdb) p fn_table_entry.alloca_gen_list.items[1].name_hint 
$8 = 0x65b15c0 "x"
(gdb) p fn_table_entry.alloca_gen_list.items[1].base.debug_id
$9 = 31

So now I step through to the 2nd iteration of this loop, and try to learn why the llvm_value assignment line never gets run. (This line)

                instruction->base.llvm_value = build_alloca(g, child_type, instruction->name_hint,
                        get_ptr_align(g, ptr_type));

In the first iteration of the for loop, I see that the llvm_value of t gets assigned. So far so good. Now for x:

7086                    IrInstructionAllocaGen *instruction = fn_table_entry->alloca_gen_list.at(alloca_i);
(gdb) next
7087                    ZigType *ptr_type = instruction->base.value.type;
(gdb) 
7088                    assert(ptr_type->id == ZigTypeIdPointer);
(gdb) 
7089                    ZigType *child_type = ptr_type->data.pointer.child_type;
(gdb) 
7090                    if (type_resolve(g, child_type, ResolveStatusSizeKnown))
(gdb) 
7092                    if (!type_has_bits(child_type))
(gdb) 
7094                    if (instruction->base.ref_count == 0)
(gdb) 
7095                        continue;
(gdb) p instruction.base.ref_count 
$10 = 0

OK now we're on a hot trail. It appears that the ref_count is incorrectly zero.

Looking back at the Zig IR dump from above, we can now see the ref_count column is indeed 0 for x while it is 2 for t:

...
    :6  | AllocaGen             | *bool       | 2 | Alloca(align=0,name=t)
...
    #32 | StorePtr              | void        | - | *#31 = Num.Two
    :31 | AllocaGen             | *Num        | 0 | Alloca(align=0,name=x)
...

That's simply incorrect. Let's make sure that the StorePtr instruction is referencing the pointer:

static IrInstructionStorePtr *ir_build_store_ptr(IrBuilder *irb, Scope *scope, AstNode *source_node,
        IrInstruction *ptr, IrInstruction *value)
{
    IrInstructionStorePtr *instruction = ir_build_instruction<IrInstructionStorePtr>(irb, scope, source_node);
    instruction->base.value.special = ConstValSpecialStatic;
    instruction->base.value.type = irb->codegen->builtin_types.entry_void;
    instruction->ptr = ptr;
    instruction->value = value;

    ir_ref_instruction(ptr, irb->current_basic_block);
    ir_ref_instruction(value, irb->current_basic_block);

    return instruction;
}

Hmm. That looks fine. It is in fact referencing ptr. Let's find out if that value is getting set back to zero incorrectly. We'll set up a memory watch point on the field. So now the trick is to put a breakpoint somewhere when ir_build_store_ptr gets called for our case. This function is called frequently, so a naive breakpoint wouldn't be useful. Let's gamble on the debug_id being somewhat unique:

(gdb) break ir.cpp:1583 if instruction->base.debug_id == 32
Breakpoint 2 at 0xb34eb6: file /home/andy/dev/zig/src/ir.cpp, line 1583.
(gdb) run
The program being debugged has been started already.
Start it from the beginning? (y or n) y
Breakpoint 2, ir_build_store_ptr (irb=0x65c5a18, scope=0x65c4bc0, source_node=0x65b3660, 
    ptr=0x65c9280, value=0x65c8ed0) at /home/andy/dev/zig/src/ir.cpp:1583
1583        ir_ref_instruction(value, irb->current_basic_block);
(gdb) 

The breakpoint triggered. Let's make sure we are where we expect to be:

(gdb) p instruction->base.source_node->src()
/home/andy/dev/zig/build/test.zig:8:25

That's good, this is the breakpoint we wanted. So now, we make sure the ref count is nonzero, and then tell gdb to notify us if it changes:

(gdb) p ptr.ref_count 
$14 = 1
(gdb) p &ptr->ref_count
$15 = (size_t *) 0x65c92f0
(gdb) watch *0x65c92f0
Hardware watchpoint 3: *0x65c92f0
(gdb) continue
Continuing.
analyze #28
append new bb Then_21
resume (3,0) EndIf_20 #30
analyze #30
analyze #31

Hardware watchpoint 3: *0x65c92f0

Old value = 1
New value = 0
ir_resolve_result_raw (ira=0x65c59f0, suspend_source_instr=0x65c53d0, result_loc=0x65c4280, 
    value_type=0x65c79a0, value=0x65c8ed0, force_runtime=false, non_null_comptime=false)
    at /home/andy/dev/zig/src/ir.cpp:14982
14982                   alloca_src->base.child = alloca_gen;
(gdb) 

Aha, we found a culprit. When resolving the result location, the logic looks like this:

            ResultLocVar *result_loc_var = reinterpret_cast<ResultLocVar *>(result_loc);
            assert(result_loc->source_instruction->id == IrInstructionIdAllocaSrc);

            if (value_type->id == ZigTypeIdUnreachable || value_type->id == ZigTypeIdOpaque) {
                ir_add_error(ira, result_loc->source_instruction,
                    buf_sprintf("variable of type '%s' not allowed", buf_ptr(&value_type->name)));
                return ira->codegen->invalid_instruction;
            }

            IrInstructionAllocaSrc *alloca_src =
                reinterpret_cast<IrInstructionAllocaSrc *>(result_loc->source_instruction);
            bool force_comptime;
            if (!ir_resolve_comptime(ira, alloca_src->is_comptime->child, &force_comptime))
                return ira->codegen->invalid_instruction;
            bool is_comptime = force_comptime || (value != nullptr &&
                    value->value.special != ConstValSpecialRuntime && result_loc_var->var->gen_is_const);

            if (alloca_src->base.child == nullptr || is_comptime) {
                uint32_t align = 0;
                if (alloca_src->align != nullptr && !ir_resolve_align(ira, alloca_src->align->child, nullptr, &align)) {
                    return ira->codegen->invalid_instruction;
                }
                IrInstruction *alloca_gen;
                if (is_comptime && value != nullptr) {
                    if (align > value->value.global_refs->align) {
                        value->value.global_refs->align = align;
                    }
                    alloca_gen = ir_get_ref(ira, result_loc->source_instruction, value, true, false);
                } else {
                    alloca_gen = ir_analyze_alloca(ira, result_loc->source_instruction, value_type, align,
                            alloca_src->name_hint, force_comptime);
                }
                if (alloca_src->base.child != nullptr) {
                    alloca_src->base.child->ref_count = 0;
                }
                alloca_src->base.child = alloca_gen;
            }
            result_loc->written = true;
            result_loc->resolved_loc = is_comptime ? nullptr : alloca_src->base.child;
            return result_loc->resolved_loc;

The line alloca_src->base.child->ref_count = 0; was hit. So now the question becomes: why does analysis think that the ref_count should be set to zero? It looks like analysis here incorrectly thinks this is a constant, even though the if condition is a runtime value (because it's var t):

(gdb) p is_comptime 
$27 = true

This logic looks suspect:

            bool is_comptime = force_comptime || (value != nullptr &&
                    value->value.special != ConstValSpecialRuntime && result_loc_var->var->gen_is_const);

Now the task is to figure out how this logic can be improved to detect that the condition is runtime, not comptime.

That's what I've got so far, will make a second post with further progress.

Upon further reflection, it occurred to me that the is_comptime is actually correct! If you have foo orelse unreachable then it's comptime-known that the answer is foo. Indeed, if we add a comptime-conditional compile error here, you can see that, at least at comptime, zig actually does figure out x at comptime:

export fn entry() void {
    var t = true;
    const x = if (t) Num.Two else unreachable;
    if (x != .Two) @compileError("bad"); // never triggers, zig gets to the assert in codegen
}

So actually the only problem here is that the StorePtr instruction is bogus, and should be elided. The question becomes, how can we know to skip generating code for the store pointer instruction in this case, but only this case?

Well if the ref_count gets intentionally set to zero, that should be the information we want. So in the StorePtr instruction we can check if the pointer has ref_count = 0. In this case we omit the store.

--- a/src/codegen.cpp
+++ b/src/codegen.cpp
@@ -3522,6 +3522,15 @@ static LLVMValueRef ir_render_store_ptr(CodeGen *g, IrExecutable *executable, Ir
     assert(ptr_type->id == ZigTypeIdPointer);
     if (!type_has_bits(ptr_type))
         return nullptr;
+    if (instruction->ptr->ref_count == 0) {
+        // In this case, this StorePtr instruction should be elided. Something happened like this:
+        //     var t = true;
+        //     const x = if (t) Num.Two else unreachable;
+        // The if condition is a runtime value, so the StorePtr for `x = Num.Two` got generated
+        // (this instruction being rendered) but because of `else unreachable` the result ended
+        // up being a comptime const value.
+        return nullptr;
+    }

     bool have_init_expr = !value_is_all_undef(&instruction->value->value);
     if (have_init_expr) {

With this patch, the code example works. The next step is to add a behavioral test to cover the case. Thanks to our super minimal test case, this is fairly straightforward:

test "const result loc, runtime if cond, else unreachable" {
    const Num = enum {
        One,
        Two,
    };

    var t = true;
    const x = if (t) Num.Two else unreachable;
    if (x != .Two) @compileError("bad");
}

With the patch:

$ ./zig test test.zig
1/1 test "const result loc, runtime if cond, else unreachable"...OK
All tests passed.

Add that sucker to test/stage1/behavior/if.zig. Run the full test suite, minus the slow release builds:

$ ./zig build test -Dskip-release
...snip...

10 minutes later, all good. The patch is ready.

Thanks @andrewrk. That's a brilliant write-up and I learnt a lot about what you can do with zig and gdb to help debug issues. I especially like the --verbose-ir tip and watching memory changes in gdb.

For the gdb 'noobies' like me, I would like to add to this that it can be really handy to use the built-in debugging features of VS Code. Installing the C++ extension allows VS Code to use gdb and it is really handy to debug the Zig source code like this. For me this gave a much better overview, because you can see the actual source code while also following the control flow.

Was this page helpful?
0 / 5 - 0 ratings

Related issues

jayschwa picture jayschwa  ·  3 Comments

daurnimator picture daurnimator  ·  3 Comments

dobkeratops picture dobkeratops  ·  3 Comments

andrewrk picture andrewrk  ·  3 Comments

jorangreef picture jorangreef  ·  3 Comments