TL;DR
在含有多重继承的类设计中,受内存布局影响,对象的 this 指针存在由编译器隐式完成的偏移修正行为
现象如下:
- 首个 base class 并不需要修正,因为其内存布局与 derived class 是重叠共用的,无论有无 vptr 的介入
- 其它 base class 由编译器修正 this 指针,需要付出轻微的运行时成本(多一条加法指令)
- 如果 derived class 隐藏了 base class 的函数,那调用时无需修正(因为调用的是 derived class 实现的函数)
- 对于非首个且具有 virtual 函数的 base class,通过该 base class 的指针调用被 derived class 重写的 virtual 函数时,this 指针将通过 non-virtual thunk 进行修正,当然这需要付出更多的额外运行时成本
累了,不想水文章,既然解释复杂就看代码吧
case1
假设存在多重继承,代码如下:
// Case 1: plain (non-virtual) multiple inheritance.
// A is listed first in C's base list; B is listed second.
struct A {
int a;
void func_a() {}
};
struct B {
int b;
void func_b() {}
};
// C derives from A and then B, adding one data member and one method of its own.
struct C: A, B {
int c;
void func_c() {}
};
int main() {
C c;
// In the disassembly for this case, the address of c is passed as `this` unchanged.
c.func_a();
// In the disassembly for this case, 4 is added to the address of c before it is
// passed as `this` to B::func_b.
c.func_b();
// In the disassembly for this case, the address of c is passed as `this` unchanged.
c.func_c();
}
0000000000401106 <main>:
401106: 55 push %rbp
401107: 48 89 e5 mov %rsp,%rbp
40110a: 48 83 ec 10 sub $0x10,%rsp
40110e: 48 8d 45 f4 lea -0xc(%rbp),%rax
this 等于 %rbp-0xc,即对象 c 在栈上的起始地址
401112: 48 89 c7 mov %rax,%rdi
401115: e8 24 00 00 00 callq 40113e <_ZN1A6func_aEv>
40111a: 48 8d 45 f4 lea -0xc(%rbp),%rax
40111e: 48 83 c0 04 add $0x4,%rax
在这里this+4然后调用func_b
401122: 48 89 c7 mov %rax,%rdi
401125: e8 20 00 00 00 callq 40114a <_ZN1B6func_bEv>
B::func_b 作为 B 的成员函数,接受的 this 是 B* 类型,因此需要调整
40112a: 48 8d 45 f4 lea -0xc(%rbp),%rax
40112e: 48 89 c7 mov %rax,%rdi
401131: e8 20 00 00 00 callq 401156 <_ZN1C6func_cEv>
401136: b8 00 00 00 00 mov $0x0,%eax
40113b: c9 leaveq
40113c: c3 retq
40113d: 90 nop
case2
// Case 2: same hierarchy as a plain multiple-inheritance setup, but C declares
// its own func_b with the same name as B::func_b.
struct A {
int a;
void func_a() {}
};
struct B {
int b;
void func_b() {}
};
struct C: A, B {
int c;
// C::func_b hides B::func_b: a call to func_b on a C object resolves to this one.
void func_b() {}
void func_c() {}
};
int main() {
C c;
// In the disassembly for this case, the address of c is passed as `this` unchanged.
c.func_a();
// Calls C::func_b (not B::func_b); the disassembly for this case shows no
// offset being added to `this`.
c.func_b();
// In the disassembly for this case, the address of c is passed as `this` unchanged.
c.func_c();
}
0000000000401106 <main>:
401106: 55 push %rbp
401107: 48 89 e5 mov %rsp,%rbp
40110a: 48 83 ec 10 sub $0x10,%rsp
40110e: 48 8d 45 f4 lea -0xc(%rbp),%rax
401112: 48 89 c7 mov %rax,%rdi
401115: e8 20 00 00 00 callq 40113a <_ZN1A6func_aEv>
40111a: 48 8d 45 f4 lea -0xc(%rbp),%rax
40111e: 48 89 c7 mov %rax,%rdi
401121: e8 20 00 00 00 callq 401146 <_ZN1C6func_bEv>
this并不需要偏移,符合直觉
401126: 48 8d 45 f4 lea -0xc(%rbp),%rax
40112a: 48 89 c7 mov %rax,%rdi
40112d: e8 20 00 00 00 callq 401152 <_ZN1C6func_cEv>
401132: b8 00 00 00 00 mov $0x0,%eax
401137: c9 leaveq
401138: c3 retq
401139: 90 nop
case3
// Case 3: multiple inheritance where both bases declare virtual functions.
struct A {
int a;
virtual void func_a() {}
};
struct B {
int b;
virtual void func_b() {}
};
// C overrides B::func_b and introduces a new virtual function func_c.
struct C: A, B {
int c;
void func_b() override {}
virtual void func_c() {}
};
int main() {
C *c = new C();
// Converts C* to B*. In the disassembly for this case the conversion is
// null-checked, and 0x10 is added to the pointer when it is non-null.
B *b = c;
// The next three calls go through the vtable pointer read from *c, and c itself
// is passed as `this` without adjustment (see the disassembly for this case).
c->func_a();
c->func_b();
c->func_c();
// Goes through the vtable pointer read from *b. Per the disassembly, the slot holds
// the non-virtual thunk to C::func_b, which subtracts 0x10 from `this` and then
// jumps to C::func_b.
b->func_b();
}
注:此时的 vtable for C 布局如下:
vtable for C:
.quad 0
.quad typeinfo for C
.quad A::func_a()
.quad C::func_b()
.quad C::func_c()
.quad -16
.quad typeinfo for C
.quad non-virtual thunk to C::func_b()
dump 解释如下
0000000000401126 <main>:
401126: 55 push %rbp
401127: 48 89 e5 mov %rsp,%rbp
40112a: 53 push %rbx
40112b: 48 83 ec 18 sub $0x18,%rsp
40112f: bf 20 00 00 00 mov $0x20,%edi
401134: e8 f7 fe ff ff callq 401030 <_Znwm@plt>
执行operator new
401139: 48 89 c3 mov %rax,%rbx
40113c: 48 c7 03 00 00 00 00 movq $0x0,(%rbx)
401143: c7 43 08 00 00 00 00 movl $0x0,0x8(%rbx)
40114a: 48 c7 43 10 00 00 00 movq $0x0,0x10(%rbx)
401151: 00
401152: c7 43 18 00 00 00 00 movl $0x0,0x18(%rbx)
401159: c7 43 1c 00 00 00 00 movl $0x0,0x1c(%rbx)
401160: 48 89 df mov %rbx,%rdi
401163: e8 e4 00 00 00 callq 40124c <_ZN1CC1Ev>
执行C::C()
401168: 48 89 5d e8 mov %rbx,-0x18(%rbp)
this_c位于%rbp-0x18
40116c: 48 83 7d e8 00 cmpq $0x0,-0x18(%rbp)
401171: 74 0a je 40117d <main+0x57>
401173: 48 8b 45 e8 mov -0x18(%rbp),%rax
401177: 48 83 c0 10 add $0x10,%rax
this_b相对this_c做了偏移$0x10修正
在gdb中运行时信息如下:
$ p c
> (C *) 0x416eb0
$ p b
> (B *) 0x416ec0
40117b: eb 05 jmp 401182 <main+0x5c>
40117d: b8 00 00 00 00 mov $0x0,%eax
401182: 48 89 45 e0 mov %rax,-0x20(%rbp)
this_b位于%rbp-0x20
401186: 48 8b 45 e8 mov -0x18(%rbp),%rax
获取this_c
40118a: 48 8b 00 mov (%rax),%rax
获取vptr,即vtable for C+16
40118d: 48 8b 10 mov (%rax),%rdx
获取func_a in vtable
401190: 48 8b 45 e8 mov -0x18(%rbp),%rax
401194: 48 89 c7 mov %rax,%rdi
传入this指针
401197: ff d2 callq *%rdx
调用func_a
401199: 48 8b 45 e8 mov -0x18(%rbp),%rax
40119d: 48 8b 00 mov (%rax),%rax
获取vptr
4011a0: 48 83 c0 08 add $0x8,%rax
使得访问vtable偏移+0x8(以func_a或者说vtable for C+16为基准)
4011a4: 48 8b 10 mov (%rax),%rdx
4011a7: 48 8b 45 e8 mov -0x18(%rbp),%rax
4011ab: 48 89 c7 mov %rax,%rdi
4011ae: ff d2 callq *%rdx
调用func_b
4011b0: 48 8b 45 e8 mov -0x18(%rbp),%rax
4011b4: 48 8b 00 mov (%rax),%rax
4011b7: 48 83 c0 10 add $0x10,%rax
4011bb: 48 8b 10 mov (%rax),%rdx
4011be: 48 8b 45 e8 mov -0x18(%rbp),%rax
4011c2: 48 89 c7 mov %rax,%rdi
4011c5: ff d2 callq *%rdx
调用func_c
4011c7: 48 8b 45 e0 mov -0x20(%rbp),%rax
获取this_b
4011cb: 48 8b 00 mov (%rax),%rax
获取vptr,即vtable for C+56
4011ce: 48 8b 10 mov (%rax),%rdx
获取non-virtual thunk to C::func_b()
4011d1: 48 8b 45 e0 mov -0x20(%rbp),%rax
4011d5: 48 89 c7 mov %rax,%rdi
传入B*类型的this_b
4011d8: ff d2 callq *%rdx
调用non-virtual thunk to C::func_b(),即_ZThn16_N1C6func_bEv
4011da: b8 00 00 00 00 mov $0x0,%eax
4011df: 48 8b 5d f8 mov -0x8(%rbp),%rbx
4011e3: c9 leaveq
4011e4: c3 retq
4011e5: 90 nop
0000000000401209 <_ZThn16_N1C6func_bEv>:
401209: 48 83 ef 10 sub $0x10,%rdi
修正this指针,即从B*到C*
40120d: eb ef jmp 4011fe <_ZN1C6func_bEv>
再传入到重写的C::func_b()
40120f: 90 nop
更新:benchmark
最近写了点 perf
测试,这里展示了 virtual call
容易受 cacheline 影响的案例