前言

上文使用了 packetdrill 来观测 TCP backlog 行为,还提供了两次握手特例、优化的 Nagle 实现、TCP 40ms 魔数的复现程序。本文继续使用 packetdrill 观测 TCP 的有状态连接管理,并对照 RFC793 / RFC9293 的 TCP 状态转移图来完成测试用例。测试结果基于 Linux 6.4.8。

基本上是一边看 RFC 一边整理的资料,内容比较零散,看看就好。如有错误还请大佬指出。

RFC9293

2022 年发布的 RFC9293 取代了经典的 RFC793,状态转移图也略有差异:

                             +---------+ ---------\      active OPEN
                             |  CLOSED |            \    -----------
                             +---------+<---------\   \   create TCB
                               |     ^              \   \  snd SYN
                  passive OPEN |     |   CLOSE        \   \
                  ------------ |     | ----------       \   \
                   create TCB  |     | delete TCB         \   \
                               V     |                      \   \
+          rcv RST (note 1)  +---------+            CLOSE    |    \
        -------------------->|  LISTEN |          ---------- |     |
       /                     +---------+          delete TCB |     |
      /           rcv SYN      |     |     SEND              |     |
     /           -----------   |     |    -------            |     V
 +--------+      snd SYN,ACK  /       \   snd SYN          +--------+
 |        |<-----------------           ------------------>|        |
 |  SYN   |                    rcv SYN                     |  SYN   |
 |  RCVD  |<-----------------------------------------------|  SENT  |
-|        |                    snd ACK                     |        |
+|        |                  snd SYN,ACK                   |        |
 |        |------------------           -------------------|        |
 +--------+   rcv ACK of SYN  \       /  rcv SYN,ACK       +--------+
    |         --------------   |     |   -----------
    |                x         |     |     snd ACK
    |                          V     V
    |  CLOSE                 +---------+
    | -------                |  ESTAB  |
    | snd FIN                +---------+
    |                 CLOSE    |     |    rcv FIN
    V                -------   |     |    -------
 +---------+         snd FIN  /       \   snd ACK         +---------+
 |  FIN    |<----------------          ------------------>|  CLOSE  |
 | WAIT-1  |------------------                            |   WAIT  |
 +---------+          rcv FIN  \                          +---------+
   | rcv ACK of FIN   -------   |                          CLOSE  |
   | --------------   snd ACK   |                         ------- |
   V        x                   V                         snd FIN V
 +---------+               +---------+                    +---------+
 |FINWAIT-2|               | CLOSING |                    | LAST-ACK|
 +---------+               +---------+                    +---------+
   |              rcv ACK of FIN |                 rcv ACK of FIN |
   |  rcv FIN     -------------- |    Timeout=2MSL -------------- |
   |  -------            x       V    ------------        x       V
    \ snd ACK              +---------+delete TCB          +---------+
      -------------------->|TIME-WAIT|------------------->| CLOSED  |
                           +---------+                    +---------+

状态转移图相对于 RFC793 版本的差异有两点:

  • SYN_RCVD 状态接收到 RST 报文后回退到 LISTEN 状态。
  • 同时打开时,SYN_SENT 状态收到 SYN 后发送 SYNACK 而不是 ACK。

报文生成器

这么复杂的状态机,不可能每一个样例都要手写,因此需要一个简单的 packetdrill 文件生成器。

思路也很简单,用基础图论的做法,把每个 TCP 状态视为顶点,新的状态只需在旧的状态下增加任意一条转移路径就好了。这里转移路径对应新增的报文。

代码部分

程序使用 C++20 实现:

#include <bits/stdc++.h>

/// Bookkeeping shared by every TCP-state vertex of the generator.
///
/// Carries the window value and the two relative sequence counters that the
/// packet builders read and update, plus the stream the packetdrill script is
/// written to. On destruction it appends a final `sleep` line to the script.
struct State_base {
    int win {1000};  // window value written into injected ("<") packets
    int out_seq {0}; // > direction: used as the ack number of injected packets
                     //   and as the start sequence of expected write() data
    int in_seq  {0}; // < direction: next sequence number for injected packets
    std::ostream &os;
    State_base(std::ostream &os): os(os) {}

    // The destructor has a visible side effect (it writes the trailer), so a
    // copy would emit that trailer a second time. Forbid copying outright.
    State_base(const State_base &) = delete;
    State_base &operator=(const State_base &) = delete;

    ~State_base() {os << "+.0 `sleep 1000000` \n";}
};

// Declared up front to avoid spaghetti code inside the constructors: From's
// constructor calls add_edge(), and the concrete overloads for each pair of
// states are defined further down in the file.
void add_edge(auto &from, auto &to);

/// A state that is reached by taking one more edge out of `Vertex`.
///
/// Constructing a From<Vertex> first constructs `Vertex` (which emits the
/// script for the whole path leading to it) and then calls add_edge() with the
/// object viewed as its predecessor type and as itself.
/// The defaulted lambda parameter is there so that two aliases naming the same
/// `Vertex` are meant to be distinct types.
template <std::derived_from<State_base> Vertex, auto UNIQUE_using = [](){}>
struct From: Vertex {
    From(std::ostream &os): Vertex(os) {
        Vertex &predecessor = *this;
        add_edge(predecessor, *this);
    }
};

// Passive-open chain: each alias derives from the previous state, so building
// e.g. TW replays LISTEN -> SYN_RCVD -> ESTAB -> FW1 -> FW2 -> TW.
using LISTEN   = From<State_base>;
using SYN_RCVD = From<LISTEN>;
using ESTAB    = From<SYN_RCVD>;
using FW1      = From<ESTAB>;
using FW2      = From<FW1>;
using TW       = From<FW2>;

// SYN_SENT and LISTEN are not the same type (even though both are
// From<State_base>); this starts the separate active-open chain.
using SYN_SENT = From<State_base>;
using CW       = From<SYN_SENT>;
using LA       = From<CW>;

/// First edge of the passive-open chain: writes the packetdrill preamble that
/// sets the bind port, creates socket 3, enables SO_REUSEADDR, binds and
/// listens.
void add_edge(State_base &, LISTEN &state) {
    std::ostream &script = state.os;
    script << "--bind_port=8848\n";
    script << "  0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3\n";
    script << "+.0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0\n";
    script << "+.0 bind(3, ..., ...) = 0\n";
    script << "+.0 listen(3, 128) = 0\n";
}

/// LISTEN -> SYN_RCVD: injects a SYN carrying the configured window and
/// expects the SYN,ACK reply. The injected SYN advances in_seq by one.
void add_edge(LISTEN &, SYN_RCVD &state) {
    const auto lines = std::format(
        "+.1 < S 0:0(0) win {0}\n"
        "+.0 > S. 0:0(0) ack 1 <...>\n",
        state.win);
    state.os << lines;
    state.in_seq += 1;
}

// Note: this edge is written for a passively opened connection.
/// SYN_RCVD -> ESTAB: injects the ACK that completes the handshake and then
/// accepts the connection as fd 4.
void add_edge(SYN_RCVD &, ESTAB &state) {
    const auto lines = std::format(
        "+.1 < . 1:1(0) ack 1 win {}\n"
        "+.0 accept(3, ..., ...) = 4\n",
        state.win);
    state.os << lines;
    state.out_seq += 1; // the S. packet has now been acked
}

/// ESTAB -> FW1: closes the accepted socket (fd 4) one second later.
void add_edge(ESTAB &, FW1 &state) {
    // shutdown() would also reach FW1, and a following ACK of FIN would reach
    // FW2 as well, but from there a further FIN could not lead to TW;
    // with close() it can.
    state.os << "+1  close(4) = 0\n";
}

/// FW1 -> FW2: injects an ACK with ack number 2, i.e. the ACK of our FIN.
void add_edge(FW1 &, FW2 &state) {
    const auto line = std::format(
        "+.1 < . 1:1(0) ack 2 win {}\n",
        state.win);
    state.os << line;
    state.out_seq += 1; // the F. packet has now been acked
}

/// FW2 -> TW: injects the peer's FIN; that FIN advances in_seq by one.
void add_edge(FW2 &, TW &state) {
    const auto line = std::format(
        "+.1 < F. 1:1(0) win {}\n",
        state.win);
    state.os << line;
    state.in_seq += 1;
}

/// First edge of the active-open chain: writes the preamble, performs a
/// non-blocking connect() on socket 3 and expects the outgoing SYN.
void add_edge(State_base &, SYN_SENT &state) {
    // Port 8848 acts as the client here, so it has to be bound explicitly.
    std::ostream &script = state.os;
    script << "--bind_port=8848\n";
    script << "--connect_port=38848\n";
    script << "--tolerance_usecs=10000\n";
    script << "  0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3\n";
    script << "+.0 bind(3, ..., ...) = 0\n";
    script << "+.0 fcntl(3, F_SETFL, O_RDWR | O_NONBLOCK) = 0\n";
    script << "+.1 connect(3, ..., ...) = -1\n";
    script << "+.0 fcntl(3, F_SETFL, O_RDWR) = 0\n";
    script << "+.0 > S  0:0(0) <...>\n";
    state.out_seq += 1;
}

/// SYN_SENT -> CW: injects the SYN,ACK (expecting our ACK), then injects the
/// peer's FIN (expecting its ACK). SYN and FIN together advance in_seq by two.
void add_edge(SYN_SENT &, CW &state) {
    const auto lines = std::format(
        "+.1 < S. 0:0(0) ack 1 win {0}\n"
        "+.0 > . 1:1(0) ack 1\n"

        "+.0 < F. 1:1(0) win {0}\n"
        "+.0 > . 1:1(0) ack 2\n",
        state.win);
    state.os << lines;
    state.in_seq += 2;
}

/// CW -> LA: shuts down the write side of socket 3 and expects our FIN.
void add_edge(CW &, LA &state) {
    std::ostream &script = state.os;
    script << "+.1 shutdown(3, SHUT_WR) = 0\n";
    script << "+.0 > F. 1:1(0) ack 2\n";
    state.out_seq += 1;
}

//////////////////////////////////////////////////////////////////

/// Compile-time description of one injected ("<") packet.
///
/// Used as a non-type template argument of make_single_packet(); the default
/// instance describes a bare ACK that takes its numbers from the state.
constexpr struct packet {
    char flag {' '};          // packetdrill flag letter, ' ' for none
    int  len {0};             // payload length in bytes
    // Whether the packet occupies one sequence number even without payload.
    // Only SYN and FIN do; RST and PSH carry no sequence space of their own.
    bool logical_seq {flag == 'S' || flag == 'F'};
    bool ack {true};          // append the '.' flag and an "ack N" clause
    bool random_ack {false};  // use a fixed arbitrary ack base instead of out_seq
    int  addend_ack {0};      // offset added to the ack base
    int  seq {-1};            // explicit start sequence; negative means use in_seq
} default_ack_packet;

/// Appends one injected ("<") packet line described by `option` to the script.
///
/// The start sequence is `option.seq` when that is non-negative, otherwise the
/// state's in_seq. In the latter case in_seq is advanced afterwards by the
/// payload length, or by one for a packet that only occupies a logical
/// sequence number.
template <packet option = default_ack_packet>
void make_single_packet(auto &state) {
    // Optional "ack N" clause; stays empty when the packet carries no ACK.
    std::string ack_clause;
    if(option.ack) {
        const int ack_base = option.random_ack ? 19260817 : state.out_seq;
        ack_clause = std::format("ack {}", ack_base + option.addend_ack);
    }
    const char ack_mark = option.ack ? '.' : ' ';

    const bool tracks_state = option.seq < 0;
    const int first = tracks_state ? state.in_seq : option.seq;
    const int last = first + option.len;

    state.os << std::format(
        "+.1 < {}{} {}:{}({}) {} win {}\n",
        option.flag, ack_mark,
        first, last, option.len,
        ack_clause, state.win);

    const bool consumes_seq = option.len != 0 || option.logical_seq;
    if(tracks_state && consumes_seq) {
        state.in_seq += std::max(option.len, 1);
    }
}

/// Compile-time description of one local write() and the data packet expected
/// to leave as a result; `len` is the number of bytes written.
constexpr struct write_packet {
    int len {10};
} default_write_packet;

/// Appends a write() on fd 4 plus the expected outgoing data packet.
///
/// The expected packet starts at the state's out_seq, spans
/// `write_option.len` bytes and acknowledges the state's in_seq. Neither
/// counter is modified.
template <write_packet write_option>
void make_single_packet(auto &state) {
    constexpr int payload = write_option.len;
    const int first = state.out_seq;
    const int last = first + payload;

    state.os << std::format(
        "+.1 write(4, ..., {0}) = {0}\n"
        "+.0 > P. {1}:{2}({0}) ack {3} <...>\n",
        payload, first, last, state.in_seq);
}

/// Appends one packet per entry of `options`, left to right, by dispatching
/// each entry to the matching make_single_packet() overload. With an empty
/// pack nothing is appended.
template <auto ...options>
void make_packet(auto &state) {
    (make_single_packet<options>(state), ...);
}

//////////////////////////////////////////////////////////////////

// The rfc9293 directory holds the unmodified scripts; each file name (see
// `states`) is the state the connection ends up in.
// Every other directory starts from the rfc9293 script and appends extra
// packets built by the make_* functions.
//
// Maps an output directory name to the function that appends the extra
// packets for a state of type T.
template <typename T, typename Ptr = void(*)(T&)>
auto dir2func = std::unordered_map<std::string_view, Ptr> {
    {"rfc9293",     make_packet},
    {"ack",         make_packet<default_ack_packet>},
    {"syn",         make_packet<packet{.flag='S', .ack=false}>},
    {"fin",         make_packet<packet{.flag='F', .ack=false}>},
    {"rst",         make_packet<packet{.flag='R', .ack=false}>},
    {"synack",      make_packet<packet{.flag='S'}>},
    {"finack",      make_packet<packet{.flag='F'}>},
    {"rstack",      make_packet<packet{.flag='R'}>},
    {"data",        make_packet<packet{.flag='P', .len=10}>},
    {"data_random", make_packet<packet{.flag='P', .len=10, .random_ack=true}>},
    {"older_ack",   make_packet<packet{.addend_ack=-1}>},
    {"newer_ack",   make_packet<packet{.addend_ack=+1}>},
    {"write",       make_packet<default_write_packet>},
};

// File names and state types, index-aligned: main() pairs states[I] with
// std::tuple_element_t<I, State_types>, so both lists must stay in the same
// order and have the same length.
std::string_view states[]{"LISTEN", "SYN_RCVD", "SYN_SENT", "ESTAB", "FW1", "FW2", "TW", "CW", "LA"};
using State_types =
               std::tuple< LISTEN,   SYN_RCVD,   SYN_SENT,   ESTAB,   FW1,   FW2,   TW,   CW,   LA >;

//////////////////////////////////////////////////////////////////

using dentry_base = std::string;

/// A path fragment stored as a std::string, composable with operator/.
///
/// The `pkt` member is a proxy: converting it to a dentry yields this
/// dentry's text with ".pkt" appended, evaluated at conversion time.
struct dentry: dentry_base {
    /// Proxy behind `pkt`. It offers exactly one conversion (to a dentry
    /// prvalue). Deriving it from std::reference_wrapper<dentry> would also
    /// expose `operator dentry&()`, which reference initialization prefers
    /// when binding `const dentry&` and which silently loses the suffix.
    struct lazy_evaluation {
        const dentry *owner;
        operator dentry() const { return *owner + ".pkt"; }
    } pkt {this};

    dentry(std::string storage): dentry_base(std::move(storage)) {}
    dentry(std::string_view sv): dentry_base(sv) {}

    // Copy/move transfer only the string payload; `pkt` is left to its default
    // initializer so it always refers to the object it lives in. The explicit
    // casts select std::string's own copy/move operations.
    dentry(const dentry &other)
        : dentry_base(static_cast<const dentry_base &>(other)) {}
    dentry(dentry &&other) noexcept
        : dentry_base(static_cast<dentry_base &&>(other)) {}
    dentry &operator=(const dentry &other) {
        dentry_base::operator=(static_cast<const dentry_base &>(other));
        return *this;
    }
    dentry &operator=(dentry &&other) noexcept {
        dentry_base::operator=(static_cast<dentry_base &&>(other));
        return *this;
    }
};

/// Joins `t` and `d` with a single "/" in between.
template <typename T>
dentry operator/(T &&t, const dentry &d) {
    return {t + dentry_base("/") + d};
}

int main() {
    []<std::size_t ...Is>(std::index_sequence<Is...>) {
        ([&]<std::size_t I> {
            using namespace std::literals;
            using State = std::tuple_element_t<I, State_types>;
            for(auto &&[dir, func] : dir2func<State>) {
                std::filesystem::create_directories(dir);
                dentry root = "."sv;
                dentry name = states[I];
                dentry path = root/dir/name.pkt;
                std::ofstream filestream(path);
                State state(filestream);
                func(state);
            }
        }.template operator()<Is>(), ...);
    }(std::make_index_sequence<std::size(states)>{});
}

这是单文件程序,直接拿去用就行了。代码也存放在 Github,里面还多一个没啥用的构建文件。

使用说明

编译并执行程序就能生成基于 RFC9293 的状态转移图(./rfc9293 目录),每一个 packetdrill 文件都对应着最终生成的状态。比方说,SYN_RCVD.pkt 文件,表示当前的连接会最终停留在 SYN_RCVD 状态。

同时还为每一个状态添加了继续伪造报文(make_)的功能,因此可以观察处于某个状态的 TCP 连接收到不同类型的报文后会有怎样的行为。比方说,位于 ./rst/SYN_RCVD.pkt 的文件,表示在连接到达 SYN_RCVD 状态后,会接收到对端放出的一个 RST 报文(然后暂停,供用户观测协议栈行为)。

NOTES:

  • 程序是静态的,它按组合生成了 \(\text{states 状态数} \times \text{make 函数个数}\) 个 packetdrill 文件。
  • 目前的数目是 118 个 pkt 文件,要测到天荒地老了。
  • 偷懒了 CLOSING 没写,后面补上。
  • 生成的目录请参考代码中 dir2func 映射表,每一个状态的继承操作请参考 add_edge() 重载函数。这是一个图上加边的过程,状态完整构造所选用的路径请参考 From 类型模板。

NOTES 测试

RFC 文件也说了上面的转移图是不完整的(出于可读性考虑),还在图示下方补充了三条备注:

  • Note 1: The transition from SYN-RECEIVED to LISTEN on receiving a RST is conditional on having reached SYN-RECEIVED after a passive OPEN.
  • Note 2: The figure omits a transition from FIN-WAIT-1 to TIME-WAIT if a FIN is received and the local FIN is also acknowledged.
  • Note 3: A RST can be sent from any state with a corresponding transition to TIME-WAIT (see [70] for rationale). These transitions are not explicitly shown; otherwise, the diagram would become very difficult to read. Similarly, receipt of a RST from any state results in a transition to LISTEN or CLOSED, though this is also omitted from the diagram for legibility.

NOTE1

NOTE1 表明了从 SYN_RCVD 回退到 LISTEN 是仅被动打开才会发生的,如果是途经 SYN_SENT(由于同时打开导致转换为 SYN_RCVD)就不能回退。

       TCP Peer A                                           TCP Peer B

   1.  CLOSED                                               LISTEN

   2.  SYN-SENT    --> <SEQ=100><CTL=SYN>               ...

   3.  (duplicate) ... <SEQ=90><CTL=SYN>                --> SYN-RECEIVED

   4.  SYN-SENT    <-- <SEQ=300><ACK=91><CTL=SYN,ACK>   <-- SYN-RECEIVED

   5.  SYN-SENT    --> <SEQ=91><CTL=RST>                --> LISTEN

   6.              ... <SEQ=100><CTL=SYN>               --> SYN-RECEIVED

   7.  ESTABLISHED <-- <SEQ=400><ACK=101><CTL=SYN,ACK>  <-- SYN-RECEIVED

   8.  ESTABLISHED --> <SEQ=101><ACK=401><CTL=ACK>      --> ESTABLISHED

                 Figure 8: Recovery from Old Duplicate SYN

原因在 Figure 8 处已经说明,这可以使得连接识别出旧且重复的 SYN(对应图中 SEQ=90)。

要测试这个状态转移需要再加一行代码:

template <typename T, typename Ptr = void(*)(T&)>
auto dir2func = std::unordered_map<std::string_view, Ptr> {
    {"rfc9293",     make_packet},
    {"ack",         make_packet<default_ack_packet>},
    {"syn",         make_packet<packet{.flag='S', .ack=false}>},
    {"fin",         make_packet<packet{.flag='F', .ack=false}>},
    {"rst",         make_packet<packet{.flag='R', .ack=false}>},
    {"synack",      make_packet<packet{.flag='S'}>},
    {"finack",      make_packet<packet{.flag='F'}>},
    {"rstack",      make_packet<packet{.flag='R'}>},
    {"data",        make_packet<packet{.flag='P', .len=10}>},
    {"data_random", make_packet<packet{.flag='P', .len=10, .random_ack=true}>},
    {"older_ack",   make_packet<packet{.addend_ack=-1}>},
    {"newer_ack",   make_packet<packet{.addend_ack=+1}>},
    {"write",       make_packet<default_write_packet>},
+   {"note1",       make_packet<packet{.flag='S', .ack=false}, packet{.flag='R', .ack=false}>},
};

被动打开测试,执行 ./rst/SYN_RCVD.pkt 文件并使用 tcpdump -i any -n port 8848 抓包:

18:57:02.786093 tun0  In  IP 192.0.2.1.45801 > 192.168.203.72.8848: Flags [S], seq 0, win 1000, length 0
18:57:02.786163 tun0  Out IP 192.168.203.72.8848 > 192.0.2.1.45801: Flags [S.], seq 1116725817, ack 1, win 64240, options [mss 1460], length 0
18:57:02.889316 tun0  In  IP 192.0.2.1.45801 > 192.168.203.72.8848: Flags [R], seq 1, win 1000, length 0

关联资料:Is it necessary for an RST packet to have an acknowledgement number?

使用 netstat -ant | grep 8848 查看端口信息,确实存在 LISTEN 状态:

tcp        0      0 192.168.203.72:8848     0.0.0.0:*               LISTEN

主动打开测试,执行 ./note1/SYN_SENT.pkt 文件,以同样的方式抓包:

18:58:18.999471 tun0  Out IP 192.168.5.249.8848 > 192.0.2.1.38848: Flags [S], seq 2521803630, win 64240, options [mss 1460,sackOK,TS val 4021042687 ecr 0,nop,wscale 7], length 0
18:58:19.101643 tun0  In  IP 192.0.2.1.38848 > 192.168.5.249.8848: Flags [S], seq 0, win 1000, length 0
18:58:19.101670 tun0  Out IP 192.168.5.249.8848 > 192.0.2.1.38848: Flags [S.], seq 2521803630, ack 1, win 64240, options [mss 1460,sackOK,TS val 4021042790 ecr 0,nop,wscale 7], length 0
18:58:19.201425 tun0  In  IP 192.0.2.1.38848 > 192.168.5.249.8848: Flags [R], seq 1, win 1000, length 0

此时查看端口信息并没有关于 8848 端口的状态信息。

NOTE2

NOTE2 提到 FINWAIT-1 状态可直接跳转到 TIME_WAIT 状态,条件是自身发出 FIN 后,紧接着又收到 FIN(而不是 ack of FIN)。

直接使用 ./finack/FW1.pkt 文件(因为连接建立后的每一个报文都会携带 ack):

19:08:08.776444 tun0  In  IP 192.0.2.1.51811 > 192.168.42.52.8848: Flags [S], seq 0, win 1000, length 0
19:08:08.776501 tun0  Out IP 192.168.42.52.8848 > 192.0.2.1.51811: Flags [S.], seq 628615028, ack 1, win 64240, options [mss 1460], length 0
19:08:08.878744 tun0  In  IP 192.0.2.1.51811 > 192.168.42.52.8848: Flags [.], ack 1, win 1000, length 0
19:08:09.882515 tun0  Out IP 192.168.42.52.8848 > 192.0.2.1.51811: Flags [F.], seq 1, ack 1, win 64240, length 0
19:08:09.984640 tun0  In  IP 192.0.2.1.51811 > 192.168.42.52.8848: Flags [F.], seq 1, ack 2, win 1000, length 0
19:08:09.984698 tun0  Out IP 192.168.42.52.8848 > 192.0.2.1.51811: Flags [.], ack 2, win 64239, length 0

tcp        0      0 192.168.42.52:8848      192.0.2.1:51811         TIME_WAIT

其实这就是三次挥手特例,因为没有剩余数据传递,所以不需要 close wait 阶段,把原先四次挥手的第二、三次挥手都合并在一起发给对方了。

NOTE3

NOTE3 表明了每一状态均可发出 RST,并可以直接状态转移到 TIME_WAIT。(显然,没有建立连接就不需要 TIME_WAIT 了)。对端收到 RST 也可以转换为 LISTEN 或者 CLOSED。(回退到 LISTEN 在 NOTE1 小节就测过了。)

个人觉得可以这么做的原理应该和正常关闭相似,主动发出的本端既要保证已「闭嘴」的对端连接关闭,又要耗尽此前仍徘徊在网络中的报文,因此转移到 TIME_WAIT 也可能是合理的。但是我之前的印象是发出端也是直接关闭的(而不是 TIME_WAIT),即使 RST 丢失了也无所谓。总之测一下吧。

使用 ./rstack/ESTAB.pkt 测试被动接收 RST 报文。本端作为服务器将使得该连接处于 CLOSED,并且保留 LISTEN。和 NOTE1 一样就不贴结果了。

问题在于如何使得本端建立连接后主动发出 RST。一个简单的操作是 Ctrl+C 中断程序,当对端给出数据但是本端未被用户层读取而提前结束时,将发出 RST 报文(这符合 broken pipe 的定义)。

因此使用 ./data/ESTAB.pkt,在睡眠阶段 Ctrl+C:

19:56:20.343793 tun0  In  IP 192.0.2.1.41881 > 192.168.9.181.8848: Flags [S], seq 0, win 1000, length 0
19:56:20.343882 tun0  Out IP 192.168.9.181.8848 > 192.0.2.1.41881: Flags [S.], seq 2764299726, ack 1, win 64240, options [mss 1460], length 0
19:56:20.444131 tun0  In  IP 192.0.2.1.41881 > 192.168.9.181.8848: Flags [.], ack 1, win 1000, length 0
19:56:20.547897 tun0  In  IP 192.0.2.1.41881 > 192.168.9.181.8848: Flags [P.], seq 1:11, ack 1, win 1000, length 10
19:56:20.547932 tun0  Out IP 192.168.9.181.8848 > 192.0.2.1.41881: Flags [.], ack 11, win 64230, length 0
19:56:29.541596 ?     Out IP 192.168.9.181.8848 > 192.0.2.1.41881: Flags [R.], seq 1, ack 11, win 64230, length 0

虽然成功发出了 RST,但是并没有找到任何状态信息,因为进程也跟着挂了,有点尴尬……

无论是手动调用 close(),还是未携带数据时使用 lingerclose(),均无法复现¯\_(ツ)_/¯。


进一步查阅了文档。已同步的 TCP 状态其实并不会轻易发出 RST,一般是涉及网络层的 IP Security Compartment 问题(不了解这方面)才会发出。而一些无法理解的报文则是回复正常的 ACK。

If the connection is in a synchronized state (ESTABLISHED, FIN- WAIT-1, FIN-WAIT-2, CLOSE-WAIT, CLOSING, LAST-ACK, TIME-WAIT), any unacceptable segment (out-of-window sequence number or unacceptable acknowledgment number) must be responded to with an empty acknowledgment segment (without any user data) containing the current send sequence number and an acknowledgment indicating the next sequence number expected to be received, and the connection remains in the same state.

If an incoming segment has a security level or compartment that does not exactly match the level and compartment requested for the connection, a reset is sent and the connection goes to the CLOSED state. The reset takes its sequence number from the ACK field of the incoming segment.

3.5.2. Reset Generation

其实上面的 broken pipe 行为是否符合 RFC9293 也不好说,毕竟实现和标准总是有一点差距。

同时打开

           rcv RST (note 1)  +---------+            CLOSE    |    \
        -------------------->|  LISTEN |          ---------- |     |
       /                     +---------+          delete TCB |     |
      /           rcv SYN      |     |     SEND              |     |
     /           -----------   |     |    -------            |     V
 +--------+      snd SYN,ACK  /       \   snd SYN          +--------+
 |        |<-----------------           ------------------>|        |
 |  SYN   |                    rcv SYN                     |  SYN   |
 |  RCVD  |<-----------------------------------------------|  SENT  |
-|        |                    snd ACK                     |        |
+|        |                  snd SYN,ACK                   |        |
 |        |------------------           -------------------|        |
 +--------+   rcv ACK of SYN  \       /  rcv SYN,ACK       +--------+

TCP 存在同时打开特性,从图示可以得知,这个状态转移只能是从主动方(SYN_SENT)转换为被动方(SYN_RCVD),而不能反过来。

前面也说了状态转移图存在版本差异,这里测试 Linux 实现是否符合 RFC9293。

使用 ./syn/SYN_SENT.pkt 文件查看即可。

03:06:23.453201 tun0  Out IP 192.168.151.78.8848 > 192.0.2.1.38848: Flags [S], seq 1463561890, win 64240, options [mss 1460,sackOK,TS val 1725663921 ecr 0,nop,wscale 7], length 0
03:06:23.553378 tun0  In  IP 192.0.2.1.38848 > 192.168.151.78.8848: Flags [S], seq 0, win 1000, length 0
03:06:23.553440 tun0  Out IP 192.168.151.78.8848 > 192.0.2.1.38848: Flags [S.], seq 1463561890, ack 1, win 64240, options [mss 1460,sackOK,TS val 1725664021 ecr 0,nop,wscale 7], length 0
03:06:24.472721 tun0  Out IP 192.168.151.78.8848 > 192.0.2.1.38848: Flags [S.], seq 1463561890, ack 1, win 64240, options [mss 1460,sackOK,TS val 1725664941 ecr 0,nop,wscale 7], length 0
03:06:26.542733 tun0  Out IP 192.168.151.78.8848 > 192.0.2.1.38848: Flags [S.], seq 1463561890, ack 1, win 64240, options [mss 1460,sackOK,TS val 1725667011 ecr 0,nop,wscale 7], length 0
03:06:30.622719 tun0  Out IP 192.168.151.78.8848 > 192.0.2.1.38848: Flags [S.], seq 1463561890, ack 1, win 64240, options [mss 1460,sackOK,TS val 1725671091 ecr 0,nop,wscale 7], length 0

tcp        0      1 192.168.151.78:8848     192.0.2.1:38848         SYN_RECV

很显然是发出了 SYNACK 而不是 ACK,符合 RFC9293 描述。

同时关闭

 +---------+         snd FIN  /       
 |  FIN    |<----------------         
 | WAIT-1  |------------------        
 +---------+          rcv FIN  \      
   | rcv ACK of FIN   -------   |     
   | --------------   snd ACK   |     
   V        x                   V     
 +---------+               +---------+
 |FINWAIT-2|               | CLOSING |
 +---------+               +---------+
   |              rcv ACK of FIN |    
   |  rcv FIN     -------------- |    
   |  -------            x       V    
    \ snd ACK              +---------+
      -------------------->|TIME-WAIT|
                           +---------+

TCP 也存在同时关闭特性,也就是双方都进入 FINWAIT-1, CLOSING, TIME-WAIT 的转移。

这些现象很少见,但是现在观测就简单了,哪怕是我偷懒没写 CLOSING 状态。

使用 ./finack/FW1.pkt 文件:

16:02:11.839967 tun0  In  IP 192.0.2.1.57001 > 192.168.31.162.8848: Flags [S], seq 0, win 1000, length 0
16:02:11.840010 tun0  Out IP 192.168.31.162.8848 > 192.0.2.1.57001: Flags [S.], seq 722750401, ack 1, win 64240, options [mss 1460], length 0
16:02:11.940123 tun0  In  IP 192.0.2.1.57001 > 192.168.31.162.8848: Flags [.], ack 1, win 1000, length 0
16:02:12.940472 tun0  Out IP 192.168.31.162.8848 > 192.0.2.1.57001: Flags [F.], seq 1, ack 1, win 64240, length 0
16:02:13.040605 tun0  In  IP 192.0.2.1.57001 > 192.168.31.162.8848: Flags [F.], seq 1, ack 1, win 1000, length 0
16:02:13.040624 tun0  Out IP 192.168.31.162.8848 > 192.0.2.1.57001: Flags [.], ack 2, win 64239, length 0
16:02:13.258970 tun0  Out IP 192.168.31.162.8848 > 192.0.2.1.57001: Flags [F.], seq 1, ack 2, win 64239, length 0
16:02:13.889271 tun0  Out IP 192.168.31.162.8848 > 192.0.2.1.57001: Flags [F.], seq 1, ack 2, win 64239, length 0

tcp        1      1 192.168.31.162:8848     192.0.2.1:57001         CLOSING

RFC1337

sysctl -a | grep rfc 中有一个唯一用 RFC 命名的选项:net.ipv4.tcp_rfc1337(默认值为 0)。这是一个决定 TIME_WAIT 阶段收到 RST 是否自杀的选项。

With setting 0 the system would ‘assassinate’ a socket in time_wait prematurely upon receiving a RST.

Talk: Sysctl – ArchWiki

使用 ./rstack/TW.pkt 文件:

16:05:00.349597 tun0  In  IP 192.0.2.1.52555 > 192.168.149.165.8848: Flags [S], seq 0, win 1000, length 0
16:05:00.349645 tun0  Out IP 192.168.149.165.8848 > 192.0.2.1.52555: Flags [S.], seq 1417792594, ack 1, win 64240, options [mss 1460], length 0
16:05:00.450252 tun0  In  IP 192.0.2.1.52555 > 192.168.149.165.8848: Flags [.], ack 1, win 1000, length 0
16:05:01.450395 tun0  Out IP 192.168.149.165.8848 > 192.0.2.1.52555: Flags [F.], seq 1, ack 1, win 64240, length 0
16:05:01.550478 tun0  In  IP 192.0.2.1.52555 > 192.168.149.165.8848: Flags [.], ack 2, win 1000, length 0
16:05:01.650750 tun0  In  IP 192.0.2.1.52555 > 192.168.149.165.8848: Flags [F.], seq 1, ack 0, win 1000, length 0
16:05:01.650771 tun0  Out IP 192.168.149.165.8848 > 192.0.2.1.52555: Flags [.], ack 2, win 64240, length 0
16:05:01.750506 tun0  In  IP 192.0.2.1.52555 > 192.168.149.165.8848: Flags [R.], seq 2, ack 2, win 1000, length 0

// net.ipv4.tcp_rfc1337 设为 1 才能维持 TIME_WAIT
tcp        0      0 192.168.149.165:8848    192.0.2.1:52555         TIME_WAIT

如果使用默认值 0,那么端口就会自杀,如果改为 1,那么仍会保持 TIME_WAIT 状态(忽视 RST 报文)。两个选项产生的报文相同,只贴一份。

Acceptable ACK

前面提到已同步的 TCP 状态即便收到一些无法理解的报文,也必须回复 ACK。

现在来观测一下未同步的 TCP 状态(比如 SYN_SENT 状态)收到不同报文的行为:

  • 如果发出 ACK . 0:0(0) ack 1,那么本端保持 SYN_SENT 状态而不做回应。
  • 如果发出 ACK . 0:0(0) ack 2,那么本端会抛出 RST,因为这属于不可理解的报文。
  • 即使发出了 RST,本端仍保留在 SYN_SENT 状态。
  • 可以分别使用./ack/SYN_SENT.pkt 和./newer_ack/SYN_SENT.pkt 复现测试。

ACK 报文能不能理解/接受见 acceptable ACK 的定义。(大致意思是要确认的范围在对方发送窗口内)

A new acknowledgment (called an “acceptable ack”) is one for which the inequality below holds:

SND.UNA < SEG.ACK =< SND.NXT

3.4-12. acceptable ACK

If the connection is in any non-synchronized state (LISTEN, SYN-SENT, SYN-RECEIVED), and the incoming segment acknowledges something not yet sent (the segment carries an unacceptable ACK), or if an incoming segment has a security level or compartment (Appendix A.1) that does not exactly match the level and compartment requested for the connection, a reset is sent.

3.5.2. Reset Generation

也就是说如果连接未同步且收到携带不可接受 ACK 的报文,本端会发出 RST 报文。

// ack 1
20:21:46.446068 tun0  Out IP 192.168.112.105.8848 > 192.0.2.1.38848: Flags [S], seq 2802350882, win 64240, options [mss 1460,sackOK,TS val 63407479 ecr 0,nop,wscale 7], length 0
20:21:46.546201 tun0  In  IP 192.0.2.1.38848 > 192.168.112.105.8848: Flags [.], ack 2802350883, win 1000, length 0
20:21:47.485278 tun0  Out IP 192.168.112.105.8848 > 192.0.2.1.38848: Flags [S], seq 2802350882, win 64240, options [mss 1460,sackOK,TS val 63408518 ecr 0,nop,wscale 7], length 0
20:21:49.565553 tun0  Out IP 192.168.112.105.8848 > 192.0.2.1.38848: Flags [S], seq 2802350882, win 64240, options [mss 1460,sackOK,TS val 63410599 ecr 0,nop,wscale 7], length 0
20:21:53.645646 tun0  Out IP 192.168.112.105.8848 > 192.0.2.1.38848: Flags [S], seq 2802350882, win 64240, options [mss 1460,sackOK,TS val 63414679 ecr 0,nop,wscale 7], length 0

// ack 2
20:21:59.585648 tun0  Out IP 192.168.74.103.8848 > 192.0.2.1.38848: Flags [S], seq 973534190, win 64240, options [mss 1460,sackOK,TS val 1877370807 ecr 0,nop,wscale 7], length 0
20:21:59.685726 tun0  In  IP 192.0.2.1.38848 > 192.168.74.103.8848: Flags [.], ack 973534192, win 1000, length 0
20:21:59.685747 tun0  Out IP 192.168.74.103.8848 > 192.0.2.1.38848: Flags [R], seq 973534192, win 0, length 0
20:21:59.715260 tun0  Out IP 192.168.74.103.8848 > 192.0.2.1.38848: Flags [S], seq 973534190, win 64240, options [mss 1460,sackOK,TS val 1877370936 ecr 0,nop,wscale 7], length 0
20:22:01.725294 tun0  Out IP 192.168.74.103.8848 > 192.0.2.1.38848: Flags [S], seq 973534190, win 64240, options [mss 1460,sackOK,TS val 1877372946 ecr 0,nop,wscale 7], length 0
20:22:05.805264 tun0  Out IP 192.168.74.103.8848 > 192.0.2.1.38848: Flags [S], seq 973534190, win 64240, options [mss 1460,sackOK,TS val 1877377026 ecr 0,nop,wscale 7], length 0

简单的观测数据如上。

Acceptable segment

       +=========+=========+======================================+
       | Segment | Receive | Test                                 |
       | Length  | Window  |                                      |
       +=========+=========+======================================+
       | 0       | 0       | SEG.SEQ = RCV.NXT                    |
       +---------+---------+--------------------------------------+
       | 0       | >0      | RCV.NXT =< SEG.SEQ < RCV.NXT+RCV.WND |
       +---------+---------+--------------------------------------+
       | >0      | 0       | not acceptable                       |
       +---------+---------+--------------------------------------+
       | >0      | >0      | RCV.NXT =< SEG.SEQ < RCV.NXT+RCV.WND |
       |         |         |                                      |
       |         |         | or                                   |
       |         |         |                                      |
       |         |         | RCV.NXT =< SEG.SEQ+SEG.LEN-1 <       |
       |         |         | RCV.NXT+RCV.WND                      |
       +---------+---------+--------------------------------------+

                   Table 5: Segment Acceptability Tests

在前面 NOTE3 小节的备注中提到的任意的不可接受报文(any unacceptable segment)除了有上一章所说的携带不可接受 ACK 的报文以外,还包括携带超出窗口范围的序列号的报文,具体见 表 5。最普遍的条件就是第四条:要想让接收的报文合法,需要保证序列号头尾两端任一边不超过接收窗口范围。当然,TCP 还要考虑各种边界条件,包括零窗口和长度为 0 的报文,主要想表达的意思应该是:只携带控制位的报文即使在零窗口的情况下,只要有合理的序列号(条件一和条件二),那也是可接受的。

NOTE: 条件四只要任意一边满足即可合法,是因为要处理的报文可能与窗口是重叠的。

测试需要将报文生成器中的第四行 State_base::win 大小设为 0 做对比,重新编译运行后再执行./write/ESTAB.pkt:

// win=1000 的情况
16:19:21.189276 tun0  In  IP 192.0.2.1.57659 > 192.168.48.95.8848: Flags [S], seq 0, win 1000, length 0
16:19:21.189302 tun0  Out IP 192.168.48.95.8848 > 192.0.2.1.57659: Flags [S.], seq 3108445590, ack 1, win 64240, options [mss 1460], length 0
16:19:21.289380 tun0  In  IP 192.0.2.1.57659 > 192.168.48.95.8848: Flags [.], ack 1, win 1000, length 0
16:19:21.389939 tun0  Out IP 192.168.48.95.8848 > 192.0.2.1.57659: Flags [P.], seq 1:11, ack 1, win 64240, length 10
16:19:21.708969 tun0  Out IP 192.168.48.95.8848 > 192.0.2.1.57659: Flags [P.], seq 1:11, ack 1, win 64240, length 10
16:19:22.338939 tun0  Out IP 192.168.48.95.8848 > 192.0.2.1.57659: Flags [P.], seq 1:11, ack 1, win 64240, length 10

// win=0 的情况
16:21:27.299392 tun0  In  IP 192.0.2.1.58469 > 192.168.97.5.8848: Flags [S], seq 0, win 0, length 0
16:21:27.299418 tun0  Out IP 192.168.97.5.8848 > 192.0.2.1.58469: Flags [S.], seq 1811567674, ack 1, win 64240, options [mss 1460], length 0
16:21:27.399508 tun0  In  IP 192.0.2.1.58469 > 192.168.97.5.8848: Flags [.], ack 1, win 0, length 0
16:21:27.818947 tun0  Out IP 192.168.97.5.8848 > 192.0.2.1.58469: Flags [.], ack 1, win 64240, length 0
16:21:28.448957 tun0  Out IP 192.168.97.5.8848 > 192.0.2.1.58469: Flags [.], ack 1, win 64240, length 0
16:21:29.699256 tun0  Out IP 192.168.97.5.8848 > 192.0.2.1.58469: Flags [.], ack 1, win 64240, length 0

tcp        0     10 192.168.97.5:8848       192.0.2.1:58469         ESTABLISHED

win>0 就是常规的握手后,服务端主动传输数据给客户端。win=0 对应于第一个和第三个条件,即使是零窗口,SYN 确认报文也能正常发出并建立 ESTABLISHED,但是数据报文就没法动手了。

Challenge ACK

上面对于「理解的报文」还是定义得太草率了,状态机哪有什么理解不理解的,只有在哪一个状态对哪些事件做出怎样的处理。RFC 里有非常完备的描述:3.10.7. SEGMENT ARRIVES

但是这里面很复杂,有不少的边角条件是 RFC5961 搞出来的(非必须实现,但是 Linux 实现了)。这份文档主要涉及到 TCP 的安全性,看得不多,就说下我了解到的 challenge ACK。

如果没有上面这份文档,那么多数状态被动收到一个 RST 报文应该做出什么样的表现?大概可分为以下两种行为:

  • RST 报文的序列号不在接收窗口内,忽视。
  • RST 报文的序列号在接收窗口内,沉默自杀。

NOTES: LISTEN 状态是无条件忽视 RST 报文;不被忽视的 RST 也称为合法的 RST。

但是这种自杀行为可以被利用为一种攻击手段,称为 blind RST attack。大意是有些应用的连接很容易猜对四元组,那么针对这个连接就可以构造 RST 报文进行盲猜序列号,猜中了(落在窗口内)就能把对方杀死,达成恶意攻击的效果。

解决办法就是将 窗口范围内均允许 改为 仅限于窗口期待的下一个序列号:

  • RST 报文的序列号精准匹配接收窗口期待的下一个序列号,自杀。
  • RST 报文的序列号只落在窗口内(但不符合上述条件),只返回一个 challenge ACK。

NOTE: SYN_SENT 状态除外,它是测试 RST 报文中的 ACK 是否确认了 SYN。

challenge ACK 可理解为假装正常的 ACK(报文格式就很普通,<SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK>),本端不会处理异常的攻击报文,(真实而非攻击方的)对端收到普通的 ACK 也不会改变 TCP 状态。这样做使得 RST 攻击必须命中唯一的序列号,从而降低了引起连接自杀的概率。

但同时也使得合法的 RST 报文成为不一定能杀死对方的报文(因为真实的对端也会 RST),这种情况会因为第一次发出(合法但不精准匹配)就删除了自身连接的 TCB,收到 challenge ACK 后会因为缺失 TCB 而继续发出第二次 RST,最终总会达成精准匹配的情况。

同时对于已同步的 TCP 状态,也有类似的 SYN 检查(同样是 RST 攻击的变种)。但是条件改为:只要设置了 SYN 位,无论序列号多少,都会发出 challenge ACK。

After sending the acknowledgment, TCP implementations MUST drop the unacceptable segment and stop processing further.

check the RST bit
check the SYN bit

Further segments destined to this connection will be processed as normal.

RFC 5961 - 3.2. Mitigation


复述了很长一段背景(累死人了 (#゚д゚)),测试就简单复现下 RST 的情况。需要添加对应的报文:

template <typename T, typename Ptr = void(*)(T&)>
auto dir2func = std::unordered_map<std::string_view, Ptr> {
    {"rfc9293",     make_packet},
    {"ack",         make_packet<default_ack_packet>},
    {"syn",         make_packet<packet{.flag='S', .ack=false}>},
    {"fin",         make_packet<packet{.flag='F', .ack=false}>},
    {"rst",         make_packet<packet{.flag='R', .ack=false}>},
    {"synack",      make_packet<packet{.flag='S'}>},
    {"finack",      make_packet<packet{.flag='F'}>},
    {"rstack",      make_packet<packet{.flag='R'}>},
    {"data",        make_packet<packet{.flag='P', .len=10}>},
    {"data_random", make_packet<packet{.flag='P', .len=10, .random_ack=true}>},
    {"older_ack",   make_packet<packet{.addend_ack=-1}>},
    {"newer_ack",   make_packet<packet{.addend_ack=+1}>},
    {"write",       make_packet<default_write_packet>},
    {"note1",       make_packet<packet{.flag='S', .ack=false}, packet{.flag='R', .ack=false}>},
+   {"challenge1",  make_packet<write_packet{.len=8}, packet{.flag='R', .seq=1 /* reset */}>},
+   {"challenge2",  make_packet<write_packet{.len=8}, packet{.flag='R', .seq=3 /* challenge */}>},
+   {"challenge3",  make_packet<write_packet{.len=8}, packet{.flag='R', .seq=0 /* ignore */}>},
};

使用 ./challenge[1-3]/ESTAB.pkt 可以观测三种不同的 RST 行为。

// challenge1: reset
16:14:14.289416 tun0  In  IP 192.0.2.1.32769 > 192.168.168.172.8848: Flags [S], seq 0, win 1000, length 0
16:14:14.289443 tun0  Out IP 192.168.168.172.8848 > 192.0.2.1.32769: Flags [S.], seq 2641743937, ack 1, win 64240, options [mss 1460], length 0
16:14:14.389603 tun0  In  IP 192.0.2.1.32769 > 192.168.168.172.8848: Flags [.], ack 1, win 1000, length 0
16:14:14.489737 tun0  Out IP 192.168.168.172.8848 > 192.0.2.1.32769: Flags [P.], seq 1:9, ack 1, win 64240, length 8
16:14:14.589907 tun0  In  IP 192.0.2.1.32769 > 192.168.168.172.8848: Flags [R.], seq 1, ack 1, win 1000, length 0

// challenge2: challenge
16:14:20.499409 tun0  In  IP 192.0.2.1.38261 > 192.168.152.139.8848: Flags [S], seq 0, win 1000, length 0
16:14:20.499464 tun0  Out IP 192.168.152.139.8848 > 192.0.2.1.38261: Flags [S.], seq 2608614879, ack 1, win 64240, options [mss 1460], length 0
16:14:20.599824 tun0  In  IP 192.0.2.1.38261 > 192.168.152.139.8848: Flags [.], ack 1, win 1000, length 0
16:14:20.700226 tun0  Out IP 192.168.152.139.8848 > 192.0.2.1.38261: Flags [P.], seq 1:9, ack 1, win 64240, length 8
16:14:20.800601 tun0  In  IP 192.0.2.1.38261 > 192.168.152.139.8848: Flags [R.], seq 3, ack 1, win 1000, length 0
16:14:20.800618 tun0  Out IP 192.168.152.139.8848 > 192.0.2.1.38261: Flags [.], ack 1, win 64240, length 0
16:14:21.019282 tun0  Out IP 192.168.152.139.8848 > 192.0.2.1.38261: Flags [P.], seq 1:9, ack 1, win 64240, length 8
16:14:21.648955 tun0  Out IP 192.168.152.139.8848 > 192.0.2.1.38261: Flags [P.], seq 1:9, ack 1, win 64240, length 8

// challenge3: ignore
16:14:28.729736 tun0  In  IP 192.0.2.1.34781 > 192.168.45.153.8848: Flags [S], seq 0, win 1000, length 0
16:14:28.729864 tun0  Out IP 192.168.45.153.8848 > 192.0.2.1.34781: Flags [S.], seq 405765145, ack 1, win 64240, options [mss 1460], length 0
16:14:28.830141 tun0  In  IP 192.0.2.1.34781 > 192.168.45.153.8848: Flags [.], ack 1, win 1000, length 0
16:14:28.930250 tun0  Out IP 192.168.45.153.8848 > 192.0.2.1.34781: Flags [P.], seq 1:9, ack 1, win 64240, length 8
16:14:29.030349 tun0  In  IP 192.0.2.1.34781 > 192.168.45.153.8848: Flags [R.], seq 0, ack 1, win 1000, length 0
16:14:29.249179 tun0  Out IP 192.168.45.153.8848 > 192.0.2.1.34781: Flags [P.], seq 1:9, ack 1, win 64240, length 8
16:14:29.879377 tun0  Out IP 192.168.45.153.8848 > 192.0.2.1.34781: Flags [P.], seq 1:9, ack 1, win 64240, length 8

上面的测试样例是主动发出了 [1, 9) 的数据包,RST 报文的序列号分为三种情况:

  • 如果 RST seq 定位到 1,引起连接 RESET。
  • 如果 RST seq 定位到 (1, 1+RCV.WND),发出 challenge ACK。
  • 其它范围会被忽略。

Segment arrives

上面写了这么长也只是 3.10.7. SEGMENT ARRIVES 的一部分,即使再加上状态转移图也不一定能覆盖所有情况。但是这一块实在是看着都嫌麻烦,其它的细节有空再测吧。

最后再八卦一下,这些测试工作其实 FreeBSD 社区 也有在做,但是至今测了 8 年也没完工,所以说用爱发电就别强求太多了……


本文已转发到知乎