首页 > 编程学习 > TCP bbr简介

TCP bbr简介

发布时间:2022/1/24 2:44:13

1、背景

现有的拥塞控制算法(如cubic)基于丢包检测来判断拥塞,存在以下两个问题:

1)、网络设备buffer大时会导致bufferbloat:只要不丢包,发送端就会持续增大发送量,这样就容易把网络设备的buffer填充满,导致延时增加;

2)、网络设备buffer小,容易丢包,拥塞算法根据丢包控制发包速率,导致整体带宽吞吐小;

2、BBR四个阶段流程

start up

进入start up有两个时机:

1)、初始化

bbr_init,先将bbr->mode初始化成BBR_STARTUP;

/* Excerpt of bbr_init(): per-socket initialisation of BBR.
 * The article cites bbr_reset_startup_mode() as the step that puts the
 * connection into BBR_STARTUP; that helper's body is not shown here.
 */
static void bbr_init(struct sock *sk)
{
	bbr_reset_startup_mode(sk);
	/* Move sk_pacing_status from SK_PACING_NONE to SK_PACING_NEEDED;
	 * the compare-and-exchange leaves any other current value untouched.
	 */
	cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED);
}

2)、连接从空闲状态重新开始发送数据(CA_EVENT_TX_START事件)并且处于app_limited时,如果当前处于probe rtt阶段且本轮probe rtt已经到期,则在还没探测到最大带宽的情况下重新进入start up阶段,否则进入probe bw阶段;

static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event)
{struct tcp_sock *tp = tcp_sk(sk);struct bbr *bbr = inet_csk_ca(sk);if (event == CA_EVENT_TX_START && tp->app_limited) {bbr->idle_restart = 1;/* Avoid pointless buffer overflows: pace at est. bw if we don't* need more speed (we're restarting from idle and app-limited).*///如果在probe bw阶段收到拥塞信号,则将pacing_gain调成成1,控制发包量if (bbr->mode == BBR_PROBE_BW)bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT);else if (bbr->mode == BBR_PROBE_RTT)//如果是在probe rtt阶段收到拥塞信号,则判断当前如果还未探测到最大带宽,则重新//进入start up阶段,否则进入probe bw阶段bbr_check_probe_rtt_done(sk);}
}

在start up阶段,bbr会以较大的pacing gain和cwnd gain发送数据包;

/* Excerpt of bbr_update_gains(): select pacing and cwnd gains for the
 * current mode. Only the BBR_STARTUP case is shown.
 */
static void bbr_update_gains(struct sock *sk)
{
	struct bbr *bbr = inet_csk_ca(sk);

	switch (bbr->mode) {
	case BBR_STARTUP:
		/* NOTE(review): upstream defines bbr_high_gain as roughly
		 * 2/ln(2) ~= 2.89, not 2.95 - confirm against tcp_bbr.c.
		 */
		bbr->pacing_gain = bbr_high_gain;
		bbr->cwnd_gain	 = bbr_high_gain;
		break;
	/* Remaining modes are omitted from this excerpt. */
	}
}

drain排空

在start up阶段,以较大的pacing、cwnd速率发送消息包,因此探测到的bw也越来越大,当bbr发现已经探测到最大带宽时,此时链路上的设备buffer已经被填充满,开始进入drain阶段,将链路设备的buffer排空;

static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs)
{struct bbr *bbr = inet_csk_ca(sk);if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) {bbr->mode = BBR_DRAIN;	/* drain queue we created */tcp_sk(sk)->snd_ssthresh =bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT);}	/* fall through to check if in-flight is already small: */if (bbr->mode == BBR_DRAIN &&tcp_packets_in_flight(tcp_sk(sk)) <=bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT))bbr_reset_probe_bw_mode(sk);  /* we estimate queue is drained */
}

在drain排空阶段,bbr会控制减少pacing的速率;

/* Excerpt of bbr_update_gains(): select pacing and cwnd gains for the
 * current mode. Only the BBR_DRAIN case is shown.
 */
static void bbr_update_gains(struct sock *sk)
{
	struct bbr *bbr = inet_csk_ca(sk);

	switch (bbr->mode) {
	case BBR_DRAIN:
		/* Pace slowly to drain the queue. Per the article this gain
		 * is about 1/2.9 - confirm against bbr_drain_gain.
		 */
		bbr->pacing_gain = bbr_drain_gain;
		bbr->cwnd_gain	 = bbr_high_gain;	/* keep cwnd */
		break;
	/* Remaining modes are omitted from this excerpt. */
	}
}

等到检测到in_flight的数据包个数少于BDP时,进入probe_bw阶段。

static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs)
{struct bbr *bbr = inet_csk_ca(sk);if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) {bbr->mode = BBR_DRAIN;	/* drain queue we created */tcp_sk(sk)->snd_ssthresh =bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT);}	/* fall through to check if in-flight is already small: */if (bbr->mode == BBR_DRAIN &&tcp_packets_in_flight(tcp_sk(sk)) <=bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT))bbr_reset_probe_bw_mode(sk);  /* we estimate queue is drained */
}

Probe bw

如何探测?

进入probe_bw阶段,会使用以下的pacing增益系数不断循环探测;

/* Pacing gains cycled through in PROBE_BW, expressed in BBR_UNIT fixed
 * point: one probing phase, one draining phase, then six cruising phases.
 */
static const int bbr_pacing_gain[] = {
	BBR_UNIT * 5 / 4,	/* 1.25x: probe for more available bw */
	BBR_UNIT * 3 / 4,	/* 0.75x: drain queue and/or yield bw to other flows */
	BBR_UNIT,		/* 1.0x: cruise at the estimated bw to use the pipe */
	BBR_UNIT,
	BBR_UNIT,
	BBR_UNIT,		/* ...without creating excess queue */
	BBR_UNIT,
	BBR_UNIT
};

当处在某个phase下,判断当前满足条件,进入下一个phase;

static bool bbr_is_next_cycle_phase(struct sock *sk,const struct rate_sample *rs)
{struct tcp_sock *tp = tcp_sk(sk);struct bbr *bbr = inet_csk_ca(sk);bool is_full_length =tcp_stamp_us_delta(tp->delivered_mstamp, bbr->cycle_mstamp) >bbr->min_rtt_us;u32 inflight, bw;/* The pacing_gain of 1.0 paces at the estimated bw to try to fully* use the pipe without increasing the queue.*///如果是1倍增益,则探测时间到达一个min_rtt,就进入下一个phaseif (bbr->pacing_gain == BBR_UNIT)return is_full_length;		/* just use wall clock time */inflight = rs->prior_in_flight;  /* what was in-flight before ACK? */bw = bbr_max_bw(sk);/* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at* least pacing_gain*BDP; this may take more than min_rtt if min_rtt is* small (e.g. on a LAN). We do not persist if packets are lost, since* a path with small buffers may not hold that much.*///如果是大于1倍增益,则探测时间超过1个min_rtt并且有丢包或inflight达到该增益倍数下计算的bdp值//则进入下一个phase//正增益下,意图是提高链路的使用率if (bbr->pacing_gain > BBR_UNIT)return is_full_length &&(rs->losses ||  /* perhaps pacing_gain*BDP won't fit */inflight >= bbr_inflight(sk, bw, bbr->pacing_gain));/* A pacing_gain < 1.0 tries to drain extra queue we added if bw* probing didn't find more bw. If inflight falls to match BDP then we* estimate queue is drained; persisting would underutilize the pipe.*///如果是小于1倍增益,则探测时间超过1个min_rtt并且infight达到1倍增益下的bdp值,则进入下一个phase//减增益下,意图是释放链路上的带宽资源return is_full_length ||inflight <= bbr_inflight(sk, bw, BBR_UNIT);
}

探测何时结束?

在probe_bw模式下,bbr还会通过长周期(long-term)带宽采样来检测链路上是否存在限速器(policer):从出现丢包开始采样,当一个采样区间内的丢包率达到阈值后,计算该区间的平均投递带宽bw;如果连续两个采样区间得到的bw在固定误差范围内,则认为链路被限速,将lt_bw设置为二者的平均值,然后进入lt_use_bw状态,此时pacing_gain固定为1倍增益,暂停probe_bw的增益循环;

/* A long-term bw sampling interval has finished; decide whether the flow
 * looks policed.
 *
 * @sk: the socket
 * @bw: average delivery rate measured over the interval that just ended
 *
 * If a previous interval's lt_bw exists and the new bw is close to it
 * (relative difference at most bbr_lt_bw_ratio/BBR_UNIT, or absolute
 * difference at most bbr_lt_bw_diff bytes per second), lt_bw becomes the
 * average of the two, lt_use_bw is set and pacing_gain is pinned to unity.
 * Otherwise the new bw is remembered and a fresh interval is started.
 */
static void bbr_lt_bw_interval_done(struct sock *sk, u32 bw)
{
	struct bbr *bbr = inet_csk_ca(sk);
	u32 diff;

	if (bbr->lt_bw) {  /* do we have bw from a previous interval? */
		/* Is new bw close to the lt_bw from the previous interval? */
		diff = abs(bw - bbr->lt_bw);
		if ((diff * BBR_UNIT <= bbr_lt_bw_ratio * bbr->lt_bw) ||
		    (bbr_rate_bytes_per_sec(sk, diff, BBR_UNIT) <=
		     bbr_lt_bw_diff)) {
			/* All criteria are met; estimate we're policed. */
			bbr->lt_bw = (bw + bbr->lt_bw) >> 1;  /* avg 2 intvls */
			bbr->lt_use_bw = 1;
			bbr->pacing_gain = BBR_UNIT;  /* try to avoid drops */
			bbr->lt_rtt_cnt = 0;
			return;
		}
	}
	bbr->lt_bw = bw;
	bbr_reset_lt_bw_sampling_interval(sk);
}

进入lt_use_bw状态后,当经过固定周期(bbr_lt_bw_max_rtts),probe_bw模式重新生效;

static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs)
{struct tcp_sock *tp = tcp_sk(sk);struct bbr *bbr = inet_csk_ca(sk);u32 lost, delivered;u64 bw;u32 t;//如果是在lt_use_bw状态下,判断当前处于probe_bw阶段,则经过bbr_lt_bw_max_rtts次有正常ack后//重新进入prote_bw阶段//当bbr->mode==BBR_RROBE_BW时,并不一定真的有去探测bw,而是要看当前是否处于lt_use_bw,//每次probe_bw周期,都会重新探测到一个稳定的最大带宽,然后设置lt_use_bw为1if (bbr->lt_use_bw) {	/* already using long-term rate, lt_bw? */if (bbr->mode == BBR_PROBE_BW && bbr->round_start &&++bbr->lt_rtt_cnt >= bbr_lt_bw_max_rtts) {bbr_reset_lt_bw_sampling(sk);    /* stop using lt_bw */bbr_reset_probe_bw_mode(sk);  /* restart gain cycling */}return;}/* Wait for the first loss before sampling, to let the policer exhaust* its tokens and estimate the steady-state rate allowed by the policer.* Starting samples earlier includes bursts that over-estimate the bw.*/if (!bbr->lt_is_sampling) {if (!rs->losses)return;bbr_reset_lt_bw_sampling_interval(sk);bbr->lt_is_sampling = true;}//探测probe_bw阶段,如果处于app_limited,则不进行探测/* To avoid underestimates, reset sampling if we run out of data. */if (rs->is_app_limited) {bbr_reset_lt_bw_sampling(sk);return;}if (bbr->round_start)bbr->lt_rtt_cnt++;	/* count round trips in this interval */if (bbr->lt_rtt_cnt < bbr_lt_intvl_min_rtts)return;		/* sampling interval needs to be longer */if (bbr->lt_rtt_cnt > 4 * bbr_lt_intvl_min_rtts) {bbr_reset_lt_bw_sampling(sk);  /* interval is too long */return;}/* End sampling interval when a packet is lost, so we estimate the* policer tokens were exhausted. Stopping the sampling before the* tokens are exhausted under-estimates the policed rate.*///探测发送到有出现丢包,才会计算bwif (!rs->losses)return;/* Calculate packets lost and delivered in sampling interval. */lost = tp->lost - bbr->lt_last_lost;delivered = tp->delivered - bbr->lt_last_delivered;/* Is loss rate (lost/delivered) >= lt_loss_thresh? If not, wait. *///丢包率至少要达到20%if (!delivered || (lost << BBR_SCALE) < bbr_lt_loss_thresh * delivered)return;/* Find average delivery rate in this sampling interval. 
*/t = div_u64(tp->delivered_mstamp, USEC_PER_MSEC) - bbr->lt_last_stamp;if ((s32)t < 1)return;		/* interval is less than one ms, so wait *//* Check if can multiply without overflow */if (t >= ~0U / USEC_PER_MSEC) {bbr_reset_lt_bw_sampling(sk);  /* interval too long; reset */return;}t *= USEC_PER_MSEC;bw = (u64)delivered * BW_UNIT;do_div(bw, t);bbr_lt_bw_interval_done(sk, bw);
}

Delivered速率怎么计算?

如上图,t1时刻发送序列号seq=2,t2时刻发送3号skb,t3时刻发送4号skb,然后t4时刻ack4号skb;bbr在计算带宽时的原理就是计算某个skb从发送到被ack时一共delivered了多少个数据以及用了的时间interval_us,然后两者相除就得到当前的带宽值;

由于存在延时ack、ack压缩等因素,计算出来的带宽值可能出现偏差。因此bbr会分别计算发送阶段的耗时send_us和ack阶段的耗时ack_us,并取两者中的较大值作为interval_us,以避免高估带宽,因此bw的计算方法如下:

delivered = tp->delivered - rs->prior_delivered

interval_us = max(send_us, ack_us)

bw = delivered / interval_us

Probe rtt

每隔10s,bbr都会进行一轮min_rtt探测,探测周期为200ms,在probe_rtt阶段,bbr会降低发包速率,保证链路不会出现拥堵;

/* Excerpt of bbr_update_gains(): select pacing and cwnd gains for the
 * current mode. Only the BBR_PROBE_RTT case and the default are shown.
 */
static void bbr_update_gains(struct sock *sk)
{
	struct bbr *bbr = inet_csk_ca(sk);

	switch (bbr->mode) {
	case BBR_PROBE_RTT:
		/* Unity gain for both pacing and cwnd while probing min RTT. */
		bbr->pacing_gain = BBR_UNIT;
		bbr->cwnd_gain	 = BBR_UNIT;
		break;
	default:
		/* Any mode not handled in this excerpt: warn once. */
		WARN_ONCE(1, "BBR bad mode: %u\n", bbr->mode);
		break;
	}
}
static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs)
{struct tcp_sock *tp = tcp_sk(sk);struct bbr *bbr = inet_csk_ca(sk);bool filter_expired;/* Track min RTT seen in the min_rtt_win_sec filter window: *///每经过10s(bbr_min_rtt_win_sec),就会探测一次min_rttfilter_expired = after(tcp_jiffies32,bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ);//10s超时后,判断本轮探测的rtt是否比之前探测的min_rtt更小,如果是,则更新min_rtt为本轮探测的rttif (rs->rtt_us >= 0 &&(rs->rtt_us <= bbr->min_rtt_us ||(filter_expired && !rs->is_ack_delayed))) {bbr->min_rtt_us = rs->rtt_us;bbr->min_rtt_stamp = tcp_jiffies32;}//检测周期到期,当前不是probe_rtt模式,则进入probe_rtt模式if (bbr_probe_rtt_mode_ms > 0 && filter_expired &&!bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) {bbr->mode = BBR_PROBE_RTT;  /* dip, drain queue */bbr_save_cwnd(sk);  /* note cwnd so we can restore it */bbr->probe_rtt_done_stamp = 0;}if (bbr->mode == BBR_PROBE_RTT) {/* Ignore low rate samples during this mode. */tp->app_limited =(tp->delivered + tcp_packets_in_flight(tp)) ? : 1;/* Maintain min packets in flight for max(200 ms, 1 round). *///设置本轮的采样周期(200ms),进入probe_rtt阶段后,会降低发包速率,等in_flight数据包//个数降到bbr_cwnd_min_target开始探测rttif (!bbr->probe_rtt_done_stamp &&tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) {bbr->probe_rtt_done_stamp = tcp_jiffies32 +msecs_to_jiffies(bbr_probe_rtt_mode_ms);bbr->probe_rtt_round_done = 0;bbr->next_rtt_delivered = tp->delivered;} else if (bbr->probe_rtt_done_stamp) {if (bbr->round_start)bbr->probe_rtt_round_done = 1;if (bbr->probe_rtt_round_done)bbr_check_probe_rtt_done(sk);}}/* Restart after idle ends only once we process a new S/ACK for data */if (rs->delivered > 0)bbr->idle_restart = 0;
}

当一轮rtt探测结束后,通过bbr_reset_mode重新进入probe_bw或start up阶段;

/* Leave PROBE_RTT once its deadline has passed.
 *
 * Does nothing unless probe_rtt_done_stamp is set and already in the past.
 * Otherwise it restarts the min-RTT window, restores cwnd to at least the
 * saved prior_cwnd, and lets bbr_reset_mode() choose the next mode.
 */
static void bbr_check_probe_rtt_done(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bbr *bbr = inet_csk_ca(sk);

	if (!bbr->probe_rtt_done_stamp ||
	    !after(tcp_jiffies32, bbr->probe_rtt_done_stamp))
		return;

	/* Wait a full window before the next PROBE_RTT. */
	bbr->min_rtt_stamp = tcp_jiffies32;

	if (tp->snd_cwnd < bbr->prior_cwnd)
		tp->snd_cwnd = bbr->prior_cwnd;

	bbr_reset_mode(sk);
}

BBR的输出

数据包ack后进入bbr模块处理,处理完成后,bbr模块会有两个输出,一个是根据bw设置的pacing速率,一个是根据bdp计算得到的cwnd;

Pacing速率

Pacing处理

pacing速率的计算:

bbr的pacing处理有两种方式:

1)、依赖于tc-fq的pacing

当使用tc-fq时,qdisc默认会使能rate_enable限速,这个流程也会利用bbr算法计算得到的sk_pacing_rate完成pacing功能;

static struct sk_buff *fq_dequeue(struct Qdisc *sch)
{//rate_enable模式使能if (!q->rate_enable)goto out;/* Do not pace locally generated ack packets */if (skb_is_tcp_pure_ack(skb))goto out;rate = q->flow_max_rate;if (skb->sk)rate = min(skb->sk->sk_pacing_rate, rate);if (rate <= q->low_rate_threshold) {f->credit = 0;plen = qdisc_pkt_len(skb);} else {plen = max(qdisc_pkt_len(skb), q->quantum);if (f->credit > 0)goto out;}if (rate != ~0U) {u64 len = (u64)plen * NSEC_PER_SEC;if (likely(rate))do_div(len, rate);/* Since socket rate can change later,* clamp the delay to 1 second.* Really, providers of too big packets should be fixed !*/if (unlikely(len > NSEC_PER_SEC)) {len = NSEC_PER_SEC;q->stat_pkts_too_long++;}/* Account for schedule/timers drifts.* f->time_next_packet was set when prior packet was sent,* and current time (@now) can be too late by tens of us.*/if (f->time_next_packet)len -= min(len/2, now - f->time_next_packet);f->time_next_packet = now + len;}
out:qdisc_bstats_update(sch, skb);return skb;
}

2)、tcp主动pacing

bbr_init时,默认使能SK_PACING_NEEDED;

/* Excerpt of bbr_init(): only the pacing-status setup is shown. */
static void bbr_init(struct sock *sk)
{
	/* Request pacing: switch sk_pacing_status to SK_PACING_NEEDED, but
	 * only if it is currently SK_PACING_NONE.
	 */
	cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED);
}

tcp_write_xmit的时候,通过tcp_pacing_check判断当前是否已经启动pacing高精度定时器,如果已经启动,则退出xmit流程;

/* Excerpt of tcp_write_xmit(): the start of the transmit loop, up to the
 * pacing check. Everything after the check is elided ("...") in the
 * article, so the loop body and the function tail are not shown.
 */
static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
			   int push_one, gfp_t gfp)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	unsigned int tso_segs, sent_pkts;
	int cwnd_quota;
	int result;
	bool is_cwnd_limited = false, is_rwnd_limited = false;
	u32 max_segs;

	sent_pkts = 0;

	tcp_mstamp_refresh(tp);
	if (!push_one) {
		/* Do MTU probing. */
		result = tcp_mtu_probe(sk);
		if (!result) {
			return false;
		} else if (result > 0) {
			sent_pkts = 1;
		}
	}

	max_segs = tcp_tso_segs(sk, mss_now);
	while ((skb = tcp_send_head(sk))) {
		unsigned int limit;

		/* Stop sending while tcp_pacing_check() reports true; per
		 * the article this means the pacing hrtimer is already armed.
		 */
		if (tcp_pacing_check(sk))
			break;
		...
}

__tcp_transmit_skb判断是否需要tcp层做pacing,需要的话就启动高精度定时器;

/* Excerpt of __tcp_transmit_skb(): only the data-segment accounting is
 * shown; the code before and after it is elided ("...") in the article.
 */
static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
			      int clone_it, gfp_t gfp_mask, u32 rcv_nxt)
{
	...
	/* The skb is longer than the TCP header, i.e. it carries payload. */
	if (skb->len != tcp_header_size) {
		tcp_event_data_sent(tp, sk);
		tp->data_segs_out += tcp_skb_pcount(skb);
		tp->bytes_sent += skb->len - tcp_header_size;
		/* Arms the TCP-internal pacing timer when internal pacing
		 * is in use.
		 */
		tcp_internal_pacing(sk, skb);
	}
	...
}
static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb)
{u64 len_ns;u32 rate;if (!tcp_needs_internal_pacing(sk))return;rate = sk->sk_pacing_rate;if (!rate || rate == ~0U)return;/* Should account for header sizes as sch_fq does,* but lets make things simple.*///sk_pacing_rate表示1分钟能发送的字节数//skb->len / rate表示发送skb->len字节数需要的时间长度(长度是分钟)//len_ns = skb->len / rate * NSEC_PER_SEC即将时间换算成纳秒,然后启动pacing高精度定时器len_ns = (u64)skb->len * NSEC_PER_SEC;do_div(len_ns, rate);hrtimer_start(&tcp_sk(sk)->pacing_timer,ktime_add_ns(ktime_get(), len_ns),HRTIMER_MODE_ABS_PINNED_SOFT);sock_hold(sk);
}

拥塞窗口cwnd

/* Compute and apply the congestion window.
 *
 * @sk:    the socket
 * @rs:    rate sample for the ACK being processed
 * @acked: number of packets newly acknowledged by this ACK
 * @bw:    bandwidth estimate used to derive the target
 * @gain:  cwnd gain, in BBR_UNIT fixed point
 *
 * The result is written to tp->snd_cwnd, capped by snd_cwnd_clamp and, in
 * PROBE_RTT, by bbr_cwnd_min_target.
 */
static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs,
			 u32 acked, u32 bw, int gain)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bbr *bbr = inet_csk_ca(sk);
	u32 cwnd = tp->snd_cwnd, target_cwnd = 0;

	if (!acked)
		goto done;  /* no packet fully ACKed; just apply caps */

	/* Per the article: on first entering recovery the helper sets cwnd
	 * to in-flight + acked (packet conservation), and on leaving
	 * recovery/loss it restores the cwnd saved on entry. The helper's
	 * body is not shown here. When it returns true, cwnd is final.
	 */
	if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd))
		goto done;

	/* If we're below target cwnd, slow start cwnd toward target cwnd. */
	target_cwnd = bbr_bdp(sk, bw, gain);
	target_cwnd = bbr_quantization_budget(sk, target_cwnd);

	/* Growing cwnd by acked lets the sender emit about 2 * acked
	 * packets: acked packets left the network, and the window also
	 * grew by acked.
	 */
	if (bbr_full_bw_reached(sk))  /* only cut cwnd if we filled the pipe */
		cwnd = min(cwnd + acked, target_cwnd);
	else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND)
		cwnd = cwnd + acked;
	cwnd = max(cwnd, bbr_cwnd_min_target);

done:
	tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp);	/* apply global cap */
	if (bbr->mode == BBR_PROBE_RTT)  /* drain queue, refresh min_rtt */
		tp->snd_cwnd = min(tp->snd_cwnd, bbr_cwnd_min_target);
}

 


本文链接:https://www.ngui.cc/51cto/show-884764.html
Copyright © 2010-2022 ngui.cc 版权所有 |关于我们| 联系方式| 豫B2-20100000