0
点赞
收藏
分享

微信扫一扫

TSO-GSO reading

少_游 06-24 06:20 阅读 1

对 TCP,在网卡不支持 TSO 时,使用和不使用 GSO 的情形

TSO-GSO reading_数据 

TSO  :

TSO-GSO reading_首部_02

分析:IP层发包时,如果是GSO报文,会调用

ip_finish_output_gso

 来处理

/*
 * ip_finish_output - choose how an outgoing IP packet leaves the IP layer.
 * @sk:  socket passed through to the next output stage
 * @skb: packet to transmit
 *
 * Order of decisions:
 *   1. (NETFILTER + XFRM builds only) if the dst carries an xfrm object,
 *      mark the skb IPSKB_REROUTED and hand it back to dst_output_sk().
 *   2. GSO packets go to ip_finish_output_gso().
 *   3. Non-GSO packets longer than the dst MTU are fragmented with
 *      ip_fragment(), each fragment being sent via ip_finish_output2().
 *   4. Everything else is sent directly with ip_finish_output2().
 *
 * Returns whatever the selected output routine returns.
 */
static int ip_finish_output(struct sock *sk, struct sk_buff *skb)
{
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm) {
		IPCB(skb)->flags |= IPSKB_REROUTED;
		return dst_output_sk(sk, skb);
	}
#endif
	if (!skb_is_gso(skb)) {
		/* Non-GSO: fragment only when the packet exceeds the dst MTU. */
		if (skb->len > ip_skb_dst_mtu(skb))
			return ip_fragment(sk, skb, ip_finish_output2);

		return ip_finish_output2(sk, skb);
	}

	/* GSO packet: segmentation decisions are made in the GSO helper. */
	return ip_finish_output_gso(sk, skb);
}

 

 

/*
 * ip_finish_output_gso - IP-layer output for a GSO skb.
 * @sk:  socket passed through to the next output stage
 * @skb: GSO packet to transmit
 *
 * Fast path: if the skb does not carry IPSKB_FORWARDED, or its per-segment
 * network length fits the dst MTU, it is passed on unsegmented to
 * ip_finish_output2().
 *
 * Slow path: the skb is segmented in software with the GSO feature bits
 * masked out, and every resulting segment is pushed through ip_fragment().
 *
 * Returns 0 on success, -ENOMEM if segmentation yields an error or NULL,
 * otherwise the first non-zero error reported by ip_fragment().
 */
static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb)
{
	netdev_features_t features;
	struct sk_buff *seg;
	int first_err = 0;

	/* common case: locally created skb or seglen is <= mtu */
	if ((IPCB(skb)->flags & IPSKB_FORWARDED) == 0)
		return ip_finish_output2(sk, skb);
	if (skb_gso_network_seglen(skb) <= ip_skb_dst_mtu(skb))
		return ip_finish_output2(sk, skb);

	/* Slowpath -  GSO segment length is exceeding the dst MTU.
	 *
	 * This can happen in two cases:
	 * 1) TCP GRO packet, DF bit not set
	 * 2) skb arrived via virtio-net, we thus get TSO/GSO skbs directly
	 * from host network stack.
	 */
	features = netif_skb_features(skb);
	seg = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
	if (IS_ERR_OR_NULL(seg)) {
		kfree_skb(skb);
		return -ENOMEM;
	}

	/* The original skb has been replaced by the segment list. */
	consume_skb(skb);

	/* seg is non-NULL here, so this visits every segment at least once. */
	while (seg) {
		struct sk_buff *following = seg->next;
		int err;

		/* Detach the segment before handing it to ip_fragment(). */
		seg->next = NULL;
		err = ip_fragment(sk, seg, ip_finish_output2);

		/* Remember only the first failure but keep sending the rest. */
		if (err && first_err == 0)
			first_err = err;

		seg = following;
	}

	return first_err;
}

所以:由 ip_finish_output_gso 可知,正常情况下本地发出的GSO报文在IP层不需要进行GSO分段处理(直接走 ip_finish_output2);

实际上,本地发包的GSO处理都是延迟到网络设备发包时再进行;一般不会在IP层处理,而是在网络设备层处理---->硬件不支持时进行软件GSO

 

检测当前报文是GSO数据包,同时物理设备不支持此种GSO的分片聚合,或者当前报文已经不需要物理设备进行校验和,则直接到软件GSO逻辑处理

/*
 * skb_gso_ok - can @features handle this skb's GSO type as it stands?
 * @skb:      packet to check
 * @features: feature bits of the output path
 *
 * True when net_gso_ok() accepts the skb's gso_type for @features and,
 * if the skb has a frag_list, @features also includes NETIF_F_FRAGLIST.
 */
static inline bool skb_gso_ok(struct sk_buff *skb, netdev_features_t features)
{
	if (!net_gso_ok(features, skb_shinfo(skb)->gso_type))
		return false;

	/* No frag_list: nothing more to require from the device. */
	if (!skb_has_frag_list(skb))
		return true;

	return (features & NETIF_F_FRAGLIST) != 0;
}
/*
 * netif_needs_gso - does this skb have to be segmented in software?
 * @skb:      packet about to be transmitted
 * @features: feature bits of the output path
 *
 * Only GSO skbs (per skb_is_gso()) can need software GSO. For those, the
 * answer is yes when either
 *   - skb_gso_ok() says @features cannot take this GSO skb as is, or
 *   - ip_summed is neither CHECKSUM_PARTIAL nor CHECKSUM_UNNECESSARY
 *     (the original author's note: the checksum is then done in software).
 */
static inline bool netif_needs_gso(struct sk_buff *skb,
				   netdev_features_t features)
{
	if (!skb_is_gso(skb))
		return false;

	if (!skb_gso_ok(skb, features))
		return true;

	return unlikely(skb->ip_summed != CHECKSUM_PARTIAL &&
			skb->ip_summed != CHECKSUM_UNNECESSARY);
}

/*
 * validate_xmit_skb - prepare one skb for the driver.
 * @skb: packet to validate
 * @dev: device whose tx_dropped counter is bumped on failure
 *
 * After VLAN validation, exactly one of two preparations is applied:
 *   - software GSO, when netif_needs_gso() says so; on success the original
 *     skb is consumed and the segment list is returned in its place
 *     (a NULL result from skb_gso_segment() leaves the skb untouched);
 *   - otherwise linearization (when skb_needs_linearize() asks for it) and,
 *     for CHECKSUM_PARTIAL skbs, a software checksum when @features has no
 *     bit of NETIF_F_CSUM_MASK set.
 *
 * Returns the skb (or segment list) to transmit, or NULL after dropping the
 * packet and incrementing dev->tx_dropped.
 */
static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
{
	netdev_features_t features = netif_skb_features(skb);
	struct sk_buff *segs;

	skb = validate_xmit_vlan(skb, features);
	if (unlikely(!skb))
		goto out_null;

	if (!netif_needs_gso(skb, features)) {
		/*
		 * Linearization failure (e.g. not enough memory for the
		 * merged buffer, per the original author's note) drops the
		 * packet.
		 */
		if (skb_needs_linearize(skb, features) &&
		    __skb_linearize(skb))
			goto out_kfree_skb;

		/* If packet is not checksummed and device does not
		 * support checksumming for this protocol, complete
		 * checksumming here.
		 */
		if (skb->ip_summed == CHECKSUM_PARTIAL) {
			if (skb->encapsulation)
				skb_set_inner_transport_header(skb,
					skb_checksum_start_offset(skb));
			else
				skb_set_transport_header(skb,
					skb_checksum_start_offset(skb));

			if (!(features & NETIF_F_CSUM_MASK) &&
			    skb_checksum_help(skb))
				goto out_kfree_skb;
		}
	} else {
		/* Software GSO: replace the skb by its segment list. */
		segs = skb_gso_segment(skb, features);
		if (IS_ERR(segs))
			goto out_kfree_skb;

		if (segs) {
			consume_skb(skb);
			skb = segs;
		}
	}

	return skb;

out_kfree_skb:
	kfree_skb(skb);
out_null:
	atomic_long_inc(&dev->tx_dropped);
	return NULL;
}

/*
 * sch_direct_xmit - EXCERPT ONLY.
 * The article elides parts of this function (the dashed lines below) and the
 * listing stops before the function body is closed. What is visible:
 * optional validation of the skb outside the TX lock, then, if an skb
 * remains, HARD_TX_LOCK() followed by dev_hard_start_xmit().
 */
int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
struct net_device *dev, struct netdev_queue *txq,
spinlock_t *root_lock, bool validate)
{
int ret = NETDEV_TX_BUSY;
---------------------------------------------

/* Note that we validate skb (GSO, checksum, ...) outside of locks */
if (validate)// validate the skb list: GSO segmentation, checksum computation
skb = validate_xmit_skb_list(skb, dev);

if (likely(skb)) {
HARD_TX_LOCK(dev, txq, smp_processor_id());
/* Author's note (the check itself is elided from this excerpt): if txq is
 * stopped, i.e. QUEUE_STATE_ANY_XOFF_OR_FROZEN is set, ret stays
 * NETDEV_TX_BUSY; if txq is running, dev_hard_start_xmit is called to send
 * the packet. */
skb = dev_hard_start_xmit(skb, dev, txq, &ret);// pass the skb on for transmission by the driver

-----------------------------------

}

 

 

 TSO-GSO reading_首部_03

 

 

 

 

看下 gso 的处理方式:入口函数skb_gso_segment

这个函数将skb分片,并返回一个skb list。如果skb不需要分片则返回NULL。

 

/**
 * __skb_gso_segment - Perform segmentation on skb.
 * @skb: buffer to segment
 * @features: features for the output path (see dev->features)
 * @tx_path: whether it is called in TX path
 *
 * This function segments the given skb and returns a list of segments.
 *
 * It may return NULL if the skb requires no segmentation.  This is
 * only possible when GSO is used for verifying header integrity.
 *
 * Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
 *
 * An ERR_PTR() is returned when skb_cow_head() fails or when the
 * MAC-layer segmentation handler reports an error.
 */
struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
				  netdev_features_t features, bool tx_path)
{
	struct sk_buff *result;

	if (unlikely(skb_needs_check(skb, tx_path))) {
		/* We're going to init ->check field in TCP or UDP header,
		 * so the header must be writable: skb_cow_head() is asked
		 * for a private head (with 0 extra headroom) and its error
		 * is passed back to the caller.
		 */
		int rc = skb_cow_head(skb, 0);

		if (rc < 0)
			return ERR_PTR(rc);
	}

	/* Only report GSO partial support if it will enable us to
	 * support segmentation on this frame without needing additional
	 * work.
	 */
	if (features & NETIF_F_GSO_PARTIAL) {
		struct net_device *dev = skb->dev;
		netdev_features_t candidate = NETIF_F_GSO_ROBUST |
			(dev->features & dev->gso_partial_features);

		if (!skb_gso_ok(skb, features | candidate))
			features &= ~NETIF_F_GSO_PARTIAL;
	}

	BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
		     sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));

	/* Record the current headroom as mac_offset and start at
	 * encapsulation level 0 (outermost headers).
	 */
	SKB_GSO_CB(skb)->encap_level = 0;
	SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);

	skb_reset_mac_header(skb);
	skb_reset_mac_len(skb);

	result = skb_mac_gso_segment(skb, features);

	/* Segmentation succeeded although the skb still "needs check". */
	if (unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(result)))
		skb_warn_bad_offload(skb);

	return result;
}

 

/**
 * skb_mac_gso_segment - mac layer segmentation handler.
 * @skb: buffer to segment
 * @features: features for the output path (see dev->features)
 *
 * Looks up the packet_offload registered for the skb's network protocol
 * and calls its gso_segment callback with skb->data positioned just past
 * the MAC/VLAN headers. skb->data is moved back to the MAC header before
 * returning.
 *
 * Returns the callback's result, ERR_PTR(-EINVAL) when no protocol can be
 * determined, or ERR_PTR(-EPROTONOSUPPORT) when no matching handler exists.
 */
struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
				    netdev_features_t features)
{
	struct sk_buff *result = ERR_PTR(-EPROTONOSUPPORT);
	struct packet_offload *po;
	int vlan_depth = skb->mac_len;	/* set up by the caller's skb_reset_mac_len() */
	__be16 type = skb_network_protocol(skb, &vlan_depth);

	if (unlikely(!type))
		return ERR_PTR(-EINVAL);

	/* Step over the MAC (and any VLAN) headers. */
	__skb_pull(skb, vlan_depth);

	rcu_read_lock();
	list_for_each_entry_rcu(po, &offload_base, list) {
		if (po->type != type || !po->callbacks.gso_segment)
			continue;

		/* First matching handler wins. */
		result = po->callbacks.gso_segment(skb, features);
		break;
	}
	rcu_read_unlock();

	/* Restore skb->data to the MAC header. */
	__skb_push(skb, skb->data - skb_mac_header(skb));

	return result;
}

IP层对GSO的支持  

需要做gso分段,则先进入ip层的分段处理,在ip层分段处理函数里,主要工作是调用tcp层的分段处理函数,等tcp层分段完成后,重新对分段的skb的ip头做checksum

/*
 * inet_gso_segment - IPv4-level GSO handler.
 * Validates the IPv4 header, pulls it, delegates the real segmentation to
 * the L4 protocol's gso_segment callback found in inet_offloads[], then
 * rewrites id / frag_off / tot_len and the header checksum in the IP header
 * of every segment returned.
 * Returns the segment list, NULL, or an ERR_PTR() (-EINVAL on validation
 * failure, -EPROTONOSUPPORT when no L4 callback exists, or the callback's
 * own error).
 */
static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
netdev_features_t features)
{
struct sk_buff *segs = ERR_PTR(-EINVAL);
const struct net_offload *ops;
unsigned int offset = 0;
bool udpfrag, encap;
struct iphdr *iph;
int proto;
int nhoff;
int ihl;
int id;
// Reject the skb if its gso_type has any bit outside the set accepted here.
if (unlikely(skb_shinfo(skb)->gso_type &
~(SKB_GSO_TCPV4 |
SKB_GSO_UDP |
SKB_GSO_DODGY |
SKB_GSO_TCP_ECN |
SKB_GSO_GRE |
SKB_GSO_GRE_CSUM |
SKB_GSO_IPIP |
SKB_GSO_SIT |
SKB_GSO_TCPV6 |
SKB_GSO_UDP_TUNNEL |
SKB_GSO_UDP_TUNNEL_CSUM |
SKB_GSO_TUNNEL_REMCSUM |
0)))
goto out;

skb_reset_network_header(skb);
nhoff = skb_network_header(skb) - skb_mac_header(skb); // offset of the IP header relative to the MAC header
if (unlikely(!pskb_may_pull(skb, sizeof(*iph)))) // need at least a basic IP header's worth of data
goto out;

iph = ip_hdr(skb);
// Validate the header-length field.
ihl = iph->ihl * 4; // IP header length in bytes; the L4 header starts right after it
if (ihl < sizeof(*iph))
goto out;

id = ntohs(iph->id);// IP id field, host byte order
proto = iph->protocol; // L4 protocol number; indexes inet_offloads[] below

/* Warning: after this point, iph might be no longer valid */
// Re-check using the real header length read from the header itself.
if (unlikely(!pskb_may_pull(skb, ihl))) // make sure ihl bytes can be pulled
goto out;
__skb_pull(skb, ihl); // advance data to the transport header

encap = SKB_GSO_CB(skb)->encap_level > 0;
if (encap)
features &= skb->dev->hw_enc_features; // encapsulated: intersect features with hw_enc_features
SKB_GSO_CB(skb)->encap_level += ihl; // a non-zero encap_level marks later (inner) passes

skb_reset_transport_header(skb); // transport header = current data

segs = ERR_PTR(-EPROTONOSUPPORT);

if (skb->encapsulation &&
skb_shinfo(skb)->gso_type & (SKB_GSO_SIT|SKB_GSO_IPIP))
udpfrag = proto == IPPROTO_UDP && encap;
else
udpfrag = proto == IPPROTO_UDP && !skb->encapsulation; // author's note: VXLAN-encapsulated packets take this branch with udpfrag == false

ops = rcu_dereference(inet_offloads[proto]);// look up the upper-layer (L4) offload ops
if (likely(ops && ops->callbacks.gso_segment))
segs = ops->callbacks.gso_segment(skb, features); // L4 segmentation callback (e.g. UDP or TCP)

if (IS_ERR_OR_NULL(segs))
goto out;

skb = segs;// walk the resulting segments
do {
iph = (struct iphdr *)(skb_mac_header(skb) + nhoff); // this segment's IP header: its MAC header plus nhoff
if (udpfrag) { // segments are emitted as IP fragments sharing one id
iph->id = htons(id);
iph->frag_off = htons(offset >> 3); // fragment offset, in 8-byte units
if (skb->next)
iph->frag_off |= htons(IP_MF); // more segments follow: set More Fragments
offset += skb->len - nhoff - ihl; // offset for the next segment
} else {
iph->id = htons(id++); // each segment is a complete IP packet with its own, incrementing id
}
iph->tot_len = htons(skb->len - nhoff);
ip_send_check(iph); // recompute the IP header checksum
if (encap) // encap set: this pass handles an inner header, so record the inner header offsets
skb_reset_inner_headers(skb);
skb->network_header = (u8 *)iph - skb->head; // set network_header
} while ((skb = skb->next));

out:
return segs;
}

 TCP层对GSO的支持 

  UDP经过GSO分片后每个分片的IP头部id是一样的,这个符合IP分片的逻辑,但是为什么TCP的GSO分片,IP头部的id会依次加1呢?原因是: tcp建立三次握手的过程中产生合适的mss,这个mss肯定是<=网络层的最大路径MTU,然后tcp数据封装成ip数据包通过网络层发送,当服务器端传输层接收到tcp数据之后进行tcp重组。所以正常情况下tcp产生的ip数据包在传输过程中是不会发生分片的!由于GSO应该保证对外透明,所以其效果应该也和在TCP层直接分片的效果是一样的,所以这里对UDP的处理是IP分片逻辑,但对TCP的处理是构造新的skb逻辑。

- 对于GSO

    UDP:所有分片ip头部id都相同,设置IP_MF分片标志(除最后一片),等同于IP分片

    TCP:分片后,每个分片IP头部中id依次加1(等同于TCP分段)

/*
 * tcp4_gso_segment - TCP-over-IPv4 entry point for GSO segmentation.
 * @skb:      packet positioned at the TCP header
 * @features: feature bits of the output path
 *
 * Makes sure a full struct tcphdr is available, brings the skb into the
 * CHECKSUM_PARTIAL state if it is not already there, then delegates to
 * tcp_gso_segment().
 *
 * Returns ERR_PTR(-EINVAL) when the TCP header cannot be pulled, otherwise
 * the result of tcp_gso_segment().
 */
static struct sk_buff *tcp4_gso_segment(struct sk_buff *skb,
					netdev_features_t features)
{
	const struct iphdr *ip4h;
	struct tcphdr *tcph;

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		return ERR_PTR(-EINVAL);

	if (likely(skb->ip_summed == CHECKSUM_PARTIAL))
		return tcp_gso_segment(skb, features);

	/* Set up checksum pseudo header, usually expect stack to
	 * have done this already.
	 */
	ip4h = ip_hdr(skb);
	tcph = tcp_hdr(skb);

	tcph->check = 0;
	skb->ip_summed = CHECKSUM_PARTIAL;
	__tcp_v4_send_check(skb, ip4h->saddr, ip4h->daddr);

	return tcp_gso_segment(skb, features);
}
/*
 * tcp_gso_segment - split a TCP GSO skb into mss-sized segments.
 * Validates the TCP header length, pulls the header, and then either
 *   - only recomputes gso_segs and returns NULL when the output path can
 *     take the skb as is (skb_gso_ok() with NETIF_F_GSO_ROBUST added), or
 *   - calls skb_segment() and fixes up seq, flags and checksum in the TCP
 *     header of every segment.
 * Returns the segment list, NULL, or an ERR_PTR() (-EINVAL on validation
 * failure or the error from skb_segment()).
 */
struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
netdev_features_t features)
{
struct sk_buff *segs = ERR_PTR(-EINVAL);
unsigned int sum_truesize = 0;
struct tcphdr *th;
unsigned int thlen;
unsigned int seq;
__be32 delta;
unsigned int oldlen;
unsigned int mss;
struct sk_buff *gso_skb = skb;
__sum16 newcheck;
bool ooo_okay, copy_destructor;

th = tcp_hdr(skb);
thlen = th->doff * 4; // TCP header length in bytes
if (thlen < sizeof(*th))
goto out;

if (!pskb_may_pull(skb, thlen)) // re-check with the real header length from the header itself
goto out;
// oldlen keeps the 16-bit complement of skb->len as it is before the TCP header is pulled; it feeds the checksum deltas below
oldlen = (u16)~skb->len;
__skb_pull(skb, thlen); // advance data past the TCP header, to the payload

mss = tcp_skb_mss(skb); // mss for this skb
if (unlikely(skb->len <= mss))
goto out;

if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
/* Packet is from an untrusted source, reset gso_segs. */
int type = skb_shinfo(skb)->gso_type;

if (unlikely(type &
~(SKB_GSO_TCPV4 |
SKB_GSO_DODGY |
SKB_GSO_TCP_ECN |
SKB_GSO_TCPV6 |
SKB_GSO_GRE |
SKB_GSO_GRE_CSUM |
SKB_GSO_IPIP |
SKB_GSO_SIT |
SKB_GSO_UDP_TUNNEL |
SKB_GSO_UDP_TUNNEL_CSUM |
SKB_GSO_TUNNEL_REMCSUM |
0) ||
!(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))))
goto out; // gso_type has unexpected bits, or is neither TCPv4 nor TCPv6

skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss); // recompute gso_segs from payload length and mss, then return

segs = NULL;
goto out;
}

copy_destructor = gso_skb->destructor == tcp_wfree;
ooo_okay = gso_skb->ooo_okay;
/* All segments but the first should have ooo_okay cleared */
skb->ooo_okay = 0;

segs = skb_segment(skb, features); // split the payload into mss-sized segments
if (IS_ERR(segs))
goto out;

/* Only first segment might have ooo_okay set */
segs->ooo_okay = ooo_okay;

delta = htonl(oldlen + (thlen + mss)); // per-segment length (thlen + mss) plus oldlen, the complemented original length

skb = segs;
th = tcp_hdr(skb); // TCP header of the first segment produced by skb_segment()
seq = ntohl(th->seq);

if (unlikely(skb_shinfo(gso_skb)->tx_flags & SKBTX_SW_TSTAMP))
tcp_gso_tstamp(segs, skb_shinfo(gso_skb)->tskey, seq, mss);

newcheck = ~csum_fold((__force __wsum)((__force u32)th->check + // derived from the original check value adjusted by delta
(__force u32)delta));

do { // fix up the TCP header of every segment except the last
th->fin = th->psh = 0;
th->check = newcheck;
// per-segment checksum
if (skb->ip_summed != CHECKSUM_PARTIAL)
th->check = gso_make_checksum(skb, ~th->check); // recompute check for this segment

seq += mss; // sequence number of the next segment
if (copy_destructor) {
skb->destructor = gso_skb->destructor;
skb->sk = gso_skb->sk;
sum_truesize += skb->truesize;
}
skb = skb->next;
th = tcp_hdr(skb);

th->seq = htonl(seq);
th->cwr = 0;
} while (skb->next);

/* Following permits TCP Small Queues to work well with GSO :
* The callback to TCP stack will be called at the time last frag
* is freed at TX completion, and not right now when gso_skb
* is freed by GSO engine
*/
if (copy_destructor) {
swap(gso_skb->sk, skb->sk);
swap(gso_skb->destructor, skb->destructor);
sum_truesize += skb->truesize;
atomic_add(sum_truesize - gso_skb->truesize,
&skb->sk->sk_wmem_alloc);
}

delta = htonl(oldlen + (skb_tail_pointer(skb) -
skb_transport_header(skb)) + // the last segment gets its own delta, computed from its actual length
skb->data_len);
th->check = ~csum_fold((__force __wsum)((__force u32)th->check +
(__force u32)delta));
if (skb->ip_summed != CHECKSUM_PARTIAL)
th->check = gso_make_checksum(skb, ~th->check); // recompute check for the last segment
out:
return segs;
}

 

skb_segment是实现封装报文GSO分段的基础

 

/**
* skb_segment - Perform protocol segmentation on skb.
* @head_skb: buffer to segment
* @features: features for the output path (see dev->features)
*
* This function performs segmentation on the given skb. It returns
* a pointer to the first in a list of new skbs for the segments.
* In case of error it returns ERR_PTR(err).
*
* Each new skb receives a copy of the headers in front of head_skb->data
* plus up to gso_size bytes of payload taken, in order, from head_skb's
* linear area, its frags[] array and its frag_list.
*/
struct sk_buff *skb_segment(struct sk_buff *head_skb,
netdev_features_t features)
{
struct sk_buff *segs = NULL;
struct sk_buff *tail = NULL;
struct sk_buff *list_skb = skb_shinfo(head_skb)->frag_list;
skb_frag_t *frag = skb_shinfo(head_skb)->frags;
unsigned int mss = skb_shinfo(head_skb)->gso_size;
// Author's note: doffset covers MAC + IP + TCP headers, or MAC + IP when the caller did not pull the L4 header (the UDP case)
unsigned int doffset = head_skb->data - skb_mac_header(head_skb); // bytes between the MAC header and the current data pointer
struct sk_buff *frag_skb = head_skb;
unsigned int offset = doffset;
unsigned int tnl_hlen = skb_tnl_header_len(head_skb); // tunnel (outer) header length; per the author 0 for non-encapsulated packets
unsigned int headroom;
unsigned int len;
__be16 proto;
bool csum;
int sg = !!(features & NETIF_F_SG); // 1 when the output path has NETIF_F_SG
int nfrags = skb_shinfo(head_skb)->nr_frags;
int err = -ENOMEM;
int i = 0;
int pos;
int dummy;

__skb_push(head_skb, doffset); // move data back to the MAC header
proto = skb_network_protocol(head_skb, &dummy); // network protocol of the packet
if (unlikely(!proto))
return ERR_PTR(-EINVAL);

csum = !head_skb->encap_hdr_csum &&
!!can_checksum_protocol(features, proto);

headroom = skb_headroom(head_skb); // headroom of the original skb
pos = skb_headlen(head_skb); // length of the linear area

do {
struct sk_buff *nskb;
skb_frag_t *nskb_frag;
int hsize;
int size;
/* offset is the number of bytes already consumed; len is what remains of
head_skb->len beyond offset. Initially offset is just the header length
(doffset) and len is the whole payload. offset grows by len (at most mss)
per segment, so after clamping len is the payload size of the segment
being built (the last segment may carry less than mss).
*/
len = head_skb->len - offset; // bytes still to be emitted, headers excluded
if (len > mss) // len becomes the payload size of this segment
len = mss; // never more than mss per segment
// hsize is the part of the linear area lying beyond offset. A negative value means the
// payload for this segment lives in frags[] or frag_list; as offset grows this eventually
// happens unless the skb is fully linear.
hsize = skb_headlen(head_skb) - offset; // linear-area bytes available for this segment
if (hsize < 0)
hsize = 0;// no payload for this segment is left in the linear area
if (hsize > len || !sg)
hsize = len;// without NETIF_F_SG, or with enough linear data, the whole segment goes into the linear area
// If frags[] is exhausted (i >= nfrags) and nothing comes from the linear area,
// continue with the next skb on frag_list, provided its linear part is non-empty
// and either equals len or scatter-gather is available.
if (!hsize && i >= nfrags && skb_headlen(list_skb) &&
(skb_headlen(list_skb) == len || sg)) {
BUG_ON(skb_headlen(list_skb) > len); // the list skb's linear area must not exceed len

i = 0;
nfrags = skb_shinfo(list_skb)->nr_frags;
frag = skb_shinfo(list_skb)->frags;
frag_skb = list_skb;
pos += skb_headlen(list_skb); // account for the list skb's linear area

while (pos < offset + len) { // step over whole frags that fit inside this segment
BUG_ON(i >= nfrags);

size = skb_frag_size(frag);
if (pos + size > offset + len)
break;

i++;
pos += size; // account for this frag
frag++;
}
// frag_list data is not copied: cloning the skb descriptor reuses its data area
nskb = skb_clone(list_skb, GFP_ATOMIC); // clone of the list skb; trimmed to len below
list_skb = list_skb->next;

if (unlikely(!nskb))
goto err;

if (unlikely(pskb_trim(nskb, len))) { // trim the clone to len bytes
kfree_skb(nskb);
goto err;
}

hsize = skb_end_offset(nskb); // end offset before skb_cow_head(), used for the truesize adjustment below
if (skb_cow_head(nskb, doffset + headroom)) { // make sure there is room for doffset + headroom bytes in front
kfree_skb(nskb);
goto err;
}
// adjust truesize by however much the end offset changed
nskb->truesize += skb_end_offset(nskb) - hsize; // refresh truesize
skb_release_head_state(nskb);
__skb_push(nskb, doffset); // expose doffset bytes for the headers
} else {
// len bytes are needed for this segment, hsize of them in the linear area
nskb = __alloc_skb(hsize + doffset + headroom, // frags not exhausted: build the segment in a freshly allocated skb
GFP_ATOMIC, skb_alloc_rx_flag(head_skb),
NUMA_NO_NODE);

if (unlikely(!nskb))
goto err;

skb_reserve(nskb, headroom); // reserve the same headroom as the original
__skb_put(nskb, doffset); // extend the linear area by the header length
}

if (segs)
tail->next = nskb;
else
segs = nskb;
tail = nskb;

__copy_skb_header(nskb, head_skb); // copy skb metadata from head_skb

skb_headers_offset_update(nskb, skb_headroom(nskb) - headroom); // shift header offsets by the headroom difference
skb_reset_mac_len(nskb); // reset mac_len
// copy the headers (doffset bytes, plus tnl_hlen bytes in front of them) into nskb
skb_copy_from_linear_data_offset(head_skb, -tnl_hlen, // includes the outer headers when encapsulated
nskb->data - tnl_hlen,
doffset + tnl_hlen);

if (nskb->len == len + doffset) // payload already present (the clone path above); the copy paths are not there yet
goto perform_csum_check;

if (!sg && !nskb->remcsum_offload) {// no NETIF_F_SG: copy the payload into the linear area, checksumming while copying
nskb->ip_summed = CHECKSUM_NONE;
nskb->csum = skb_copy_and_csum_bits(head_skb, offset, // copy and checksum in one pass
skb_put(nskb, len),
len, 0);
SKB_GSO_CB(nskb)->csum_start =
skb_headroom(nskb) + doffset;
continue;
}

nskb_frag = skb_shinfo(nskb)->frags;
// copy hsize bytes (possibly 0) into nskb's linear area
skb_copy_from_linear_data_offset(head_skb, offset, // linear-area payload
skb_put(nskb, hsize), hsize);

skb_shinfo(nskb)->tx_flags = skb_shinfo(head_skb)->tx_flags &
SKBTX_SHARED_FRAG;

// len bytes are needed; hsize came from the linear area, the remaining (len - hsize) bytes are taken from frags
while (pos < offset + len) {
if (i >= nfrags) {
BUG_ON(skb_headlen(list_skb));

i = 0;
nfrags = skb_shinfo(list_skb)->nr_frags;
frag = skb_shinfo(list_skb)->frags;
frag_skb = list_skb;

BUG_ON(!nfrags);

list_skb = list_skb->next; // frag_list case: move on to the next list skb
}

if (unlikely(skb_shinfo(nskb)->nr_frags >=
MAX_SKB_FRAGS)) {
net_warn_ratelimited(
"skb_segment: too many frags: %u %u\n",
pos, mss);
goto err;
}

if (unlikely(skb_orphan_frags(frag_skb, GFP_ATOMIC)))
goto err;

*nskb_frag = *frag; // share the frag descriptor; frags of frag_list skbs go through this same loop
__skb_frag_ref(nskb_frag);
size = skb_frag_size(nskb_frag);

if (pos < offset) {// pos starts as the linear length and tracks consumed bytes; here the frag begins before this segment
nskb_frag->page_offset += offset - pos;
skb_frag_size_sub(nskb_frag, offset - pos); // cut off the front part already consumed
}

skb_shinfo(nskb)->nr_frags++;

if (pos + size <= offset + len) {
i++;
frag++;
pos += size;
} else {

skb_frag_size_sub(nskb_frag, pos + size - (offset + len)); // cut off the tail that belongs to the next segment
goto skip_fraglist;
}

nskb_frag++;
}

skip_fraglist:
nskb->data_len = len - hsize;
nskb->len += nskb->data_len;
nskb->truesize += nskb->data_len;

perform_csum_check:
if (!csum && !nskb->remcsum_offload) {
nskb->csum = skb_checksum(nskb, doffset,
nskb->len - doffset, 0); // checksum over the payload
nskb->ip_summed = CHECKSUM_NONE;
SKB_GSO_CB(nskb)->csum_start =
skb_headroom(nskb) + doffset;
}
} while ((offset += len) < head_skb->len);

/* Some callers want to get the end of the list.
* Put it in segs->prev to avoid walking the list.
* (see validate_xmit_skb_list() for example)
*/
segs->prev = tail;

/* Following permits correct backpressure, for protocols
* using skb_set_owner_w().
* Idea is to tranfert ownership from head_skb to last segment.
*/
if (head_skb->destructor == sock_wfree) {
swap(tail->truesize, head_skb->truesize);
swap(tail->destructor, head_skb->destructor);
swap(tail->sk, head_skb->sk);
}
return segs;

err:
kfree_skb_list(segs);
return ERR_PTR(err);
}

 

输出报文 分片:

/*
 * ip_do_fragment - split an IP datagram into fragments and send each one.
 * @net:    network namespace, used for statistics and passed to @output
 * @sk:     socket passed to @output (may be used for the MTU lookup)
 * @skb:    datagram to fragment; consumed or freed on every path
 * @output: callback that transmits one fragment
 *
 * Fast path: when the skb already carries a frag_list whose geometry fits
 * (sizes within the MTU, 8-byte aligned except for the last, enough
 * headroom, nothing shared or cloned), the list members are turned into
 * fragments in place by prepending a copy of the IP header.
 *
 * Slow path: otherwise a new skb is allocated per fragment and header and
 * data are copied into it.
 *
 * Returns 0 on success or the first error encountered (from
 * skb_checksum_help(), the allocator (-ENOMEM) or @output).
 */
int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		   int (*output)(struct net *, struct sock *, struct sk_buff *))
{
	struct iphdr *iph;
	int ptr;
	struct sk_buff *skb2;
	unsigned int mtu, hlen, left, len, ll_rs;
	int offset;
	__be16 not_last_frag;
	struct rtable *rt = skb_rtable(skb);
	int err = 0;

	/* for offloaded checksums cleanup checksum before fragmentation */
	/* CHECKSUM_PARTIAL skbs get their checksum resolved in software first */
	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto fail;

	/*
	 *	Point into the IP datagram header.
	 */

	iph = ip_hdr(skb);

	/* MTU of the destination */
	mtu = ip_skb_dst_mtu(sk, skb);

	/* A recorded maximum fragment size below the MTU takes precedence */
	if (IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size < mtu)
		mtu = IPCB(skb)->frag_max_size;

	/*
	 *	Setup starting values.
	 */

	hlen = iph->ihl * 4;
	mtu = mtu - hlen;	/* Size of data space */
	IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;

	/* When frag_list is given, use it. First, check its validity:
	 * some transformers could create wrong frag_list or break existing
	 * one, it is not prohibited. In this case fall back to copying.
	 *
	 * LATER: this step can be merged to real generation of fragments,
	 * we can switch to copy when see the first bad fragment.
	 */
	if (skb_has_frag_list(skb)) {
		struct sk_buff *frag, *frag2;

		/* Length of the head skb without its frag_list members */
		unsigned int first_len = skb_pagelen(skb);

		/* Any of these conditions forces the slow path */
		if (first_len - hlen > mtu ||		/* first piece exceeds the MTU */
		    ((first_len - hlen) & 7) ||		/* not 8-byte aligned */
		    ip_is_fragment(iph) ||		/* already a fragment */
		    skb_cloned(skb))			/* head is cloned */
			goto slow_path;

		/* Check every member of the frag_list */
		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||				/* exceeds the MTU */
			    ((frag->len & 7) && frag->next) ||		/* unaligned and not the last */
			    skb_headroom(frag) < hlen)			/* no room for the IP header */
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);

			/* Attach the fragment to the owning socket */
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}

			/* The head no longer accounts for this member's memory */
			skb->truesize -= frag->truesize;
		}

		/* Everything is OK. Generate! */

		/* Turn the head skb into the first fragment */
		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		iph->tot_len = htons(first_len);
		iph->frag_off = htons(IP_MF);
		ip_send_check(iph);

		/* Prepare and send the fragments one by one */
		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				/* Copy the IP header in front of the data and fix it up */
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), iph, hlen);
				iph = ip_hdr(frag);
				iph->tot_len = htons(frag->len);
				ip_copy_metadata(frag, skb);
				if (offset == 0)
					ip_options_fragment(frag);
				offset += skb->len - hlen;
				iph->frag_off = htons(offset>>3);
				if (frag->next)
					iph->frag_off |= htons(IP_MF);
				/* Ready, complete checksum */
				ip_send_check(iph);
			}

			/* Hand the current fragment to the output callback */
			err = output(net, sk, skb);

			if (!err)
				IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES);
			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		if (err == 0) {
			IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		/* Failure: free the fragments that were not sent */
		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}
		IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
		return err;

slow_path_clean:
		/* Undo the socket/truesize changes made before the bad member */
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	/* Slow path: copy into newly allocated skbs */

	iph = ip_hdr(skb);

	/* Payload bytes still to be sent */
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/* Link-layer header space to reserve */
	ll_rs = LL_RESERVED_SPACE(rt->dst.dev);

	/*
	 *	Fragment the datagram.
	 */

	/* Starting offset and MF state taken from the original header */
	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
	not_last_frag = iph->frag_off & htons(IP_MF);

	/*
	 *	Keep copying data until we run out.
	 */

	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left)	{
			len &= ~7;
		}

		/* Allocate buffer */
		skb2 = alloc_skb(len + hlen + ll_rs, GFP_ATOMIC);
		if (!skb2) {
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip_copy_metadata(skb2, skb);

		/* Reserve link-layer space and set the header offsets */
		skb_reserve(skb2, ll_rs);
		skb_put(skb2, len + hlen);
		skb_reset_network_header(skb2);
		skb2->transport_header = skb2->network_header + hlen;

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */

		skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
			BUG();
		left -= len;

		/*
		 *	Fill in the new header fields.
		 */
		iph = ip_hdr(skb2);

		/* Fragment offset, in 8-byte units */
		iph->frag_off = htons((offset >> 3));

		/* Skbs flagged IPSKB_FRAG_PMTU keep DF set on every fragment */
		if (IPCB(skb)->flags & IPSKB_FRAG_PMTU)
			iph->frag_off |= htons(IP_DF);

		/* ANK: dirty, but effective trick. Upgrade options only if
		 * the segment to be fragmented was THE FIRST (otherwise,
		 * options are already fixed) and make it ONCE
		 * on the initial skb, so that all the following fragments
		 * will inherit fixed options.
		 */
		if (offset == 0)
			ip_options_fragment(skb);

		/*
		 *	Added AC : If we are fragmenting a fragment that's not the
		 *		   last fragment then keep MF on each bit
		 */
		if (left > 0 || not_last_frag)
			iph->frag_off |= htons(IP_MF);

		/* Advance the read position and the fragment offset */
		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		iph->tot_len = htons(len + hlen);

		ip_send_check(iph);

		/* Send the fragment */
		err = output(net, sk, skb2);
		if (err)
			goto fail;

		IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES);
	}

	/* All fragments sent: release the original skb */
	consume_skb(skb);
	IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS);
	return err;

fail:
	/* Failure: drop the original skb */
	kfree_skb(skb);
	IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
	return err;
}

 

http代理服务器(3-4-7层代理)-网络事件库公共组件、内核kernel驱动 摄像头驱动 tcpip网络协议栈、netfilter、bridge 好像看过!!!! 但行好事 莫问前程 --身高体重180的胖子

举报

相关推荐

0 条评论