# Linux E 100 网卡与 TCP 层原理三

# ip_rcv 函数

前面我们看到，网卡接收到的数据包 sk_buff *skb 会在 list_head ptype_base[16] 哈希表中找到已注册的 packet_type，随后调用该结构中设置的回调函数 ip_rcv。至此，数据包从数据链路层正式进入网络层处理。该函数的处理较为简单：对原始 skb 数据包（包含 IP 头部和数据）进行检测，如果一切正常，便通过 NF_HOOK 宏将该数据包交给后续流程处理。

/*
 * ip_rcv - entry point for IPv4 packets coming up from the link layer.
 *
 * Validates the IP header carried by skb and, if every check passes, hands
 * the packet to the NF_IP_PRE_ROUTING netfilter hook with ip_rcv_finish as
 * the continuation.
 *
 * Returns the value produced by NF_HOOK on the success path, or NET_RX_DROP
 * when the packet is rejected. On the reject paths that reach "drop" the skb
 * is freed here; the "out" path is taken only when skb is already NULL.
 */
int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
{
    struct iphdr *iph;

    // Packets whose type says they are addressed to another host are dropped outright
    if (skb->pkt_type == PACKET_OTHERHOST)
        goto drop;

    IP_INC_STATS_BH(IpInReceives); // bump the "IP datagrams received" counter

    if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) { // if the skb is shared, continue on a private clone (the old skb loses one reference); NULL means no usable skb was obtained
        IP_INC_STATS_BH(IpInDiscards);
        goto out;
    }

    if (!pskb_may_pull(skb, sizeof(struct iphdr))) // need at least a basic IP header's worth of bytes (presumably made directly addressable by the pull -- confirm against pskb_may_pull)
        goto inhdr_error;
    iph = skb->nh.iph; // start of the IP header

    /*
     *  RFC 1122 (cited as 3.1.2.2 in the original note; the checksum rule
     *  appears to live in section 3.2.1.2 -- verify): any IP frame that
     *  fails the checksum MUST be silently discarded.
     *
     *  Is the datagram acceptable?
     *
     *  1.  Its length is at least the size of an IP header
     *  2.  Its version is 4 (this is the IPv4 code path, not IPv6)
     *  3.  Its header checksum is correct
     *  4.  It does not carry a bogus length
     */

    if (iph->ihl < 5 || iph->version != 4)
        goto inhdr_error; 

    if (!pskb_may_pull(skb, iph->ihl*4)) // now require the full header, options included (ihl counts 4-byte words)
        goto inhdr_error;

    iph = skb->nh.iph; // re-read the header pointer after the second pull

    if (ip_fast_csum((u8 *)iph, iph->ihl) != 0) // header checksum does not verify
        goto inhdr_error; 

    {
        __u32 len = ntohs(iph->tot_len); // tot_len converted to host byte order: the length of the whole datagram, not just the header
        if (skb->len < len || len < (iph->ihl<<2)) // skb holds fewer bytes than tot_len claims, or tot_len is smaller than the header itself
            goto inhdr_error;
        if (skb->len > len) {  // skb is longer than the datagram (e.g. trailing padding): trim it down to tot_len
            __pskb_trim(skb, len);
            if (skb->ip_summed == CHECKSUM_HW) // after trimming, a CHECKSUM_HW state is downgraded to CHECKSUM_NONE
                skb->ip_summed = CHECKSUM_NONE;
        }
    }

    return NF_HOOK(PF_INET, NF_IP_PRE_ROUTING, skb, dev, NULL,
               ip_rcv_finish); // run the PRE_ROUTING netfilter hook; ip_rcv_finish is the okfn continuation

inhdr_error:
    IP_INC_STATS_BH(IpInHdrErrors); // count a header error, then fall through to drop
drop:
        kfree_skb(skb);
out:
        return NET_RX_DROP;
}

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62

# NETFILTER 模块定义

NETFILTER 模块是 Linux 网络处理的核心：对不同阶段的 IP 数据报进行拦截处理，比如防火墙机制便是通过该机制完成。它定义了如下五个处理回调钩子。

/* Hook run on a received IP datagram before the routing decision is made (used by ip_rcv) */
#define NF_IP_PRE_ROUTING   0

/* Hook run when an IP datagram is about to be delivered locally, up to the transport protocols (used by ip_local_deliver) */
#define NF_IP_LOCAL_IN      1

/* Hook run when an IP datagram is destined for another host and this machine forwards it */
#define NF_IP_FORWARD       2

/* Hook run on an IP datagram sent from this machine, before it is passed down toward the driver layer */
#define NF_IP_LOCAL_OUT     3

/* Hook run just before an IP datagram is handed to the NIC driver layer */
#define NF_IP_POST_ROUTING  4

1
2
3
4
5
6
7
8
9
10
11
12
13
14

看如下的图描述，其中 IN 、OUT 上层为 TCP 层，INPUT 、OUTPUT 下层为数据链路层（网卡驱动层），方框为上述的钩子名，方框中的白色为利用这些钩子的功能（比如：mangle 防火墙、NAT 网络地址转换）。

接下来我们来看看 NETFILTER 相关的宏定义。可以看到，当定义了 CONFIG_NETFILTER 时，NF_HOOK 宏会进入 NETFILTER 层处理；如果没有定义，则直接执行 okfn 回调函数。NETFILTER 我打算在后面的文章中单独讲解，所以这里有个概念即可。

/*
 * NF_HOOK(pf, hook, skb, indev, outdev, okfn)
 *
 * Runs the netfilter hook `hook` of protocol family `pf` on `skb`; `okfn` is
 * the continuation to call with the skb. Three variants are selected at
 * compile time:
 *
 *  - CONFIG_NETFILTER + CONFIG_NETFILTER_DEBUG: always go through
 *    nf_hook_slow() with INT_MIN as the last argument.
 *  - CONFIG_NETFILTER only: call okfn(skb) directly when the list
 *    nf_hooks[pf][hook] is empty, otherwise go through nf_hook_slow().
 *  - no CONFIG_NETFILTER: expand to a plain okfn(skb) call; the other
 *    arguments are ignored.
 *
 * NF_HOOK_THRESH is the same idea with a caller-supplied threshold passed
 * as the last nf_hook_slow() argument. It is only defined when
 * CONFIG_NETFILTER is set.
 */
#ifdef CONFIG_NETFILTER // the netfilter framework is configured in

#ifdef CONFIG_NETFILTER_DEBUG  // netfilter debugging is configured: never take the shortcut

#define NF_HOOK(pf, hook, skb, indev, outdev, okfn)  nf_hook_slow((pf), (hook), (skb), (indev), (outdev), (okfn), INT_MIN)

#define NF_HOOK_THRESH nf_hook_slow

#else // no netfilter debugging: skip nf_hook_slow() when the hook list is empty

#define NF_HOOK(pf, hook, skb, indev, outdev, okfn) (list_empty(&nf_hooks[(pf)][(hook)])? (okfn)(skb) : nf_hook_slow((pf), (hook), (skb), (indev), (outdev), (okfn), INT_MIN))

#define NF_HOOK_THRESH(pf, hook, skb, indev, outdev, okfn, thresh) (list_empty(&nf_hooks[(pf)][(hook)]) ? (okfn)(skb) : nf_hook_slow((pf), (hook), (skb), (indev), (outdev), (okfn), (thresh)))

#endif

#else /* !CONFIG_NETFILTER */

#define NF_HOOK(pf, hook, skb, indev, outdev, okfn) (okfn)(skb) // netfilter not configured: call the okfn callback directly

#endif /*CONFIG_NETFILTER*/

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21

# ip_rcv_finish 函数

/*
 * ip_rcv_finish - continuation passed to NF_HOOK by ip_rcv.
 *
 * Makes sure the skb has a route (skb->dst) attached, then lets that route's
 * input handler take over via dst_input(). IP option processing is elided
 * in this excerpt.
 *
 * Returns the result of dst_input(), or NET_RX_DROP after freeing the skb
 * when the route lookup fails.
 */
static inline int ip_rcv_finish(struct sk_buff *skb)
{
    struct net_device *dev = skb->dev;
    struct iphdr *iph = skb->nh.iph;

    //  No route attached yet: look one up. The dst entry describes how the packet travels through the stack (i.e. the routing information)
    if (skb->dst == NULL) {
        if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))
            goto drop; 
    }

    if (iph->ihl > 5) { // header is longer than the 20-byte minimum (ihl is a 4-bit count of 4-byte words, so at most 15*4 = 60 bytes), which means IP options are present; their handling is elided here
        ...
    }

    return dst_input(skb); // continue with the route's input handler

inhdr_error: // not referenced by the visible code; presumably a target of the elided option handling
    IP_INC_STATS_BH(IpInHdrErrors);
drop:
        kfree_skb(skb);
        return NET_RX_DROP;
}


/*
 * dst_input - run the input handler of the route attached to skb.
 *
 * Calls skb->dst->input(skb) and repeats the call for as long as the handler
 * answers with the non-zero value NET_XMIT_BYPASS. The first result that is
 * 0, or anything other than NET_XMIT_BYPASS, is returned to the caller.
 */
static inline int dst_input(struct sk_buff *skb)
{
    int rc = skb->dst->input(skb); // first attempt through the dst's input callback

    // Only a non-zero NET_XMIT_BYPASS result triggers another round.
    while (unlikely(rc != 0) && rc == NET_XMIT_BYPASS)
        rc = skb->dst->input(skb);

    return rc;
}

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37

# ip_route_input 函数

ip_rcv_finish 函数将会调用该函数完成对 skb->dst->input 函数的设置。由于该函数很长，这里只关注重点：input 函数是如何被设置的。

/*
 * ip_route_input - find the input route for a received packet (excerpt).
 *
 * Searches the route cache first; on a miss, non-multicast destinations are
 * handed to ip_route_input_slow(). Hash computation, the matching
 * rcu_read_lock() and the multicast branch are elided in this excerpt.
 *
 * Returns 0 on a cache hit, otherwise whatever the elided multicast branch
 * or ip_route_input_slow() returns.
 */
int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
           u8 tos, struct net_device *dev)
{
    ...
    for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) { // walk the route-cache hash bucket (collisions are chained); return immediately on a hit
        smp_read_barrier_depends();
        if (rth->fl.fl4_dst == daddr &&
            rth->fl.fl4_src == saddr &&
            rth->fl.iif == iif &&
            rth->fl.oif == 0 &&
            rth->fl.fl4_tos == tos) {
            ...
            return 0;
        }
        RT_CACHE_STAT_INC(in_hlist_search); // entry did not match: account for one more chain step
    }
    rcu_read_unlock();

    if (MULTICAST(daddr)) { // destination is a multicast address (handling elided)
        ...
    }
    return ip_route_input_slow(skb, daddr, saddr, tos, dev); // every other datagram takes the slow path
}

/*
 * ip_route_input_slow - slow-path route lookup (heavily elided excerpt).
 *
 * Only the two assignments that matter for the receive path are shown: the
 * route's input handler becomes ip_forward for packets routed on to another
 * host, and ip_local_deliver (under the local_input label) for packets
 * delivered to this host.
 */
int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr, u8 tos, struct net_device *dev)
{
   ...
   rth->u.dst.input = ip_forward; // input handler is ip_forward: this host routes the packet on to another host
   ...
   local_input:
       rth->u.dst.input= ip_local_deliver;  // input handler is ip_local_deliver: the packet is delivered locally, up to the transport protocols
   
   ...
}

/*
 * ip_local_deliver - pass an IP datagram up toward the transport layer.
 *
 * A datagram whose frag_off has IP_MF or any IP_OFFSET bit set is a
 * fragment and goes through ip_defrag() first. When ip_defrag() returns
 * NULL the function returns 0 without going further. Otherwise the skb is
 * run through the NF_IP_LOCAL_IN hook with ip_local_deliver_finish as the
 * continuation.
 */
int ip_local_deliver(struct sk_buff *skb)
{
    const int is_fragment =
        (skb->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) != 0;

    if (is_fragment) {
        // Fragment: hand it to reassembly; NULL means there is nothing to deliver now.
        skb = ip_defrag(skb);
        if (skb == NULL)
            return 0;
    }

    // If the LOCAL_IN hook lets the packet through, ip_local_deliver_finish is called back.
    return NF_HOOK(PF_INET, NF_IP_LOCAL_IN, skb, skb->dev, NULL,
               ip_local_deliver_finish);
}

/*
 * ip_local_deliver_finish - continuation passed to NF_HOOK by
 * ip_local_deliver (excerpt).
 *
 * Looks up the handler registered in inet_protos[] for the protocol carried
 * by the datagram and invokes it. The hash computation, raw-socket handling,
 * the rcu_read_lock() matching the unlock below and the "resubmit" label
 * are elided in this excerpt.
 *
 * Always returns 0 on the visible paths.
 */
static inline int ip_local_deliver_finish(struct sk_buff *skb)
{
    ...
    // Point skb->h (presumably the transport-layer header) at the current data pointer
    skb->h.raw = skb->data;
    {
        //  Find the protocol handler able to process this IP datagram
        int protocol = skb->nh.iph->protocol;
        int hash;
        struct sock *raw_sk;
        struct inet_protocol *ipprot;
        ...
        if ((ipprot = inet_protos[hash]) != NULL) {
            int ret;
            smp_read_barrier_depends();
            ...
            ret = ipprot->handler(skb); // invoke the registered handler callback
            if (ret < 0) { // negative result: retry with protocol number -ret (resubmit label is elided)
                protocol = -ret;
                goto resubmit;
            }
            IP_INC_STATS_BH(IpInDelivers);
        } else { // no handler registered for this protocol: release the skb
            ...
            kfree_skb(skb);
        }
    }
 out:
    rcu_read_unlock();

    return 0;
}

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80

# TCP 层回调函数注册

接下来我们看看 ipprot->handler(skb); 回调处理函数的注册过程。

/* Number of slots in the protocol handler table below */
#define MAX_INET_PROTOS 256 
/* Table of registered protocol handlers; a NULL slot means nothing is registered there */
struct inet_protocol *inet_protos[MAX_INET_PROTOS]; 


// 添加处理协议
int inet_add_protocol(struct inet_protocol *prot, unsigned char protocol)
{
    int hash, ret;

    hash = protocol & (MAX_INET_PROTOS - 1);
    spin_lock_bh(&inet_proto_lock);
    if (inet_protos[hash]) { // 已经添加过协议
        ret = -1;
    } else { // 否则将 prot 放入数组
        inet_protos[hash] = prot;
        ret = 0;
    }
    spin_unlock_bh(&inet_proto_lock);
    return ret;
}

/*
 * inet_init - initialisation routine (excerpt).
 *
 * The visible part registers the ICMP, UDP and TCP handler structures with
 * inet_add_protocol() and logs a KERN_CRIT message for any registration
 * that fails. The rest of the function, including its return statement, is
 * elided.
 */
static int __init inet_init(void){
    ...
    if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0) // register the ICMP handler structure
        printk(KERN_CRIT "inet_init: Cannot add ICMP protocol\n");
    
    if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0) // register the UDP handler structure
        printk(KERN_CRIT "inet_init: Cannot add UDP protocol\n");    
    
    if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0) // register the TCP handler structure
        printk(KERN_CRIT "inet_init: Cannot add TCP protocol\n");    
    ...
}

/* Handler descriptor registered for IPPROTO_TCP by inet_init() */
static struct inet_protocol tcp_protocol = {
    .handler =  tcp_v4_rcv, // receive callback, invoked as ipprot->handler(skb) for TCP segments
    .err_handler =  tcp_v4_err, // error callback (its call site is not shown in this excerpt)
    .no_policy =    1,
};

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39

← Linux线程模型：LinuxThreads 与 NPTL Java <clint> 与 <init> 原理 →