灵魂跑者 2024-11-06

1. Route Cache

Both receiving and sending a packet require a routing-table lookup, and doing that lookup for every packet costs CPU time. Linux therefore introduced a route cache, organized as a hash table, to cut down on routing-table lookups. Its initialization lives in the routing init function ip_rt_init; on a cache miss the routing table is searched, and a successful result is inserted into the cache.

There are two places where the cache is consulted: once when a packet is received in ip_rcv(), and once when a packet is sent.

Route cache initialization flow:

A side note on inet_init: during initialization it runs arp_init() before ip_init(), which shows that the ARP module is a dependency that must be set up before IP.
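For reference, this is roughly what that ordering looks like (an abridged sketch of inet_init() in kernel/net/ipv4/af_inet.c; most of the function and all error handling are omitted):

// kernel/net/ipv4/af_inet.c (abridged sketch)
static int __init inet_init(void)
{
        ...
        arp_init();     /* ARP is set up first */
        ip_init();      /* IP setup; this is what ends up calling ip_rt_init() */
        tcp_init();     /* TCP comes after the IP layer is ready */
        ...
        return 0;
}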

Let's now walk through ip_rt_init:

//kernel/net/ipv4/route.c

int __init ip_rt_init(void)
{
        void *idents_hash;
        int cpu;
// allocate the "IP idents" hash backing ip_idents/ip_tstamps (used for IP ID generation)
        /* For modern hosts, this will use 2 MB of memory */
        idents_hash = alloc_large_system_hash("IP idents",
                                              sizeof(*ip_idents) + sizeof(*ip_tstamps),
                                              0,
                                              16, /* one bucket per 64 KB */
                                              HASH_ZERO,
                                              NULL,
                                              &ip_idents_mask,
                                              2048,
                                              256*1024);

        ip_idents = idents_hash;

        prandom_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents));

        ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents);

        for_each_possible_cpu(cpu) {
                struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

                INIT_LIST_HEAD(&ul->head);
                spin_lock_init(&ul->lock);
        }
#ifdef CONFIG_IP_ROUTE_CLASSID //route-classifier accounting: 256 counters per CPU (not enabled on my board)
        ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
        if (!ip_rt_acct)
                panic("IP: failed to allocate ip_rt_acct\n");
#endif
       //slab cache for route entries (struct rtable)
        ipv4_dst_ops.kmem_cachep =
                kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
                                  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

        ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

        //initialize the per-CPU dst entry counter
        if (dst_entries_init(&ipv4_dst_ops) < 0)
                panic("IP: failed to allocate ipv4_dst_ops counter\n");
        
        //initialize the per-CPU counter for the blackhole dst ops
        if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
                panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

        //set the gc threshold and the maximum cache size
        ipv4_dst_ops.gc_thresh = ~0;
        ip_rt_max_size = INT_MAX;
        
        //per-device IPv4 configuration (devinet) initialization
        devinet_init();

        //register notifier chains and create the fib alias cache
        ip_fib_init();

        if (ip_rt_proc_init())
                pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
        xfrm_init();
        xfrm4_init();
#endif
        //register the netlink RTM_GETROUTE handler
        rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
                      RTNL_FLAG_DOIT_UNLOCKED);

#ifdef CONFIG_SYSCTL
        register_pernet_subsys(&sysctl_route_ops);
#endif
        register_pernet_subsys(&rt_genid_ops);
        register_pernet_subsys(&ipv4_inetpeer_ops);
        return 0;
}

In older kernels the route cache hash table was called rt_hash_table; in newer kernels the table allocated here is idents_hash, which actually backs the ip_idents/ip_tstamps arrays used for IP ID generation rather than cached routes. The kernel version analyzed here is 5.10.*.

After initialization, how do these paths get exercised?

First, the lookup on the input (receive) path.

The receive call chain is roughly: ip_rcv() -> NF_INET_PRE_ROUTING hook -> ip_rcv_finish() -> ip_rcv_finish_core() -> ip_route_input_noref() -> dst_input() -> ip_local_deliver() -> NF_INET_LOCAL_IN hook -> ip_local_deliver_finish() -> ip_protocol_deliver_rcu() -> tcp_v4_rcv() / udp_rcv().

The ip_rcv() function:

//  kernel/net/ipv4/ip_input.c
/*
 * IP receive entry point
 */
int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
           struct net_device *orig_dev)
{
        struct net *net = dev_net(dev);

        skb = ip_rcv_core(skb, net);//main IP-layer processing (header validation)
        if (skb == NULL)
                return NET_RX_DROP;
//netfilter hook: run the NF_INET_PRE_ROUTING chain (iptables); if the packet is accepted, continue into ip_rcv_finish
        return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING,
                       net, NULL, skb, dev, NULL,
                       ip_rcv_finish);
}

ip_rcv_core :

//kernel/net/ipv4/ip_input.c
/*
 *      Main IP Receive routine.
 */
static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net)
{
        const struct iphdr *iph;
        u32 len;

        /* When the interface is in promisc. mode, drop all the crap
         * that it receives, do not try to analyse it.
         */
        if (skb->pkt_type == PACKET_OTHERHOST)// drop packets not addressed to this host
                goto drop;

        __IP_UPD_PO_STATS(net, IPSTATS_MIB_IN, skb->len);

        skb = skb_share_check(skb, GFP_ATOMIC);
        if (!skb) {
                __IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS);
                goto out;
        }
        // make sure at least a minimal IP header can be pulled into the linear area; otherwise error out
        if (!pskb_may_pull(skb, sizeof(struct iphdr)))
                goto inhdr_error;
        //get the IP header
        iph = ip_hdr(skb);

        /*
         *      RFC1122: 3.2.1.2 MUST silently discard any IP frame that fails the checksum.
         *
         *      Is the datagram acceptable?
         *
         *      1.      Length at least the size of an ip header
         *      2.      Version of 4
         *      3.      Checksums correctly. [Speed optimisation for later, skip loopback checksums]
         *      4.      Doesn't have a bogus length
         */
        //check the IP header length and version
        if (iph->ihl < 5 || iph->version != 4)
                goto inhdr_error;

        BUILD_BUG_ON(IPSTATS_MIB_ECT1PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_1);
        BUILD_BUG_ON(IPSTATS_MIB_ECT0PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_0);
        BUILD_BUG_ON(IPSTATS_MIB_CEPKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_CE);
        __IP_ADD_STATS(net,
                       IPSTATS_MIB_NOECTPKTS + (iph->tos & INET_ECN_MASK),
                       max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs));
        // pull the full header length advertised by ihl; on failure, report a header error
        if (!pskb_may_pull(skb, iph->ihl*4))
                goto inhdr_error;

        iph = ip_hdr(skb);

        if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
                goto csum_error;
        //if skb->len is smaller than the total length carried in the IP header, drop the packet
        //if the total length is smaller than the header length, treat it as a header error
        len = ntohs(iph->tot_len);
        if (skb->len < len) {
                __IP_INC_STATS(net, IPSTATS_MIB_INTRUNCATEDPKTS);
                goto drop;
        } else if (len < (iph->ihl*4))
                goto inhdr_error;

        /* Our transport medium may have padded the buffer out. Now we know it
         * is IP we can trim to the true length of the frame.
         * Note this now means skb->len holds ntohs(iph->tot_len).
         */
        if (pskb_trim_rcsum(skb, len)) {
                __IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS);
                goto drop;
        }

        iph = ip_hdr(skb);
        //set the transport-layer header pointer
        skb->transport_header = skb->network_header + iph->ihl*4;

        /* Remove any debris in the socket control block */
        memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
        IPCB(skb)->iif = skb->skb_iif;

        /* Must drop socket now because of tproxy. */
        if (!skb_sk_is_prefetched(skb))
                skb_orphan(skb);

        return skb;

csum_error:
        __IP_INC_STATS(net, IPSTATS_MIB_CSUMERRORS);
inhdr_error:
        __IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS);
drop:
        kfree_skb(skb);
out:
        return NULL;
}

Next, ip_rcv_finish, the function that runs once the NF_INET_PRE_ROUTING hook accepts the packet:

//kernel/net/ipv4/ip_input.c

static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        struct net_device *dev = skb->dev;
        int ret;

        /* if ingress device is enslaved to an L3 master device pass the
         * skb to its handler for processing
         */
        skb = l3mdev_ip_rcv(skb);
        if (!skb)
                return NET_RX_SUCCESS;
        
        ret = ip_rcv_finish_core(net, sk, skb, dev, NULL);
        //dst_input() actually calls skb_dst(skb)->input(skb); the input function pointer is set
        //from the routing result: ip_local_deliver for local delivery, ip_forward for forwarding.
        if (ret != NET_RX_DROP)
                ret = dst_input(skb);
     
        return ret;
}
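For reference, dst_input() itself is only a thin indirect call through the route's input hook; a sketch of its definition (include/net/dst.h, as I read the 5.10 sources) looks roughly like this, so whatever route ip_route_input_noref() attached decides where the packet goes next:

// include/net/dst.h (sketch)
/* Input packet from network to transport. */
static inline int dst_input(struct sk_buff *skb)
{
        /* calls skb_dst(skb)->input(skb): ip_local_deliver for local delivery,
         * ip_forward for forwarded packets (ip6_input on the IPv6 side)
         */
        return INDIRECT_CALL_INET(skb_dst(skb)->input,
                                  ip6_input, ip_local_deliver, skb);
}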

ip_rcv_finish_core :

// kernel/net/ipv4/ip_input.c

static int ip_rcv_finish_core(struct net *net, struct sock *sk,
                              struct sk_buff *skb, struct net_device *dev,
                              const struct sk_buff *hint)
{
        const struct iphdr *iph = ip_hdr(skb);
        int (*edemux)(struct sk_buff *skb);
        struct rtable *rt;
        int err;

        //hint-based lookup: try to reuse the route from a previous packet in the same receive batch
        if (ip_can_use_hint(skb, iph, hint)) {
                err = ip_route_use_hint(skb, iph->daddr, iph->saddr, iph->tos,
                                        dev, hint);
                if (unlikely(err))
                        goto drop_error;
        }

        if (net->ipv4.sysctl_ip_early_demux &&
            !skb_dst(skb) &&
            !skb->sk &&
            !ip_is_fragment(iph)) {
                const struct net_protocol *ipprot;
                int protocol = iph->protocol;

                ipprot = rcu_dereference(inet_protos[protocol]);
                if (ipprot && (edemux = READ_ONCE(ipprot->early_demux))) {
                        err = INDIRECT_CALL_2(edemux, tcp_v4_early_demux,
                                              udp_v4_early_demux, skb);
                        if (unlikely(err))
                                goto drop_error;
                        /* must reload iph, skb->head might have changed */
                        iph = ip_hdr(skb);
                }
        }

        /*
         *      Initialise the virtual path cache for the packet. It describes
         *      how the packet travels inside Linux networking.
         */
        //Packets received from the network normally have no route attached to skb->dst yet;
        //ip_route_input_noref() looks up the routing table and attaches the route information.
        
        if (!skb_valid_dst(skb)) {
                err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
                                           iph->tos, dev);
                if (unlikely(err))
                        goto drop_error;
        }
        /* update the accounting statistics */
#ifdef CONFIG_IP_ROUTE_CLASSID
        if (unlikely(skb_dst(skb)->tclassid)) {
                struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct);
                u32 idx = skb_dst(skb)->tclassid;
                st[idx&0xFF].o_packets++;
                st[idx&0xFF].o_bytes += skb->len;
                st[(idx>>16)&0xFF].i_packets++;
                st[(idx>>16)&0xFF].i_bytes += skb->len;
        }
#endif
        /* an IP header longer than 20 bytes carries IP options that must be processed */
        if (iph->ihl > 5 && ip_rcv_options(skb, dev))
                goto drop;
        /* skb_rtable() is essentially skb_dst(): fetch the route attached to the skb */
        rt = skb_rtable(skb);
        if (rt->rt_type == RTN_MULTICAST) {
                __IP_UPD_PO_STATS(net, IPSTATS_MIB_INMCAST, skb->len);
        } else if (rt->rt_type == RTN_BROADCAST) {
                __IP_UPD_PO_STATS(net, IPSTATS_MIB_INBCAST, skb->len);
        } else if (skb->pkt_type == PACKET_BROADCAST ||
                   skb->pkt_type == PACKET_MULTICAST) {
                struct in_device *in_dev = __in_dev_get_rcu(dev);

                /* RFC 1122 3.3.6:
                 *
                 *   When a host sends a datagram to a link-layer broadcast
                 *   address, the IP destination address MUST be a legal IP
                 *   broadcast or IP multicast address.
                 *
                 *   A host SHOULD silently discard a datagram that is received
                 *   via a link-layer broadcast (see Section 2.4) but does not
                 *   specify an IP multicast or broadcast destination address.
                 *
                 * This doesn't explicitly say L2 *broadcast*, but broadcast is
                 * in a way a form of multicast and the most common use case for
                 * this is 802.11 protecting against cross-station spoofing (the
                 * so-called "hole-196" attack) so do it for both.
                 */
                if (in_dev &&
                    IN_DEV_ORCONF(in_dev, DROP_UNICAST_IN_L2_MULTICAST))
                        goto drop;
        }

        return NET_RX_SUCCESS;

drop:
        kfree_skb(skb);
        return NET_RX_DROP;

drop_error:
        if (err == -EXDEV)
                __NET_INC_STATS(net, LINUX_MIB_IPRPFILTER);
        goto drop;
}

ip_route_input_noref: this is where the routing lookup starts

int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                         u8 tos, struct net_device *dev)
{
        struct fib_result res;
        int err;

        tos &= IPTOS_RT_MASK;
        rcu_read_lock();
        err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
        rcu_read_unlock();

        return err;
}
EXPORT_SYMBOL(ip_route_input_noref);

What ip_route_input_rcu does: when there is no cached route on the skb, it checks whether the destination is a multicast address; multicast goes to ip_route_input_mc, everything else to ip_route_input_slow, which searches the routing table and eventually calls fib_lookup. The routing-table lookup itself will be analyzed separately later.
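A heavily abridged sketch of that logic, as I read net/ipv4/route.c in 5.10 (the L3-slave and multicast-forwarding details are trimmed):

// kernel/net/ipv4/route.c (abridged sketch)
/* called with rcu_read_lock held */
static int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                              u8 tos, struct net_device *dev, struct fib_result *res)
{
        if (ipv4_is_multicast(daddr)) {
                struct in_device *in_dev = __in_dev_get_rcu(dev);
                int our = 0;

                if (!in_dev)
                        return -EINVAL;
                our = ip_check_mc_rcu(in_dev, daddr, saddr, ip_hdr(skb)->protocol);
                if (our) /* the real code also handles L3 slaves and multicast forwarding */
                        return ip_route_input_mc(skb, daddr, saddr, tos, dev, our);
                return -EINVAL;
        }

        /* unicast: full routing-table lookup, ends up in fib_lookup() */
        return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
}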

In ip_route_input_slow, if the routing lookup shows the packet is addressed to this host, dst->input is set to ip_local_deliver;
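Where does dst->input get assigned? For the local-delivery case, ip_route_input_slow() allocates the rtable via rt_dst_alloc(); the relevant fragment (abridged from my reading of net/ipv4/route.c) wires the hooks up roughly like this, while in the forwarding case __mkroute_input() sets rth->dst.input = ip_forward instead:

// kernel/net/ipv4/route.c, rt_dst_alloc() (abridged fragment)
        rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK, ...);
        if (rt) {
                ...
                rt->dst.output = ip_output;
                if (flags & RTCF_LOCAL)
                        rt->dst.input = ip_local_deliver; /* packets destined to this host */
        }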

ip_local_deliver:

/*
 *      Deliver IP Packets to the higher protocol layers.
 */
int ip_local_deliver(struct sk_buff *skb)
{
        /*
         *      Reassemble IP fragments.
         */
        struct net *net = dev_net(skb->dev);

        if (ip_is_fragment(ip_hdr(skb))) {
                if (ip_defrag(net, skb, IP_DEFRAG_LOCAL_DELIVER))
                        return 0;
        }
        //traverse the NF_INET_LOCAL_IN hook; if accepted, continue into ip_local_deliver_finish
        return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN,
                       net, NULL, skb, skb->dev, NULL,
                       ip_local_deliver_finish);
}

ip_local_deliver_finish

//kernel/net/ipv4/ip_input.c

static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        //strip the IP header
        __skb_pull(skb, skb_network_header_len(skb));

        rcu_read_lock();
        ip_protocol_deliver_rcu(net, skb, ip_hdr(skb)->protocol);
        rcu_read_unlock();

        return 0;
}

。。。
INDIRECT_CALLABLE_DECLARE(int udp_rcv(struct sk_buff *));   //indirect-call declarations for the receive handlers
INDIRECT_CALLABLE_DECLARE(int tcp_v4_rcv(struct sk_buff *));
void ip_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int protocol)
{
        const struct net_protocol *ipprot;
        int raw, ret;

resubmit:
        //raw sockets: deliver a copy of the packet to any matching raw socket
        raw = raw_local_deliver(skb, protocol);

        //look up the protocol handler structure (struct net_protocol)
        ipprot = rcu_dereference(inet_protos[protocol]);
        if (ipprot) {
                if (!ipprot->no_policy) {
                        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
                                kfree_skb(skb);
                                return;
                        }
                        nf_reset_ct(skb);
                }
                //invoke the registered handler, i.e. tcp_v4_rcv or udp_rcv
                ret = INDIRECT_CALL_2(ipprot->handler, tcp_v4_rcv, udp_rcv,
                                      skb);
                if (ret < 0) {
                        protocol = -ret;
                        goto resubmit;
                }
                __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
        } else {
                if (!raw) {// no raw socket took the packet, or raw delivery failed
                        if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
                                __IP_INC_STATS(net, IPSTATS_MIB_INUNKNOWNPROTOS);
                                /* send an ICMP protocol-unreachable */
                                icmp_send(skb, ICMP_DEST_UNREACH,
                                          ICMP_PROT_UNREACH, 0);
                        }
                        kfree_skb(skb);
                } else {
                        /* a raw socket consumed the packet */
                        __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
                            /* free the skb */
                        consume_skb(skb);
                }
        }
}

Next comes the transport layer:

Dispatch is based on the protocol field of the IP packet. Taking TCP as the example, its receive handler is int tcp_v4_rcv(struct sk_buff *skb).
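How does ip_protocol_deliver_rcu() find tcp_v4_rcv()? TCP installs a struct net_protocol whose handler points at it; an abridged sketch of that registration (net/ipv4/af_inet.c, some fields omitted):

// kernel/net/ipv4/af_inet.c (abridged sketch)
static struct net_protocol tcp_protocol = {
        .early_demux    = tcp_v4_early_demux,   /* used by the early-demux branch in ip_rcv_finish_core */
        .handler        = tcp_v4_rcv,           /* invoked from ip_protocol_deliver_rcu via INDIRECT_CALL_2 */
        .err_handler    = tcp_v4_err,
        .no_policy      = 1,
};

        /* in inet_init(): */
        if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
                pr_crit("%s: Cannot add TCP protocol\n", __func__);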

tcp_v4_rcv:

/*
 *      From tcp_input.c
 */

int tcp_v4_rcv(struct sk_buff *skb)
{
        struct net *net = dev_net(skb->dev);
        struct sk_buff *skb_to_free;
        int sdif = inet_sdif(skb);
        int dif = inet_iif(skb);
        const struct iphdr *iph;
        const struct tcphdr *th;
        bool refcounted;
        struct sock *sk;
        int ret;
        /* not addressed to this host: drop it right away */
        if (skb->pkt_type != PACKET_HOST)
                goto discard_it;

        /* Count it even if it's bad */
        __TCP_INC_STATS(net, TCP_MIB_INSEGS);

        /*
         * If the TCP segment was fragmented in transit it has already been
         * reassembled at the IP layer, with the fragments kept on a list.
         * Here we make sure the TCP header sits in the skb's linear area;
         * if that fails, the packet is dropped.
         */
        if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
                goto discard_it;

        th = (const struct tcphdr *)skb->data;

        /*
         * If the header length field is smaller than a bare TCP header,
         * the segment is malformed: count the error and drop it.
         */
        if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
                goto bad_packet;
        //make sure the linear area covers the full TCP header, including options
        if (!pskb_may_pull(skb, th->doff * 4))
                goto discard_it;

        /* An explanation is required here, I think.
         * Packet length and doff are validated by header prediction,
         * provided case of th->doff==0 is eliminated.
         * So, we defer the checks. */

        //verify the TCP checksum; a bad checksum means the segment is corrupted, so count it and drop it
        if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
                goto csum_error;
        //reload the TCP and IP header pointers
        th = (const struct tcphdr *)skb->data;
        iph = ip_hdr(skb);//IP header
        /*
         * Look up the transport control block (sock) by address and port in the
         * ehash or listener hash. A hit in ehash means the three-way handshake
         * has completed and the connection is established, so normal communication
         * can proceed. A hit in the listener hash means a socket is bound to the
         * port and listening. If neither lookup succeeds, no control block exists
         * yet and we jump to no_tcp_socket.
         */


lookup:
        sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
                               th->dest, sdif, &refcounted);
        if (!sk)
                goto no_tcp_socket;

process:
        //TCP_TIME_WAIT needs special handling; deferred to do_time_wait below
        if (sk->sk_state == TCP_TIME_WAIT)
                goto do_time_wait;

        if (sk->sk_state == TCP_NEW_SYN_RECV) {
                struct request_sock *req = inet_reqsk(sk);
                bool req_stolen = false;
                struct sock *nsk;

                sk = req->rsk_listener;
                if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
                        sk_drops_add(sk, skb);
                        reqsk_put(req);
                        goto discard_it;
                }
                if (tcp_checksum_complete(skb)) {
                        reqsk_put(req);
                        goto csum_error;
                }
                if (unlikely(sk->sk_state != TCP_LISTEN)) {
                        inet_csk_reqsk_queue_drop_and_put(sk, req);
                        goto lookup;
                }
                /* We own a reference on the listener, increase it again
                 * as we might lose it too soon.
                 */
                sock_hold(sk);
                refcounted = true;
                nsk = NULL;
                if (!tcp_filter(sk, skb)) {
                        th = (const struct tcphdr *)skb->data;
                        iph = ip_hdr(skb);
                        tcp_v4_fill_cb(skb, iph, th);
                        nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
                }
                if (!nsk) {
                        reqsk_put(req);
                        if (req_stolen) {
                                /* Another cpu got exclusive access to req
                                 * and created a full blown socket.
                                 * Try to feed this packet to this socket
                                 * instead of discarding it.
                                 */
                                tcp_v4_restore_cb(skb);
                                sock_put(sk);
                                goto lookup;
                        }
                        goto discard_and_relse;
                }
                if (nsk == sk) {
                        reqsk_put(req);
                        tcp_v4_restore_cb(skb);
                } else if (tcp_child_process(sk, nsk, skb)) {
                        tcp_v4_send_reset(nsk, skb);
                        goto discard_and_relse;
                } else {
                        sock_put(sk);
                        return 0;
                }
        }
        /* TTL below the socket's configured minimum: drop */
        if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
                __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
                goto discard_and_relse;
        }
        //consult the IPsec (xfrm) policy database; drop the packet if the check fails
        if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
                goto discard_and_relse;
//TCP MD5 signature check
        if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
                goto discard_and_relse;
        nf_reset_ct(skb);
//socket filter (BPF): if the packet is filtered out, stop processing (-> sk_filter_trim_cap)
        if (tcp_filter(sk, skb))
                goto discard_and_relse;
        th = (const struct tcphdr *)skb->data;
        iph = ip_hdr(skb);
        tcp_v4_fill_cb(skb, iph, th);
//the device pointer has no meaning at the transport layer, so clear it
        skb->dev = NULL;
/* LISTEN state */
        if (sk->sk_state == TCP_LISTEN) {
                ret = tcp_v4_do_rcv(sk, skb);//hand over to tcp_v4_do_rcv()
                goto put_and_return;
        }

        sk_incoming_cpu_update(sk);
//take the socket lock first so that process context and other softirqs cannot touch this TCP socket
        bh_lock_sock_nested(sk);
        tcp_segs_in(tcp_sk(sk), skb);
        ret = 0;

//If the socket is not locked by a process context, process the segment right away via
//tcp_v4_do_rcv(), which queues data on the receive queue; if the socket is owned by the
//user process, put the packet on the backlog queue instead.

        if (!sock_owned_by_user(sk)) {
                skb_to_free = sk->sk_rx_skb_cache;
                sk->sk_rx_skb_cache = NULL;
                ret = tcp_v4_do_rcv(sk, skb);
        } else {
                if (tcp_add_backlog(sk, skb))//socket locked by the user process: queue the packet on the backlog
                        goto discard_and_relse;
                skb_to_free = NULL;
        }
        bh_unlock_sock(sk);//release the lock
        if (skb_to_free)
                __kfree_skb(skb_to_free);

put_and_return:
        if (refcounted)
                sock_put(sk);//drop the sock reference; when the count reaches 0, sk_free releases the control block

        return ret;//return the processing result

//Handle segments with no matching control block, checksum errors and malformed packets; send an RST to the peer.
no_tcp_socket:
        /* IPsec (xfrm) policy check; drop the packet if it fails */
        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
                goto discard_it;

        tcp_v4_fill_cb(skb, iph, th);

        if (tcp_checksum_complete(skb)) {
csum_error:
                __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
bad_packet:
                __TCP_INC_STATS(net, TCP_MIB_INERRS);
        } else {
                tcp_v4_send_reset(NULL, skb);
        }

discard_it:
        /* Discard frame. */
        kfree_skb(skb);
        return 0;

discard_and_relse:
        sk_drops_add(sk, skb);
        if (refcounted)
                sock_put(sk);
        goto discard_it;
//handle the TIME_WAIT state
do_time_wait:
        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
                inet_twsk_put(inet_twsk(sk)); /* drop the refcount */
                goto discard_it;
        }

        tcp_v4_fill_cb(skb, iph, th);

        if (tcp_checksum_complete(skb)) {
                inet_twsk_put(inet_twsk(sk));
                goto csum_error;
        }
        //act on the return value of the TIME_WAIT state machine
        switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
        case TCP_TW_SYN: {
                struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
                                                        &tcp_hashinfo, skb,
                                                        __tcp_hdrlen(th),
                                                        iph->saddr, th->source,
                                                        iph->daddr, th->dest,
                                                        inet_iif(skb),
                                                        sdif);
                if (sk2) {
                        inet_twsk_deschedule_put(inet_twsk(sk));
                        sk = sk2;
                        tcp_v4_restore_cb(skb);
                        refcounted = false;
                        goto process;
                }
        }
                /* to ACK */
                fallthrough;
        case TCP_TW_ACK:
                tcp_v4_timewait_ack(sk, skb);
                break;
        case TCP_TW_RST:
                tcp_v4_send_reset(sk, skb);
                inet_twsk_deschedule_put(inet_twsk(sk));
                goto discard_it;
        case TCP_TW_SUCCESS:;
        }
        goto discard_it;
}

tcp_v4_do_rcv:

/* The socket must have it's spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
        struct sock *rsk;
        /* ESTABLISHED state: receive processing is done by tcp_rcv_established() */
        if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
                struct dst_entry *dst = sk->sk_rx_dst;

                sock_rps_save_rxhash(sk, skb);
                sk_mark_napi_id(sk, skb);
                if (dst) {
                        if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
                            !dst->ops->check(dst, 0)) {
                                dst_release(dst);
                                sk->sk_rx_dst = NULL;
                        }
                }
                /* processing path for an established connection */
                tcp_rcv_established(sk, skb);
                return 0;
        }

        if (tcp_checksum_complete(skb))
                goto csum_err;
        /* listening socket: passive-open handling, i.e. an incoming SYN or the final ACK */
        if (sk->sk_state == TCP_LISTEN) {
                struct sock *nsk = tcp_v4_cookie_check(sk, skb);
                /*
                 * NULL:      error
                 * nsk == sk: a SYN was received
                 * nsk != sk: the ACK completing the handshake was received
                 */
                if (!nsk)
                        goto discard;
                if (nsk != sk) {
                        if (tcp_child_process(sk, nsk, skb)) {/* process the new sock: initialize the child control block */
                                rsk = nsk;
                                goto reset;//on failure, send an RST segment to reset the client
                        }
                        return 0;
                }
        } else
                sock_rps_save_rxhash(sk, skb);
        /* handle every state other than ESTABLISHED and TIME_WAIT */
        if (tcp_rcv_state_process(sk, skb)) {
                rsk = sk;
                goto reset;
        }
        return 0;

reset:
        tcp_v4_send_reset(rsk, skb);/* send the RST */
discard:
        kfree_skb(skb);
        /* Be careful here. If this function gets more complicated and
         * gcc suffers from register pressure on the x86, sk (in %ebx)
         * might be destroyed here. This current version compiles correctly,
         * but you have been warned.
         */
        return 0;

csum_err:
        TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
        TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
        goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);

tcp_rcv_state_process: this function implements the core of the TCP state machine. It handles the receive processing for every state except ESTABLISHED and TIME_WAIT.

/*
 *      This function implements the receiving procedure of RFC 793 for
 *      all states except ESTABLISHED and TIME_WAIT.
 *      It's called from both tcp_v4_rcv and tcp_v6_rcv and should be
 *      address independent.
 */
/*
 * sk:  the transport control block (socket)
 * skb: the received segment
 */

int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct inet_connection_sock *icsk = inet_csk(sk);
        const struct tcphdr *th = tcp_hdr(skb);
        struct request_sock *req;
        int queued = 0;
        bool acceptable;

        switch (sk->sk_state) {
        case TCP_CLOSE:   /* CLOSE state handling */
                goto discard;

        case TCP_LISTEN:  /* LISTEN state handling */
                if (th->ack)
                        return 1;

                if (th->rst)
                        goto discard;

                if (th->syn) {
                        if (th->fin)
                                goto discard;
                        /* It is possible that we process SYN packets from backlog,
                         * so we need to make sure to disable BH and RCU right there.
                         */
                        rcu_read_lock();
                        local_bh_disable();
                        acceptable = icsk->icsk_af_ops->conn_request(sk, skb) >= 0;
                        local_bh_enable();
                        rcu_read_unlock();

                        if (!acceptable)
                                return 1;
                        consume_skb(skb);
                        return 0;
                }
                goto discard;

        case TCP_SYN_SENT: //the client sent the SYN (first handshake step) and is waiting for the server's SYN+ACK (second step)
                tp->rx_opt.saw_tstamp = 0;
                tcp_mstamp_refresh(tp);
                queued = tcp_rcv_synsent_state_process(sk, skb, th);
                if (queued >= 0)
                        return queued;

                /* Do step6 onward by hand. */
                tcp_urg(sk, skb, th);
                __kfree_skb(skb);
                tcp_data_snd_check(sk);
                return 0;
        }

        tcp_mstamp_refresh(tp);
        tp->rx_opt.saw_tstamp = 0;
        req = rcu_dereference_protected(tp->fastopen_rsk,
                                        lockdep_sock_is_held(sk));
        if (req) {
                bool req_stolen;

                WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
                    sk->sk_state != TCP_FIN_WAIT1);

                if (!tcp_check_req(sk, skb, req, true, &req_stolen))
                        goto discard;
        }

        if (!th->ack && !th->rst && !th->syn)
                goto discard;

        if (!tcp_validate_incoming(sk, skb, th, 0))
                return 0;

        /* step 5: check the ACK field */
        //process the received ACK; tcp_ack() returns a positive value if the ACK is acceptable
        acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH |
                                      FLAG_UPDATE_TS_RECENT |
                                      FLAG_NO_CHALLENGE_ACK) > 0;

        if (!acceptable) {
                if (sk->sk_state == TCP_SYN_RECV)
                        return 1;       /* send one RST */
                tcp_send_challenge_ack(sk, skb);
                goto discard;
        }
        switch (sk->sk_state) {
        case TCP_SYN_RECV:       //the server sent SYN+ACK (second step) and was waiting for the client's ACK (third step)
                tp->delivered++; /* SYN-ACK delivery isn't tracked in tcp_ack */
                if (!tp->srtt_us)
                        tcp_synack_rtt_meas(sk, req);

                if (req) {
                        tcp_rcv_synrecv_state_fastopen(sk);
                } else {
                        tcp_try_undo_spurious_syn(sk);
                        tp->retrans_stamp = 0;
                        tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB,
                                          skb);
                        WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
                }
                smp_mb();
                //finish initialization (congestion control etc.) and move the connection to TCP_ESTABLISHED
                tcp_set_state(sk, TCP_ESTABLISHED);
                sk->sk_state_change(sk);

                /* Note, that this wakeup is only for marginal crossed SYN case.
                 * Passively open sockets are not waked up, because
                 * sk->sk_sleep == NULL and sk->sk_socket == NULL.
                 */
                //signal processes that want to send on this socket that it is now writable
                if (sk->sk_socket)
                        sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
//initialize the remaining fields of the control block and handle the timestamp option
                tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
                tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;
                tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);

                if (tp->rx_opt.tstamp_ok)
                        tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;

                if (!inet_csk(sk)->icsk_ca_ops->cong_control)
                        tcp_update_pacing_rate(sk);

                /* Prevent spurious tcp_cwnd_restart() on first data packet */
                tp->lsndtime = tcp_jiffies32;
//record the time of the last transmission, initialize the MSS/MTU-related fields and compute the header-prediction flags
                tcp_initialize_rcv_mss(sk);
                tcp_fast_path_on(tp);
                break;

        case TCP_FIN_WAIT1: {//we sent the FIN (first step of the close) and are waiting for the peer's ACK (second step)
                int tmo;

                if (req)
                        tcp_rcv_synrecv_state_fastopen(sk);

                if (tp->snd_una != tp->write_seq)
                        break;

                tcp_set_state(sk, TCP_FIN_WAIT2);
                sk->sk_shutdown |= SEND_SHUTDOWN;

                sk_dst_confirm(sk);

                if (!sock_flag(sk, SOCK_DEAD)) {
                        /* Wake up lingering close() */
                        sk->sk_state_change(sk);
                        break;
                }

                if (tp->linger2 < 0) {
                        tcp_done(sk);
                        NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
                        return 1;
                }
                if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
                    after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
                        /* Receive out of order FIN after close() */
                        if (tp->syn_fastopen && th->fin)
                                tcp_fastopen_active_disable(sk);
                        tcp_done(sk);
                        NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
                        return 1;
                }

                tmo = tcp_fin_time(sk);
                if (tmo > TCP_TIMEWAIT_LEN) {
                        inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
                } else if (th->fin || sock_owned_by_user(sk)) {
                        /* Bad case. We could lose such FIN otherwise.
                         * It is not a big problem, but it looks confusing
                         * and not so rare event. We still can lose it now,
                         * if it spins in bh_lock_sock(), but it is really
                         * marginal case.
                         */
                        inet_csk_reset_keepalive_timer(sk, tmo);
                } else {
                        tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
                        goto discard;
                }
                break;
        }

        case TCP_CLOSING:
                if (tp->snd_una == tp->write_seq) {
                        tcp_time_wait(sk, TCP_TIME_WAIT, 0);
                        goto discard;
                }
                break;

        case TCP_LAST_ACK:
                if (tp->snd_una == tp->write_seq) {
                        tcp_update_metrics(sk);
                        tcp_done(sk);
                        goto discard;
                }
                break;
        }
        /* step 6: check the URG bit */
        tcp_urg(sk, skb, th);

        /* step 7: process the segment text */
        switch (sk->sk_state) {
        case TCP_CLOSE_WAIT:
        case TCP_CLOSING:
        case TCP_LAST_ACK:
                if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
                        if (sk_is_mptcp(sk))
                                mptcp_incoming_options(sk, skb);
                        break;
                }
                fallthrough;
        case TCP_FIN_WAIT1:
        case TCP_FIN_WAIT2:
                /* RFC 793 says to queue data in these states,
                 * RFC 1122 says we MUST send a reset.
                 * BSD 4.4 also does reset.
                 */
                if (sk->sk_shutdown & RCV_SHUTDOWN) {
                        if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
                            after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
                                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
                                tcp_reset(sk);
                                return 1;
                        }
                }
                fallthrough;
        case TCP_ESTABLISHED://对已接收到的 TCP 段排队,在建立连接阶段一般不会收到 TCP 段
                tcp_data_queue(sk, skb);
                queued = 1;
                break;
        }

        /* tcp_data could move socket to TIME-WAIT */
//The state is not CLOSE at this point, so check whether data or an ACK needs to be sent.
//Then the queued flag decides whether to free the received segment: if it was added to the receive queue, it must not be freed here.
        if (sk->sk_state != TCP_CLOSE) {
                tcp_data_snd_check(sk);
                tcp_ack_snd_check(sk);
        }

        if (!queued) {
discard:
                tcp_drop(sk, skb);
        }
        return 0;
}
EXPORT_SYMBOL(tcp_rcv_state_process);

tcp_rcv_established:

/*
 *      TCP receive function for the ESTABLISHED state.
 *
 *      It is split into a fast path and a slow path. The fast path is
 *      disabled when:
 *      - A zero window was announced from us - zero window probing
 *        is only handled properly in the slow path.
 *      - Out of order segments arrived.
 *      - Urgent data is expected.
 *      - There is no buffer space left
 *      - Unexpected TCP flags/window values/header lengths are received
 *        (detected by checking the TCP header against pred_flags)
 *      - Data is sent in both directions. Fast path only supports pure senders
 *        or pure receivers (this means either the sequence number or the ack
 *        value must stay constant)
 *      - Unexpected TCP option.
 *
 *      When these conditions are not satisfied it drops into a standard
 *      receive procedure patterned after RFC793 to handle all cases.
 *      The first three cases are guaranteed by proper pred_flags setting,
 *      the rest is checked inline. Fast processing is turned on in
 *      tcp_data_queue when everything is OK.
 */
void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
{
        const struct tcphdr *th = (const struct tcphdr *)skb->data;
        struct tcp_sock *tp = tcp_sk(sk);
        unsigned int len = skb->len;

        /* TCP congestion window tracking */
        trace_tcp_probe(sk, skb);

        tcp_mstamp_refresh(tp);
        if (unlikely(!sk->sk_rx_dst))
                inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb);
        /*
         *      Header prediction.
         *      The code loosely follows the one in the famous
         *      "30 instruction TCP receive" Van Jacobson mail.
         *
         *      Van's trick is to deposit buffers into socket queue
         *      on a device interrupt, to call tcp_recv function
         *      on the receive process context and checksum and copy
         *      the buffer to user space. smart...
         *
         *      Our current scheme is not silly either but we take the
         *      extra cost of the net_bh soft interrupt processing...
         *      We do checksum and copy also but from device to kernel.
         */

        tp->rx_opt.saw_tstamp = 0;

        /*      pred_flags is 0xS?10 << 16 + snd_wnd
         *      if header_prediction is to be made
         *      'S' will always be tp->tcp_header_len >> 2
         *      '?' will be 0 for the fast path, otherwise pred_flags is 0 to
         *  turn it off (when there are holes in the receive
         *       space for instance)
         *      PSH flag is ignored.
         */
        //compare the segment's flags against the prediction flags (pred_flags)
        //and check that the sequence number is exactly the one expected next
        if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags &&
            TCP_SKB_CB(skb)->seq == tp->rcv_nxt &&
            !after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
                int tcp_header_len = tp->tcp_header_len;

                /* Timestamp header prediction: tcp_header_len
                 * is automatically equal to th->doff*4 due to pred_flags
                 * match.
                 */

                /* Check timestamp */
//if any option other than the timestamp is present, hand the segment to the slow path
                if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) {
                        /* No? Slow path! */
                        if (!tcp_parse_aligned_timestamp(tp, th))
                                goto slow_path;
                        //quick PAWS check; if it fails, take the slow path
                        /* If PAWS failed, check it more carefully in slow path */
                        if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0)
                                goto slow_path;

                        /* DO NOT update ts_recent here, if checksum fails
                         * and timestamp was corrupted part, it will result
                         * in a hung connection since we will drop all
                         * future packets due to the PAWS test.
                         */
                }
                //header-only segment (no payload)
                if (len <= tcp_header_len) {
                        /* Bulk data transfer: sender */
                        if (len == tcp_header_len) {
                                /* Predicted packet is in window by definition.
                                 * seq == rcv_nxt and rcv_wup <= rcv_nxt.
                                 * Hence, check seq<=rcv_wup reduces to:
                                 */
                                if (tcp_header_len ==
                                    (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
                                    tp->rcv_nxt == tp->rcv_wup)
                                        tcp_store_ts_recent(tp);

                                /* We know that such packets are checksummed
                                 * on entry.
                                 */
                                tcp_ack(sk, skb, 0);
                                __kfree_skb(skb);
                                tcp_data_snd_check(sk);
                                /* When receiving pure ack in fast path, update
                                 * last ts ecr directly instead of calling
                                 * tcp_rcv_rtt_measure_ts()
                                 */
                                tp->rcv_rtt_last_tsecr = tp->rx_opt.rcv_tsecr;
                                return;
                        } else { /* Header too small */
                                TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
                                goto discard;
                        }
                } else {
                        int eaten = 0;
                        bool fragstolen = false;
                        //verify the checksum
                        if (tcp_checksum_complete(skb))
                                goto csum_error;

                        if ((int)skb->truesize > sk->sk_forward_alloc)
                                goto step5;

                        /* Predicted packet is in window by definition.
                         * seq == rcv_nxt and rcv_wup <= rcv_nxt.
                         * Hence, check seq<=rcv_wup reduces to:
                         */
                        //segment is in window: record the timestamp if present
                        if (tcp_header_len ==
                            (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
                            tp->rcv_nxt == tp->rcv_wup)
                                tcp_store_ts_recent(tp);

                        tcp_rcv_rtt_measure_ts(sk, skb);
                        //header-prediction hit (bulk data transfer)
                        NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS);

                        /* Bulk data transfer: receiver */
                        //strip the TCP header
                        __skb_pull(skb, tcp_header_len);
                        eaten = tcp_queue_rcv(sk, skb, &fragstolen);
                        //update the delayed-ACK timeout estimate
                        tcp_event_data_recv(sk, skb);

                        if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
                                /* Well, only one small jumplet in fast path... */
                                tcp_ack(sk, skb, FLAG_DATA);
                                tcp_data_snd_check(sk);
                                if (!inet_csk_ack_scheduled(sk))
                                        goto no_ack;
                        } else {
                                tcp_update_wl(tp, TCP_SKB_CB(skb)->seq);
                        }
                        //send an ACK for the received data
                        __tcp_ack_snd_check(sk, 0);
no_ack:
                        if (eaten)
                                kfree_skb_partial(skb, fragstolen);
                        tcp_data_ready(sk);
                        return;
                }
        }

slow_path:
        if (len < (th->doff << 2) || tcp_checksum_complete(skb))
                goto csum_error;

        if (!th->ack && !th->rst && !th->syn)
                goto discard;

        /*
         *      Standard slow path.
         */

        if (!tcp_validate_incoming(sk, skb, th, 1))
                return;

step5:
        if (tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0)
                goto discard;

        tcp_rcv_rtt_measure_ts(sk, skb);

        /* Process urgent data. */
        tcp_urg(sk, skb, th);

        /* step 7: process the segment text */
        //depending on the situation, either copy the data directly to the application
        //or queue it on the regular sk_receive_queue
        tcp_data_queue(sk, skb);

        tcp_data_snd_check(sk);
        tcp_ack_snd_check(sk);
        return;

csum_error:
        TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
        TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);

discard:
        tcp_drop(sk, skb);
}
EXPORT_SYMBOL(tcp_rcv_established);

Receiving in the user process: tcp_recvmsg

When a user process calls recvfrom() to read data from the socket buffer, the kernel ends up in tcp_recvmsg(), which copies the packets from kernel address space into user address space.
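From user space this is simply the read side of a TCP socket; a minimal example (the server address and port below are made up) whose recv() call ends up in tcp_recvmsg():

// user-space example: the recv() below blocks in the kernel's tcp_recvmsg()
#include <stdio.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>

int main(void)
{
        char buf[4096];
        struct sockaddr_in srv = { .sin_family = AF_INET, .sin_port = htons(8080) };
        int fd = socket(AF_INET, SOCK_STREAM, 0);

        if (fd < 0)
                return 1;
        inet_pton(AF_INET, "127.0.0.1", &srv.sin_addr);
        if (connect(fd, (struct sockaddr *)&srv, sizeof(srv)) < 0)
                return 1;

        /* blocks in tcp_recvmsg() until data is queued on sk_receive_queue */
        ssize_t n = recv(fd, buf, sizeof(buf), 0);
        if (n > 0)
                printf("read %zd bytes\n", n);

        close(fd);
        return 0;
}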

/*
 *      This routine copies from a sock struct into the user buffer.
 *
 *      Technical note: in 2.3 we work on _locked_ socket, so that
 *      tricks with *seq access order and skb->users are not required.
 *      Probably, code can be easily improved even more.
 */

int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
                int flags, int *addr_len)
{
        //get the TCP socket structure
        struct tcp_sock *tp = tcp_sk(sk);
        int copied = 0;
        u32 peek_seq;
        u32 *seq;
        unsigned long used;
        int err, inq;
        int target;             /* Read at least this many bytes */
        long timeo;
        struct sk_buff *skb, *last;
        u32 urg_hole = 0;
        struct scm_timestamping_internal tss;
        int cmsg_flags;

        if (unlikely(flags & MSG_ERRQUEUE))
                return inet_recv_error(sk, msg, len, addr_len);

        if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue) &&
            (sk->sk_state == TCP_ESTABLISHED))
                sk_busy_loop(sk, nonblock);

        //Lock the socket, which essentially sets sk->sk_lock.owned = 1.
        //If a softirq then runs tcp_v4_rcv and finds the socket owned by process
        //context, it puts the packet on the backlog queue instead.

        lock_sock(sk);

        err = -ENOTCONN;
        //a listening socket cannot be read from: bail out
        if (sk->sk_state == TCP_LISTEN)
                goto out;

        cmsg_flags = tp->recvmsg_inq ? 1 : 0;
        //get the receive timeout; it is 0 in non-blocking mode
        timeo = sock_rcvtimeo(sk, nonblock);

        /* Urgent data needs to be handled specially. */
        if (flags & MSG_OOB)
                goto recv_urg;

        if (unlikely(tp->repair)) {
                err = -EPERM;
                if (!(flags & MSG_PEEK))
                        goto out;

                if (tp->repair_queue == TCP_SEND_QUEUE)
                        goto recv_sndq;

                err = -EINVAL;
                if (tp->repair_queue == TCP_NO_QUEUE)
                        goto out;

                /* 'common' recv queue MSG_PEEK-ing */
        }
//sequence number of the next unread byte
        seq = &tp->copied_seq;
        if (flags & MSG_PEEK) {
                peek_seq = tp->copied_seq;
                seq = &peek_seq;
        }
	//target is the smaller of len and sk->sk_rcvlowat;
	//MSG_WAITALL asks to wait for the full requested amount before returning
        target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
//main loop: copy data to user space until at least target bytes have been copied

        do {
                u32 offset;

                /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
                if (tp->urg_data && tp->urg_seq == *seq) {
                        if (copied)
                                break;
                        //check for pending signals so that e.g. SIGURG can be handled
                        if (signal_pending(current)) {
                                copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
                                break;
                        }
                }

                /* Next get a buffer. */
                //walk the receive queue (sk_receive_queue)
                last = skb_peek_tail(&sk->sk_receive_queue);
                skb_queue_walk(&sk->sk_receive_queue, skb) {
                        last = skb;
                        /* Now that we have two receive queues this
                         * shouldn't happen.
                         */
                        if (WARN(before(*seq, TCP_SKB_CB(skb)->seq),
                                 "TCP recvmsg seq # bug: copied %X, seq %X, rcvnxt %X, fl %X\n",
                                 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
                                 flags))
                                break;
			//offset is the gap between the next sequence number to read and this
			//skb's starting sequence number; if it is smaller than skb->len,
			//this skb holds the data we are looking for
                        offset = *seq - TCP_SKB_CB(skb)->seq;
                        //a SYN occupies one sequence number but carries no data: adjust the offset
                        if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
                                pr_err_once("%s: found a SYN, please report !\n", __func__);
                                offset--;
                        }
                        //found the skb: jump to found_ok_skb to do the copy
                        if (offset < skb->len)
                                goto found_ok_skb;
                        //a FIN: jump to the FIN-handling label
                        if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
                                goto found_fin_ok;
                        WARN(!(flags & MSG_PEEK),
                             "TCP recvmsg seq # bug 2: copied %X, seq %X, rcvnxt %X, fl %X\n",
                             *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags);
                }

                /* Well, if we have backlog, try to process it now yet. */
                //nothing left on the receive queue and the backlog is empty too:
                //stop the loop once at least target bytes have been copied
                if (copied >= target && !READ_ONCE(sk->sk_backlog.tail))
                        break;

                if (copied) {
                        //if the socket has an error, has been closed, or the remote
                        //end has shut down, break out of the copy loop
                        if (sk->sk_err ||
                            sk->sk_state == TCP_CLOSE ||
                            (sk->sk_shutdown & RCV_SHUTDOWN) ||
                            !timeo ||
                            signal_pending(current))
                                break;
                } else {
                        //copied == 0 means nothing has been copied to the application yet;
                        //possible reasons: the socket was closed, the receive buffer is
                        //simply empty, or some other error occurred
                        if (sock_flag(sk, SOCK_DONE))
                                break;

                        if (sk->sk_err) {
                                copied = sock_error(sk);
                                break;
                        }

                        if (sk->sk_shutdown & RCV_SHUTDOWN)
                                break;

                        if (sk->sk_state == TCP_CLOSE) {
                                /* This occurs when user tries to read
                                 * from never connected socket.
                                 */
                                //SOCK_DONE is set when the socket is shut down properly;
                                //TCP_CLOSE without SOCK_DONE means it was never connected
                                copied = -ENOTCONN;
                                break;
                        }
                        //non-blocking mode (timeo == 0) with nothing copied:
                        //return -EAGAIN immediately
                        if (!timeo) {
                                copied = -EAGAIN;
                                break;
                        }
                        //a pending signal interrupted the wait:
                        //return the corresponding error code
                        if (signal_pending(current)) {
                                copied = sock_intr_errno(timeo);
                                break;
                        }
                }
                //clean up the part of the receive queue that has already been copied
                //and send an ACK / window update to the peer if needed
                tcp_cleanup_rbuf(sk, copied);

                if (copied >= target) {
                        /* Do not sleep, just process backlog. */
                        release_sock(sk);
                        lock_sock(sk);
                } else {
                        sk_wait_data(sk, &timeo, last);
                }

                if ((flags & MSG_PEEK) &&
                    (peek_seq - copied - urg_hole != tp->copied_seq)) {
                        net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n",
                                            current->comm,
                                            task_pid_nr(current));
                        peek_seq = tp->copied_seq;
                }
                continue;
found_ok_skb://copy data out of this skb on sk_receive_queue
                /* Ok so how much can we use? */
                used = skb->len - offset;
                if (len < used)
                        used = len;

                /* Do we have urgent data here? */
                //handle urgent data first: unless the SO_OOBINLINE (SOCK_URGINLINE) option
                //is set, the urgent byte is skipped here because it is read through a
                //separate out-of-band path
                if (tp->urg_data) {
                        u32 urg_offset = tp->urg_seq - *seq;
                        if (urg_offset < used) {
                                if (!urg_offset) {
                                        if (!sock_flag(sk, SOCK_URGINLINE)) {
                                                WRITE_ONCE(*seq, *seq + 1);
                                                urg_hole++;
                                                offset++;
                                                used--;
                                                if (!used)
                                                        goto skip_copy;
                                        }
                                } else
                                        used = urg_offset;
                        }
                }

                if (!(flags & MSG_TRUNC)) {
                        //copy the data from kernel space into the user buffer
                        err = skb_copy_datagram_msg(skb, offset, msg, used);
                        if (err) {
                                /* Exception. Bailout! */
                                if (!copied)
                                        copied = -EFAULT;
                                break;
                        }
                }

                WRITE_ONCE(*seq, *seq + used);
                copied += used;//update the number of bytes copied so far
                len -= used;//update the number of bytes still to copy

                tcp_rcv_space_adjust(sk);//re-tune the TCP receive buffer space

skip_copy:
                if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
                        tp->urg_data = 0;
                        tcp_fast_path_check(sk);//urgent data fully consumed: re-enable the fast path
                }

                if (TCP_SKB_CB(skb)->has_rxtstamp) {
                        tcp_update_recv_tstamps(skb, &tss);
                        cmsg_flags |= 2;
                }

                if (used + offset < skb->len)
                        continue;

                if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
                        goto found_fin_ok;
                if (!(flags & MSG_PEEK))
                        sk_eat_skb(sk, skb);
                continue;

found_fin_ok://we hit a FIN
                /* Process the FIN. */
                WRITE_ONCE(*seq, *seq + 1);//the FIN consumes one sequence number
                if (!(flags & MSG_PEEK))
                        sk_eat_skb(sk, skb);//unlink and free the skb from the receive queue
                break;
        } while (len > 0);

        /* According to UNIX98, msg_name/msg_namelen are ignored
         * on connected socket. I was just happy when found this 8) --ANK
         */

        /* Clean up data we have read: This will do ACK frames. */
        tcp_cleanup_rbuf(sk, copied);

        release_sock(sk);

        if (cmsg_flags) {
                if (cmsg_flags & 2)
                        tcp_recv_timestamp(msg, sk, &tss);
                if (cmsg_flags & 1) {
                        inq = tcp_inq_hint(sk);
                        put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq);
                }
        }

        return copied;

out:
        release_sock(sk);
        return err;

recv_urg:
        err = tcp_recv_urg(sk, msg, len, flags);
        goto out;

recv_sndq:
        err = tcp_peek_sndq(sk, msg, len);
        goto out;
}
EXPORT_SYMBOL(tcp_recvmsg);

To summarize the transport-layer TCP receive path: tcp_v4_rcv() looks up the sock, tcp_v4_do_rcv() dispatches on the socket state (tcp_rcv_established() for the established fast/slow path, tcp_rcv_state_process() for everything else), and tcp_recvmsg() finally copies the queued data to user space.

Most of the above covered the receive path. On the transmit side, the routing lookup goes through ip_route_output_flow.
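As a preview of that path, here is a rough sketch of ip_route_output_flow() as I read it in net/ipv4/route.c: it resolves the route with __ip_route_output_key() and then lets xfrm (IPsec) rewrite the result when the flow specifies a protocol:

// kernel/net/ipv4/route.c (abridged sketch)
struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
                                    const struct sock *sk)
{
        struct rtable *rt = __ip_route_output_key(net, flp4); /* routing-table lookup */

        if (IS_ERR(rt))
                return rt;

        if (flp4->flowi4_proto) {
                flp4->flowi4_oif = rt->dst.dev->ifindex;
                rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
                                                        flowi4_to_flowi(flp4),
                                                        sk, 0);
        }

        return rt;
}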
