A Brief Analysis of Full NAT in the Kernel

海滨公园 2022-06-24

DPVS borrows its design from LVS, whose core lies in the netfilter framework; the author once built a similar FULLNAT to support Layer-3 portal authentication.

   The IPVS source code lives under net/netfilter/ipvs in the kernel tree. Since LVS is built on top of the netfilter framework, the first thing to look at is which hook points it attaches its handlers to. IPVS registers its handlers at three hook points: NF_INET_LOCAL_IN, NF_INET_FORWARD and NF_INET_POST_ROUTING, and it supports both IPv4 and IPv6.

static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
	/* After packet filtering, forward packet through VS/DR, VS/TUN,
	 * or VS/NAT(change destination), so that filtering rules can be
	 * applied to IPVS. */
	{
		.hook		= ip_vs_in,
		.owner		= THIS_MODULE,
		.pf		= PF_INET,
		.hooknum	= NF_INET_LOCAL_IN,
		.priority	= 100,
	},
	/* After packet filtering, change source only for VS/NAT */
	{
		.hook		= ip_vs_out,
		.owner		= THIS_MODULE,
		.pf		= PF_INET,
		.hooknum	= NF_INET_FORWARD,
		.priority	= 100,
	},
	/* After packet filtering (but before ip_vs_out_icmp), catch icmp
	 * destined for 0.0.0.0/0, which is for incoming IPVS connections */
	{
		.hook		= ip_vs_forward_icmp,
		.owner		= THIS_MODULE,
		.pf		= PF_INET,
		.hooknum	= NF_INET_FORWARD,
		.priority	= 99,
	},
	/* Before the netfilter connection tracking, exit from POST_ROUTING */
	{
		.hook		= ip_vs_post_routing,
		.owner		= THIS_MODULE,
		.pf		= PF_INET,
		.hooknum	= NF_INET_POST_ROUTING,
		.priority	= NF_IP_PRI_NAT_SRC - 1,
	},
#ifdef CONFIG_IP_VS_IPV6
	/* After packet filtering, forward packet through VS/DR, VS/TUN,
	 * or VS/NAT(change destination), so that filtering rules can be
	 * applied to IPVS. */
	{
		.hook		= ip_vs_in,
		.owner		= THIS_MODULE,
		.pf		= PF_INET6,
		.hooknum	= NF_INET_LOCAL_IN,
		.priority	= 100,
	},
	/* After packet filtering, change source only for VS/NAT */
	{
		.hook		= ip_vs_out,
		.owner		= THIS_MODULE,
		.pf		= PF_INET6,
		.hooknum	= NF_INET_FORWARD,
		.priority	= 100,
	},
	/* After packet filtering (but before ip_vs_out_icmp), catch icmp
	 * destined for 0.0.0.0/0, which is for incoming IPVS connections */
	{
		.hook		= ip_vs_forward_icmp_v6,
		.owner		= THIS_MODULE,
		.pf		= PF_INET6,
		.hooknum	= NF_INET_FORWARD,
		.priority	= 99,
	},
	/* Before the netfilter connection tracking, exit from POST_ROUTING */
	{
		.hook		= ip_vs_post_routing,
		.owner		= THIS_MODULE,
		.pf		= PF_INET6,
		.hooknum	= NF_INET_POST_ROUTING,
		.priority	= NF_IP6_PRI_NAT_SRC - 1,
	},
#endif
};
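For completeness, here is a minimal sketch of how this hook table is handed to netfilter when the module initializes. In kernels of this vintage IPVS registers all entries in a single nf_register_hooks() call from its init path in ip_vs_core.c; the wrapper names below are illustrative only.

/* Minimal sketch (illustrative wrapper names): registering the hook table.
 * nf_register_hooks() attaches every ip_vs_ops[] entry to its hook point;
 * nf_unregister_hooks() removes them again on module exit.
 * Needs <linux/module.h> and <linux/netfilter.h>. */
static int __init ip_vs_hooks_demo_init(void)
{
	int ret;

	ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
	if (ret < 0)
		pr_err("ip_vs: can't register hooks.\n");
	return ret;
}

static void __exit ip_vs_hooks_demo_cleanup(void)
{
	nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
}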


4.2  The ip_vs_in function

  When a remote client sends traffic to the server, i.e. the LVS director, the packet's destination IP address is local to that host, so the packet first enters the NF_INET_LOCAL_IN hook point for local processing.

  


static unsigned int
ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
const struct net_device *in, const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
struct ip_vs_iphdr iph;
struct ip_vs_protocol *pp;
struct ip_vs_conn *cp;
int ret, restart, af, pkts;
/* determine the address family: IPv4 or IPv6 */
af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6;
/* extract the IP header */
ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);

/*
* Big tappo: only PACKET_HOST, including loopback for local client
* Don't handle local packets on IPv6 for now
* (check whether the packet is actually addressed to this host) */
if (unlikely(skb->pkt_type != PACKET_HOST)) {
IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s ignored\n",
skb->pkt_type,
iph.protocol,
IP_VS_DBG_ADDR(af, &iph.daddr));
return NF_ACCEPT;
}
/* IPv6-specific ICMPv6 handling (skipped in this analysis) */
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6) {
if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
int related, verdict = ip_vs_in_icmp_v6(skb, &related, hooknum);

if (related)
return verdict;
ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
}
} else
#endif
/* is this an ICMP packet? */
if (unlikely(iph.protocol == IPPROTO_ICMP)) {
int related, verdict = ip_vs_in_icmp(skb, &related, hooknum);

if (related)
return verdict;
ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
}

/* Protocol supported? LVS currently handles TCP, UDP, SCTP, AH and ESP */
pp = ip_vs_proto_get(iph.protocol);
if (unlikely(!pp))
return NF_ACCEPT;

/*
* Check if the packet belongs to an existing connection entry. For the
* first packet of a flow this lookup returns NULL. The callback here is
* ip_vs_conn_in_get_proto(), which searches by source IP, destination IP,
* source port, destination port and protocol. */
cp = pp->conn_in_get(af, skb, pp, &iph, iph.len, 0);
/* for a newly arrived packet no connection is found (cp == NULL), so the
* branch below is taken */
if (unlikely(!cp)) {
int v;

/* For local client packets it could be a response; for TCP the callback
* is ip_vs_conn_out_get_proto(). Newer kernels no longer handle the
* response case here, so it is not analyzed further. */
cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
if (cp)
/* the packet matched the outbound (IN->OUT) direction, i.e. it is a
* reply, so handle_response() performs the SNAT */
return handle_response(af, skb, pp, cp, iph.len);
/* no connection was found, so create a new one; for TCP the callback
* is tcp_conn_schedule() */
if (!pp->conn_schedule(af, skb, pp, &v, &cp))
return v;
}

if (unlikely(!cp)) {
/* sorry, all this trouble for a no-hit :) -- the connection could not be
* created either, so let the packet continue as normal (NF_ACCEPT) */
IP_VS_DBG_PKT(12, pp, skb, 0,
"packet continues traversal as normal");
return NF_ACCEPT;
}

IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet");

/* Check the server status: if cp->dest is set but the real server is no
* longer available, drop the packet */
if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
/* the destination server is not available */

if (sysctl_ip_vs_expire_nodest_conn) {
/* try to expire the connection immediately */
ip_vs_conn_expire_now(cp);
}
/* don't restart its timer, and silently
drop the packet; __ip_vs_conn_put() drops the reference taken above */
__ip_vs_conn_put(cp);
return NF_DROP;
}
/* update the input statistics for this connection */
ip_vs_in_stats(cp, skb);
/* update the connection state for direction IP_VS_DIR_INPUT; for TCP this
* ends up in tcp_state_transition() */
restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
if (cp->packet_xmit)
ret = cp->packet_xmit(skb, cp, pp); /* invoke the transmit callback; for NAT mode this is ip_vs_nat_xmit() */
/* do not touch skb anymore */
else {
IP_VS_DBG_RL("warning: packet_xmit is null");
ret = NF_ACCEPT;
}

/* Increase its packet counter and check if it is needed
* to be synchronized
*
* Sync connection if it is about to close to
* encourage the standby servers to update the connections timeout
* (bump the in_pkts counter and sync state to the backup director) */
pkts = atomic_add_return(1, &cp->in_pkts);
if (af == AF_INET && (ip_vs_sync_state & IP_VS_STATE_MASTER) &&
cp->protocol == IPPROTO_SCTP) {
if ((cp->state == IP_VS_SCTP_S_ESTABLISHED &&
(atomic_read(&cp->in_pkts) %
sysctl_ip_vs_sync_threshold[1]
== sysctl_ip_vs_sync_threshold[0])) ||
(cp->old_state != cp->state &&
((cp->state == IP_VS_SCTP_S_CLOSED) ||
(cp->state == IP_VS_SCTP_S_SHUT_ACK_CLI) ||
(cp->state == IP_VS_SCTP_S_SHUT_ACK_SER)))) {
ip_vs_sync_conn(cp); /* sync the connection to the backup LVS server (SCTP case) */
goto out;
}
}

if (af == AF_INET &&
(ip_vs_sync_state & IP_VS_STATE_MASTER) &&
(((cp->protocol != IPPROTO_TCP ||
cp->state == IP_VS_TCP_S_ESTABLISHED) &&
(pkts % sysctl_ip_vs_sync_threshold[1]
== sysctl_ip_vs_sync_threshold[0])) ||
((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) &&
((cp->state == IP_VS_TCP_S_FIN_WAIT) ||
(cp->state == IP_VS_TCP_S_CLOSE_WAIT) ||
(cp->state == IP_VS_TCP_S_TIME_WAIT)))))
ip_vs_sync_conn(cp); /* sync the connection to the backup LVS server (TCP and other protocols) */

out:
cp->old_state = cp->state;

ip_vs_conn_put(cp);
return ret;
}
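A note on the conn_in_get() lookup mentioned above: connections live in the global ip_vs_conn_tab hash table, bucketed by protocol plus client address and port, and the full 5-tuple is then compared inside the bucket. The helper below is only an approximation of ip_vs_conn_hashkey() (IPv4 case, reusing its globals from ip_vs_conn.c) to show which fields identify a connection.

/* Approximate sketch of the connection hash key behind pp->conn_in_get():
 * protocol + client address + client port, folded into an ip_vs_conn_tab
 * index.  Modelled on ip_vs_conn_hashkey(); ip_vs_conn_rnd and
 * ip_vs_conn_tab_mask are the globals defined in ip_vs_conn.c. */
#include <linux/jhash.h>

static unsigned int demo_conn_hashkey(unsigned int proto,
				      const union nf_inet_addr *addr,
				      __be16 port)
{
	return jhash_3words((__force u32)addr->ip, (__force u32)port,
			    proto, ip_vs_conn_rnd) & ip_vs_conn_tab_mask;
}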

 


4.2.1  tcp_conn_schedule

This function calls the scheduling-policy function, which selects the actual real server according to the configured scheduler.

static int
tcp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb,
struct ip_vs_proto_data *pd,
int *verdict, struct ip_vs_conn **cpp,
struct ip_vs_iphdr *iph)
{
struct ip_vs_service *svc;
struct tcphdr _tcph, *th;
__be16 _ports[2], *ports = NULL;

/* In the event of icmp, we're only guaranteed to have the first 8
* bytes of the transport header, so we only check the rest of the
* TCP packet for non-ICMP packets
*/
if (likely(!ip_vs_iph_icmp(iph))) {
th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph);
if (th) {
if (th->rst || !(sysctl_sloppy_tcp(ipvs) || th->syn))
return 1;
ports = &th->source;
}
} else {
ports = skb_header_pointer(
skb, iph->len, sizeof(_ports), &_ports);
}

if (!ports) {
*verdict = NF_DROP;
return 0;
}

/* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */
rcu_read_lock();

if (likely(!ip_vs_iph_inverse(iph)))
svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol,
&iph->daddr, ports[1]);
else
svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol,
&iph->saddr, ports[0]);

if (svc) {
int ignored;

if (ip_vs_todrop(ipvs)) {
/*
* It seems that we are very loaded.
* We have to drop this packet :(
*/
rcu_read_unlock();
*verdict = NF_DROP;
return 0;
}

/* An ip_vs_service instance has been found above by protocol number,
* destination address and destination port.
* Let the virtual server select a real server for the
* incoming connection, and create a connection entry.
*/
*cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph);
if (!*cpp && ignored <= 0) {
if (!ignored)
*verdict = ip_vs_leave(svc, skb, pd, iph);
else
*verdict = NF_DROP;
rcu_read_unlock();
return 0;
}
}
rcu_read_unlock();
/* NF_ACCEPT */
return 1;
}
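The svc lookup above goes through ip_vs_service_find(). Roughly, it tries a fwmark-based service first and then falls back to the (protocol, VIP, vport) hash; the sketch below is a simplification of the logic in ip_vs_ctl.c (the FTP/port-zero fallbacks are only hinted at and their conditions are simplified), so treat the details as approximate.

/* Rough sketch of the lookup order inside ip_vs_service_find():
 * fwmark-based services first, then the (protocol, vip, vport) hash,
 * then the FTP / catch-all fallbacks (condition simplified here). */
static struct ip_vs_service *
demo_service_find(struct netns_ipvs *ipvs, int af, __u32 fwmark,
		  __u16 protocol, const union nf_inet_addr *vaddr,
		  __be16 vport)
{
	struct ip_vs_service *svc = NULL;

	if (fwmark)
		svc = __ip_vs_svc_fwm_find(ipvs, af, fwmark);
	if (!svc)
		svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, vport);
	if (!svc && protocol == IPPROTO_TCP &&
	    atomic_read(&ipvs->ftpsvc_counter))
		/* the real code also checks the port range before this */
		svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, FTPPORT);
	return svc;
}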

 

/*
* IPVS main scheduling function
* It selects a server according to the virtual service, and
* creates a connection entry.
* Protocols supported: TCP, UDP
*
* Usage of *ignored
*
* 1 : protocol tried to schedule (eg. on SYN), found svc but the
* svc/scheduler decides that this packet should be accepted with
* NF_ACCEPT because it must not be scheduled.
*
* 0 : scheduler can not find destination, so try bypass or
* return ICMP and then NF_DROP (ip_vs_leave).
*
* -1 : scheduler tried to schedule but fatal error occurred, eg.
* ip_vs_conn_new failure (ENOMEM) or ip_vs_sip_fill_param
* failure such as missing Call-ID, ENOMEM on skb_linearize
* or pe_data. In this case we should return NF_DROP without
* any attempts to send ICMP with ip_vs_leave.
*/
struct ip_vs_conn *
ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
struct ip_vs_proto_data *pd, int *ignored,
struct ip_vs_iphdr *iph)
{
struct ip_vs_protocol *pp = pd->pp;
struct ip_vs_conn *cp = NULL;
struct ip_vs_scheduler *sched;
struct ip_vs_dest *dest;
__be16 _ports[2], *pptr, cport, vport;
const void *caddr, *vaddr;
unsigned int flags;

*ignored = 1;
/*
* IPv6 frags, only the first hit here.
*/
pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph);
if (pptr == NULL)
return NULL;

if (likely(!ip_vs_iph_inverse(iph))) {
cport = pptr[0];
caddr = &iph->saddr;
vport = pptr[1];
vaddr = &iph->daddr;
} else {
cport = pptr[1];
caddr = &iph->daddr;
vport = pptr[0];
vaddr = &iph->saddr;
}

/*
* FTPDATA needs this check when using local real server.
* Never schedule Active FTPDATA connections from real server.
* For LVS-NAT they must be already created. For other methods
* with persistence the connection is created on SYN+ACK.
*/
if (cport == FTPDATA) {
IP_VS_DBG_PKT(12, svc->af, pp, skb, iph->off,
"Not scheduling FTPDATA");
return NULL;
}

/*
* Do not schedule replies from local real server.
*/
if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK)) {
iph->hdr_flags ^= IP_VS_HDR_INVERSE;
cp = pp->conn_in_get(svc->ipvs, svc->af, skb, iph);
iph->hdr_flags ^= IP_VS_HDR_INVERSE;

if (cp) {
IP_VS_DBG_PKT(12, svc->af, pp, skb, iph->off,
"Not scheduling reply for existing"
" connection");
__ip_vs_conn_put(cp);
return NULL;
}
}

/*
* Persistent service
*/
if (svc->flags & IP_VS_SVC_F_PERSISTENT)
return ip_vs_sched_persist(svc, skb, cport, vport, ignored,
iph);

*ignored = 0;

/*
* Non-persistent service
*/
if (!svc->fwmark && vport != svc->port) {
if (!svc->port)
pr_err("Schedule: port zero only supported "
"in persistent services, "
"check your ipvs configuration\n");
return NULL;
}

sched = rcu_dereference(svc->scheduler);
if (sched) {
/* read svc->sched_data after svc->scheduler */
smp_rmb();
dest = sched->schedule(svc, skb, iph);
} else {
dest = NULL;
}
if (dest == NULL) {
IP_VS_DBG(1, "Schedule: no dest found.\n");
return NULL;
}

flags = (svc->flags & IP_VS_SVC_F_ONEPACKET
&& iph->protocol == IPPROTO_UDP) ?
IP_VS_CONN_F_ONE_PACKET : 0;

/*
* Create a connection entry.
*/
{
struct ip_vs_conn_param p;

ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol,
caddr, cport, vaddr, vport, &p);
cp = ip_vs_conn_new(&p, dest->af, &dest->addr,
dest->port ? dest->port : vport,
flags, dest, skb->mark);
if (!cp) {
*ignored = -1;
return NULL;
}
}

IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u "
"d:%s:%u conn->flags:%X conn->refcnt:%d\n",
ip_vs_fwd_tag(cp),
IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport),
IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport),
cp->flags, atomic_read(&cp->refcnt));

ip_vs_conn_stats(cp, svc);
return cp;
}
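The dest = sched->schedule(svc, skb, iph) call above dispatches to whichever scheduler the virtual service was configured with (rr, wrr, lc, sh, ...). Below is a stripped-down sketch in the spirit of the round-robin scheduler, with its rotation state and locking omitted, just to show the shape of a schedule() callback.

/* Illustrative only: a minimal schedule() callback in the spirit of
 * ip_vs_rr_schedule().  It walks svc->destinations and returns the first
 * usable real server; the real scheduler also remembers where it stopped. */
static struct ip_vs_dest *
demo_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
	      struct ip_vs_iphdr *iph)
{
	struct ip_vs_dest *dest;

	list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
		if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
		    atomic_read(&dest->weight) > 0)
			return dest;	/* chosen RS; ip_vs_schedule() binds it */
	}
	return NULL;			/* no dest -> "no dest found" path above */
}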

/*
* Create a new connection entry and hash it into the ip_vs_conn_tab
*/
struct ip_vs_conn *
ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af,
const union nf_inet_addr *daddr, __be16 dport, unsigned int flags,
struct ip_vs_dest *dest, __u32 fwmark)
{
struct ip_vs_conn *cp;
struct netns_ipvs *ipvs = p->ipvs;
struct ip_vs_proto_data *pd = ip_vs_proto_data_get(p->ipvs,
p->protocol);

cp = kmem_cache_alloc(ip_vs_conn_cachep, GFP_ATOMIC);
if (cp == NULL) {
IP_VS_ERR_RL("%s(): no memory\n", __func__);
return NULL;
}

INIT_HLIST_NODE(&cp->c_list);
setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp);
cp->ipvs = ipvs;
cp->af = p->af;
cp->daf = dest_af;
cp->protocol = p->protocol;
ip_vs_addr_set(p->af, &cp->caddr, p->caddr);
cp->cport = p->cport;
/* proto should only be IPPROTO_IP if p->vaddr is a fwmark */
ip_vs_addr_set(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af,
&cp->vaddr, p->vaddr);
cp->vport = p->vport;
ip_vs_addr_set(cp->daf, &cp->daddr, daddr);
cp->dport = dport;
cp->flags = flags;
cp->fwmark = fwmark;
if (flags & IP_VS_CONN_F_TEMPLATE && p->pe) {
ip_vs_pe_get(p->pe);
cp->pe = p->pe;
cp->pe_data = p->pe_data;
cp->pe_data_len = p->pe_data_len;
} else {
cp->pe = NULL;
cp->pe_data = NULL;
cp->pe_data_len = 0;
}
spin_lock_init(&cp->lock);

/*
* Set the entry is referenced by the current thread before hashing
* it in the table, so that other thread run ip_vs_random_dropentry
* but cannot drop this entry.
*/
atomic_set(&cp->refcnt, 1);

cp->control = NULL;
atomic_set(&cp->n_control, 0);
atomic_set(&cp->in_pkts, 0);

cp->packet_xmit = NULL;
cp->app = NULL;
cp->app_data = NULL;
/* reset struct ip_vs_seq */
cp->in_seq.delta = 0;
cp->out_seq.delta = 0;

atomic_inc(&ipvs->conn_count);
if (flags & IP_VS_CONN_F_NO_CPORT)
atomic_inc(&ip_vs_conn_no_cport_cnt);

/* Bind the connection with a destination server */
cp->dest = NULL;
ip_vs_bind_dest(cp, dest);

/* Set its state and timeout */
cp->state = 0;
cp->old_state = 0;
cp->timeout = 3*HZ;
cp->sync_endtime = jiffies & ~3UL;

/* Bind its packet transmitter */
#ifdef CONFIG_IP_VS_IPV6
if (p->af == AF_INET6)
ip_vs_bind_xmit_v6(cp);
else
#endif
ip_vs_bind_xmit(cp);

if (unlikely(pd && atomic_read(&pd->appcnt)))
ip_vs_bind_app(cp, pd->pp);

/*
* Allow conntrack to be preserved. By default, conntrack
* is created and destroyed for every packet.
* Sometimes keeping conntrack can be useful for
* IP_VS_CONN_F_ONE_PACKET too.
*/

if (ip_vs_conntrack_enabled(ipvs))
cp->flags |= IP_VS_CONN_F_NFCT;

/* Hash it in the ip_vs_conn_tab finally */
ip_vs_conn_hash(cp);

return cp;
}
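The setup_timer() call above wires up the connection's expiry path: whenever a caller drops its reference via ip_vs_conn_put() (as ip_vs_in() does at the end), the timer is pushed out by cp->timeout, and ip_vs_conn_expire() eventually tears the entry down. A simplified sketch of that put path (the real code also special-cases one-packet connections):

/* Simplified sketch of ip_vs_conn_put(): re-arm the expiry timer installed
 * by setup_timer(&cp->timer, ip_vs_conn_expire, ...) and drop our reference,
 * so an idle connection is destroyed once cp->timeout elapses. */
static void demo_conn_put(struct ip_vs_conn *cp)
{
	/* keep the entry around for another cp->timeout jiffies */
	mod_timer(&cp->timer, jiffies + cp->timeout);
	__ip_vs_conn_put(cp);		/* reference taken by lookup/creation */
}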


/*
* Bind a connection entry with the corresponding packet_xmit.
* Called by ip_vs_conn_new.
*/
static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp)
{
switch (IP_VS_FWD_METHOD(cp)) {
case IP_VS_CONN_F_MASQ:
cp->packet_xmit = ip_vs_nat_xmit;
break;

case IP_VS_CONN_F_TUNNEL:
#ifdef CONFIG_IP_VS_IPV6
if (cp->daf == AF_INET6)
cp->packet_xmit = ip_vs_tunnel_xmit_v6;
else
#endif
cp->packet_xmit = ip_vs_tunnel_xmit;
break;

case IP_VS_CONN_F_DROUTE:
cp->packet_xmit = ip_vs_dr_xmit;
break;

case IP_VS_CONN_F_LOCALNODE:
cp->packet_xmit = ip_vs_null_xmit;
break;

case IP_VS_CONN_F_BYPASS:
cp->packet_xmit = ip_vs_bypass_xmit;
break;
}
}
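This switch is also the natural extension point for the full NAT mode this article opened with: the out-of-tree LVS FULLNAT patch set (which DPVS follows in user space) adds one more forwarding method here. The constant and transmitter names below come from that patch set rather than the mainline kernel, so treat this strictly as a sketch.

/* Sketch of the FULLNAT extension (names from the out-of-tree FULLNAT
 * patch set, not mainline): one extra forwarding method bound to its own
 * transmitter, which rewrites both the destination (VIP -> RS) and the
 * source (client IP -> local IP) so RS replies always return to the
 * director. */
static inline void demo_bind_xmit_fullnat(struct ip_vs_conn *cp)
{
	switch (IP_VS_FWD_METHOD(cp)) {
	case IP_VS_CONN_F_MASQ:
		cp->packet_xmit = ip_vs_nat_xmit;	/* plain NAT (DNAT only) */
		break;
	case IP_VS_CONN_F_FULLNAT:			/* added by the patch set */
		cp->packet_xmit = ip_vs_fnat_xmit;	/* DNAT + SNAT in one pass */
		break;
	default:
		cp->packet_xmit = ip_vs_null_xmit;
		break;
	}
}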

 

 In NAT mode, ip_vs_nat_xmit() is executed to forward the packet to the real server (RS):

/*
* NAT transmitter (only for outside-to-inside nat forwarding)
* Not used for related ICMP
*/
int
ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
struct rtable *rt; /* Route to the other host */
int local, rc, was_input;

EnterFunction(10);

rcu_read_lock();
/* check if it is a connection of no-client-port */
if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
__be16 _pt, *p;

p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt);
if (p == NULL)
goto tx_error;
ip_vs_conn_fill_cport(cp, *p);
IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
}

was_input = rt_is_input_route(skb_rtable(skb));
local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip,
IP_VS_RT_MODE_LOCAL |
IP_VS_RT_MODE_NON_LOCAL |
IP_VS_RT_MODE_RDR, NULL, ipvsh);
if (local < 0)
goto tx_error;
rt = skb_rtable(skb);
/*
* Avoid duplicate tuple in reply direction for NAT traffic
* to local address when connection is sync-ed
*/
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
if (cp->flags & IP_VS_CONN_F_SYNC && local) {
enum ip_conntrack_info ctinfo;
struct nf_conn *ct = nf_ct_get(skb, &ctinfo);

if (ct && !nf_ct_is_untracked(ct)) {
IP_VS_DBG_RL_PKT(10, AF_INET, pp, skb, ipvsh->off,
"ip_vs_nat_xmit(): "
"stopping DNAT to local address");
goto tx_error;
}
}
#endif

/* From world but DNAT to loopback address? */
if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) {
IP_VS_DBG_RL_PKT(1, AF_INET, pp, skb, ipvsh->off,
"ip_vs_nat_xmit(): stopping DNAT to loopback "
"address");
goto tx_error;
}

/* copy-on-write the packet before mangling it */
if (!skb_make_writable(skb, sizeof(struct iphdr)))
goto tx_error;

if (skb_cow(skb, rt->dst.dev->hard_header_len))
goto tx_error;

/* mangle the packet */
if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh))
goto tx_error;
ip_hdr(skb)->daddr = cp->daddr.ip; /* DNAT: replace the destination IP with the real server's address */
ip_send_check(ip_hdr(skb));

IP_VS_DBG_PKT(10, AF_INET, pp, skb, ipvsh->off, "After DNAT");

/* FIXME: when application helper enlarges the packet and the length
is larger than the MTU of outgoing device, there will be still
MTU problem. */

/* Another hack: avoid icmp_send in ip_fragment */
skb->ignore_df = 1;

rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);
rcu_read_unlock();

LeaveFunction(10);
return rc;

tx_error:
kfree_skb(skb);
rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
}
/* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */
static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb,
struct ip_vs_conn *cp, int local)
{
int ret = NF_STOLEN;

skb->ipvs_property = 1;
if (likely(!(cp->flags & IP_VS_CONN_F_NFCT)))
ip_vs_notrack(skb);
else
ip_vs_update_conntrack(skb, cp, 1);

/* Remove the early_demux association unless it's bound for the
* exact same port and address on this host after translation.
*/
if (!local || cp->vport != cp->dport ||
!ip_vs_addr_equal(cp->af, &cp->vaddr, &cp->daddr))
ip_vs_drop_early_demux_sk(skb);

if (!local) {
skb_forward_csum(skb);
NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb,
NULL, skb_dst(skb)->dev, dst_output);
} else
ret = NF_ACCEPT;

return ret;
}

That is, when the packet is forwarded it is re-injected at the NF_INET_LOCAL_OUT hook point (via the NF_HOOK call above); the corresponding ip_vs_out function then performs the matching SNAT on the reply traffic, after which the packet continues through the normal protocol stack.
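For symmetry with the DNAT shown in ip_vs_nat_xmit(), here is a hedged sketch of the matching SNAT done on the reply path by handle_response() (called from ip_vs_out()): the source address of the real server's reply is rewritten back to the VIP before the packet continues toward the client. Error handling is omitted and the snat_handler signature differs slightly between kernel versions.

/* Hedged sketch of the reply-direction SNAT done by handle_response():
 * the L4 snat_handler fixes ports/checksums, then the source IP is
 * rewritten from the real server's address back to the VIP. */
static void demo_snat_reply(struct sk_buff *skb, struct ip_vs_protocol *pp,
			    struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
{
	if (pp->snat_handler && !pp->snat_handler(skb, pp, cp, iph))
		return;				/* mangling failed, bail out */

	ip_hdr(skb)->saddr = cp->vaddr.ip;	/* RS IP -> VIP */
	ip_send_check(ip_hdr(skb));		/* refresh the IP checksum */
}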

 

 

 

 

 
