Linux 内核中定义了众多协议簇,在 include/linux/socket.h中定义如下:
/* Supported address families. */
#define AF_UNSPEC 0
#define AF_UNIX 1 /* Unix domain sockets */
#define AF_LOCAL 1 /* POSIX name for AF_UNIX */
#define AF_INET 2 /* Internet IP Protocol */
#define AF_AX25 3 /* Amateur Radio AX.25 */
#define AF_IPX 4 /* Novell IPX */
#define AF_APPLETALK 5 /* AppleTalk DDP */
#define AF_NETROM 6 /* Amateur Radio NET/ROM */
#define AF_BRIDGE 7 /* Multiprotocol bridge */
#define AF_ATMPVC 8 /* ATM PVCs */
#define AF_X25 9 /* Reserved for X.25 project */
#define AF_INET6 10 /* IP version 6 */
#define AF_ROSE 11 /* Amateur Radio X.25 PLP */
#define AF_DECnet 12 /* Reserved for DECnet project */
#define AF_NETBEUI 13 /* Reserved for 802.2LLC project*/
#define AF_SECURITY 14 /* Security callback pseudo AF */
#define AF_KEY 15 /* PF_KEY key management API */
#define AF_NETLINK 16
#define AF_ROUTE AF_NETLINK /* Alias to emulate 4.4BSD */
#define AF_PACKET 17 /* Packet family */
#define AF_ASH 18 /* Ash */
#define AF_ECONET 19 /* Acorn Econet */
#define AF_ATMSVC 20 /* ATM SVCs */
#define AF_RDS 21 /* RDS sockets */
#define AF_SNA 22 /* Linux SNA Project (nutters!) */
#define AF_IRDA 23 /* IRDA sockets */
#define AF_PPPOX 24 /* PPPoX sockets */
#define AF_WANPIPE 25 /* Wanpipe API Sockets */
#define AF_LLC 26 /* Linux LLC */
#define AF_IB 27 /* Native InfiniBand address */
#define AF_MPLS 28 /* MPLS */
#define AF_CAN 29 /* Controller Area Network */
#define AF_TIPC 30 /* TIPC sockets */
#define AF_BLUETOOTH 31 /* Bluetooth sockets */
#define AF_IUCV 32 /* IUCV sockets */
#define AF_RXRPC 33 /* RxRPC sockets */
#define AF_ISDN 34 /* mISDN sockets */
#define AF_PHONET 35 /* Phonet sockets */
#define AF_IEEE802154 36 /* IEEE802154 sockets */
#define AF_CAIF 37 /* CAIF sockets */
#define AF_ALG 38 /* Algorithm sockets */
#define AF_NFC 39 /* NFC sockets */
#define AF_VSOCK 40 /* vSockets */
#define AF_KCM 41 /* Kernel Connection Multiplexor*/
#define AF_QIPCRTR 42 /* Qualcomm IPC Router */
#define AF_SMC 43 /* smc sockets: reserve number for
* PF_SMC protocol family that
* reuses AF_INET address family
*/
#define AF_XDP 44 /* XDP sockets */
#define AF_MCTP 45 /* Management component
* transport protocol
*/
#define AF_MAX 46 /* For now.. */
AF_INET 协议簇中包含了常用的IPV4协议,如TCP/UDP/IP/ICMP等。
注册
INET初始化过程中,会向socket layer注册一个struct net_proto_family 类型结构体. socket layer在创建sock时,会调用结构体中定义的create函数创建sock, 结构体定义如下:
static const struct net_proto_family inet_family_ops = {
.family = PF_INET,
.create = inet_create,
.owner = THIS_MODULE,
};
struct proto_ops
INET层支持常用的三种类型的sock: SOCK_STREAM,SOCK_DGRAM,SOCK_RAW。
enum sock_type {
SOCK_STREAM = 1,
SOCK_DGRAM = 2,
SOCK_RAW = 3,
SOCK_RDM = 4,
SOCK_SEQPACKET = 5,
SOCK_DCCP = 6,
SOCK_PACKET = 10,
};
对应这三种sock,分别定义了一个struct proto_ops类型的结构体:
const struct proto_ops inet_stream_ops = {
.family = PF_INET,
.owner = THIS_MODULE,
.release = inet_release,
.bind = inet_bind,
.connect = inet_stream_connect,
.socketpair = sock_no_socketpair,
.accept = inet_accept,
.getname = inet_getname,
.poll = tcp_poll,
.ioctl = inet_ioctl,
.gettstamp = sock_gettstamp,
.listen = inet_listen,
.shutdown = inet_shutdown,
.setsockopt = sock_common_setsockopt,
.getsockopt = sock_common_getsockopt,
.sendmsg = inet_sendmsg,
.recvmsg = inet_recvmsg,
#ifdef CONFIG_MMU
.mmap = tcp_mmap,
#endif
.sendpage = inet_sendpage,
.splice_read = tcp_splice_read,
.read_sock = tcp_read_sock,
.sendmsg_locked = tcp_sendmsg_locked,
.sendpage_locked = tcp_sendpage_locked,
.peek_len = tcp_peek_len,
#ifdef CONFIG_COMPAT
.compat_ioctl = inet_compat_ioctl,
#endif
.set_rcvlowat = tcp_set_rcvlowat,
};
const struct proto_ops inet_dgram_ops = {
.family = PF_INET,
.owner = THIS_MODULE,
.release = inet_release,
.bind = inet_bind,
.connect = inet_dgram_connect,
.socketpair = sock_no_socketpair,
.accept = sock_no_accept,
.getname = inet_getname,
.poll = udp_poll,
.ioctl = inet_ioctl,
.gettstamp = sock_gettstamp,
.listen = sock_no_listen,
.shutdown = inet_shutdown,
.setsockopt = sock_common_setsockopt,
.getsockopt = sock_common_getsockopt,
.sendmsg = inet_sendmsg,
.read_sock = udp_read_sock,
.recvmsg = inet_recvmsg,
.mmap = sock_no_mmap,
.sendpage = inet_sendpage,
.set_peek_off = sk_set_peek_off,
#ifdef CONFIG_COMPAT
.compat_ioctl = inet_compat_ioctl,
#endif
};
static const struct proto_ops inet_sockraw_ops = {
.family = PF_INET,
.owner = THIS_MODULE,
.release = inet_release,
.bind = inet_bind,
.connect = inet_dgram_connect,
.socketpair = sock_no_socketpair,
.accept = sock_no_accept,
.getname = inet_getname,
.poll = datagram_poll,
.ioctl = inet_ioctl,
.gettstamp = sock_gettstamp,
.listen = sock_no_listen,
.shutdown = inet_shutdown,
.setsockopt = sock_common_setsockopt,
.getsockopt = sock_common_getsockopt,
.sendmsg = inet_sendmsg,
.recvmsg = inet_recvmsg,
.mmap = sock_no_mmap,
.sendpage = inet_sendpage,
#ifdef CONFIG_COMPAT
.compat_ioctl = inet_compat_ioctl,
#endif
};
这些结构体最终会赋值给struct socket结构体的成员struct proto_ops *ops,当socket系统调用发生时,会首先调用这些结构成成员函数对socket进行操作。
struct proto
对支持的每一个协议,都定义了一个sruct proto结构体,这个结构包含特定协议相关的数据及处理函数,具体定义如下:
/* Networking protocol blocks we attach to sockets.
* socket layer -> transport layer interface
*/
struct proto {
void (*close)(struct sock *sk,
long timeout);
int (*pre_connect)(struct sock *sk,
struct sockaddr *uaddr,
int addr_len);
int (*connect)(struct sock *sk,
struct sockaddr *uaddr,
int addr_len);
......
unsigned int __percpu *orphan_count;
struct request_sock_ops *rsk_prot;
struct timewait_sock_ops *twsk_prot;
union {
struct inet_hashinfo *hashinfo;
struct udp_table *udp_table;
struct raw_hashinfo *raw_hash;
struct smc_hashinfo *smc_hash;
} h;
struct module *owner;
char name[32];
struct list_head node;
#ifdef SOCK_REFCNT_DEBUG
atomic_t socks;
#endif
int (*diag_destroy)(struct sock *sk, int err);
}
每种协议对应一个:
struct proto tcp_prot = {
.name = "TCP",
.owner = THIS_MODULE,
.close = tcp_close,
.pre_connect = tcp_v4_pre_connect,
.connect = tcp_v4_connect,
.disconnect = tcp_disconnect,
.accept = inet_csk_accept,
.ioctl = tcp_ioctl,
.init = tcp_v4_init_sock,
.destroy = tcp_v4_destroy_sock,
.shutdown = tcp_shutdown,
.setsockopt = tcp_setsockopt,
.getsockopt = tcp_getsockopt,
.bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt,
.keepalive = tcp_set_keepalive,
.recvmsg = tcp_recvmsg,
.sendmsg = tcp_sendmsg,
.sendpage = tcp_sendpage,
.backlog_rcv = tcp_v4_do_rcv,
.release_cb = tcp_release_cb,
.hash = inet_hash,
.unhash = inet_unhash,
.get_port = inet_csk_get_port,
#ifdef CONFIG_BPF_SYSCALL
.psock_update_sk_prot = tcp_bpf_update_proto,
#endif
.enter_memory_pressure = tcp_enter_memory_pressure,
.leave_memory_pressure = tcp_leave_memory_pressure,
.stream_memory_free = tcp_stream_memory_free,
.sockets_allocated = &tcp_sockets_allocated,
.orphan_count = &tcp_orphan_count,
.memory_allocated = &tcp_memory_allocated,
.memory_pressure = &tcp_memory_pressure,
.sysctl_mem = sysctl_tcp_mem,
.sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
.sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
.max_header = MAX_TCP_HEADER,
.obj_size = sizeof(struct tcp_sock),
.slab_flags = SLAB_TYPESAFE_BY_RCU,
.twsk_prot = &tcp_timewait_sock_ops,
.rsk_prot = &tcp_request_sock_ops,
.h.hashinfo = &tcp_hashinfo,
.no_autobind = true,
.diag_destroy = tcp_abort,
};
struct proto udp_prot = {
.name = "UDP",
.owner = THIS_MODULE,
.close = udp_lib_close,
.pre_connect = udp_pre_connect,
.connect = ip4_datagram_connect,
.disconnect = udp_disconnect,
.ioctl = udp_ioctl,
.init = udp_init_sock,
.destroy = udp_destroy_sock,
.setsockopt = udp_setsockopt,
.getsockopt = udp_getsockopt,
.sendmsg = udp_sendmsg,
.recvmsg = udp_recvmsg,
.sendpage = udp_sendpage,
.release_cb = ip4_datagram_release_cb,
.hash = udp_lib_hash,
.unhash = udp_lib_unhash,
.rehash = udp_v4_rehash,
.get_port = udp_v4_get_port,
#ifdef CONFIG_BPF_SYSCALL
.psock_update_sk_prot = udp_bpf_update_proto,
#endif
.memory_allocated = &udp_memory_allocated,
.sysctl_mem = sysctl_udp_mem,
.sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min),
.sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min),
.obj_size = sizeof(struct udp_sock),
.h.udp_table = &udp_table,
.diag_destroy = udp_abort,
};
struct proto raw_prot = {
.name = "RAW",
.owner = THIS_MODULE,
.close = raw_close,
.destroy = raw_destroy,
.connect = ip4_datagram_connect,
.disconnect = __udp_disconnect,
.ioctl = raw_ioctl,
.init = raw_sk_init,
.setsockopt = raw_setsockopt,
.getsockopt = raw_getsockopt,
.sendmsg = raw_sendmsg,
.recvmsg = raw_recvmsg,
.bind = raw_bind,
.backlog_rcv = raw_rcv_skb,
.release_cb = ip4_datagram_release_cb,
.hash = raw_hash_sk,
.unhash = raw_unhash_sk,
.obj_size = sizeof(struct raw_sock),
.useroffset = offsetof(struct raw_sock, filter),
.usersize = sizeof_field(struct raw_sock, filter),
.h.raw_hash = &raw_v4_hashinfo,
#ifdef CONFIG_COMPAT
.compat_ioctl = compat_raw_ioctl,
#endif
.diag_destroy = raw_abort,
};
struct inet_protosw
这个结构体没有实质上的意义,只是将创建一个sock所需的要信息组合在一起。
struct inet_protosw {
struct list_head list;
/* These two fields form the lookup key. */
unsigned short type; /* This is the 2nd argument to socket(2).*/
unsigned short protocol; /* This is the L4 protocol number. */
struct *prot;
const struct proto_ops *ops;
unsigned char flags; /* See INET_PROTOSW_* below. */
};
static struct inet_protosw inetsw_array[] =
{
{
.type = SOCK_STREAM,
.protocol = IPPROTO_TCP,
.prot = &tcp_prot,
.ops = &inet_stream_ops,
.flags = INET_PROTOSW_PERMANENT |
INET_PROTOSW_ICSK,
},
{
.type = SOCK_DGRAM,
.protocol = IPPROTO_UDP,
.prot = &udp_prot,
.ops = &inet_dgram_ops,
.flags = INET_PROTOSW_PERMANENT,
},
{
.type = SOCK_DGRAM,
.protocol = IPPROTO_ICMP,
.prot = &ping_prot,
.ops = &inet_sockraw_ops,
.flags = INET_PROTOSW_REUSE,
},
{
.type = SOCK_RAW,
.protocol = IPPROTO_IP, /* wild card */
.prot = &raw_prot,
.ops = &inet_sockraw_ops,
.flags = INET_PROTOSW_REUSE,
}
};
数组inetsw_array中的每一项都会被插入inetsw链表数组:
/* Register the socket-side information for inet_create. */
for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r)
INIT_LIST_HEAD(r);
for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
inet_register_protosw(q);
struct net_protocol
/* This is used to register protocols. */
struct net_protocol {
int (*early_demux)(struct sk_buff *skb);
int (*early_demux_handler)(struct sk_buff *skb);
int (*handler)(struct sk_buff *skb);
/* This returns an error if we weren't able to handle the error. */
int (*err_handler)(struct sk_buff *skb, u32 info);
unsigned int no_policy:1,
/* does the protocol do more stringent
* icmp tag validation than simple
* socket lookup?
*/
icmp_strict_tag_validation:1;
};
每个协议对应一个,主要用于处理skb,可以看作是与IP层 的接口。
/* thinking of making this const? Don't.
* early_demux can change based on sysctl.
*/
static struct net_protocol tcp_protocol = {
.early_demux = tcp_v4_early_demux,
.early_demux_handler = tcp_v4_early_demux,
.handler = tcp_v4_rcv,
.err_handler = tcp_v4_err,
.no_policy = 1,
.icmp_strict_tag_validation = 1,
};
/* thinking of making this const? Don't.
* early_demux can change based on sysctl.
*/
static struct net_protocol udp_protocol = {
.early_demux = udp_v4_early_demux,
.early_demux_handler = udp_v4_early_demux,
.handler = udp_rcv,
.err_handler = udp_err,
.no_policy = 1,
};
static const struct net_protocol icmp_protocol = {
.handler = icmp_rcv,
.err_handler = icmp_err,
.no_policy = 1,
};