一次LINUX网络协议栈的探根之旅
公司的公共库中发现一段代码,简单化之,如下
...
/* Simplified excerpt: create a UNIX-domain stream socket and bind it to a filesystem path. */
/* NOTE(review): srv_sun is not zero-initialized in this excerpt. */
struct sockaddr_un srv_sun;
/* NOTE(review): the return value of socket() is not checked here. */
int fd = socket(AF_UNIX, SOCK_STREAM, 0);
srv_sun.sun_family = AF_UNIX;
/* NOTE(review): unbounded copy; safe only while the literal (plus its NUL) fits in sun_path. */
strcpy(srv_sun.sun_path,"/tmp/something_socket");
/* The length handed to bind() is the size of the whole sockaddr_un, not of struct sockaddr. */
int srv_len = sizeof(srv_sun);
/* The cast only changes the pointer type; srv_len tells bind() how many bytes the address occupies. */
int ret = bind(fd, (struct sockaddr *)&srv_sun, srv_len);
...
我们知道sockaddr
和sockaddr_un
的结构不一样。按照UNP的写法:
/* Generic socket address structure, in the layout the article quotes from UNP. */
struct sockaddr{
/* Length member of the UNP layout. */
uint8_t sa_len;
/* Address family (the example code stores AF_UNIX in the corresponding sun_family member). */
sa_family_t sa_family;
/* 14 bytes of address data; presumably protocol-specific -- its meaning is not shown in this excerpt. */
char sa_data[14];
};
/*
 * UNIX-domain socket address structure, in the layout the article quotes from UNP.
 * NOTE(review): this is the UNP (BSD-style) layout; the Linux headers are believed
 * to differ (no sun_len, larger sun_path) -- confirm against <sys/un.h>.
 */
struct sockaddr_un{
/* Length member of the UNP layout. */
uint8_t sun_len;
/* Address family; the example code sets it to AF_UNIX. */
sa_family_t sun_family;
/* Pathname of the socket; the example code copies "/tmp/something_socket" into it. */
char sun_path[104];
};
疑惑1
那么问题来了,strcpy
后,srv_sun
的sun_path
长度明显超过sockaddr
的sa_data
的长度,那么bind
强制转换后是否有问题?
因此就想查一下bind
函数的实现是什么样的,几经周折,最终没有找到直接的bind
函数实现,但是找到了下面的东西。位于xxx/linux/kernels/mips-linux-2.6.31/net/socket.c
。
Location: xxx/linux/kernels/mips-linux-2.6.31/net/socket.c
/*
* Bind a name to a socket. Nothing much to do here since it's
* the protocol's responsibility to handle the local address.
*
* We move the socket address to kernel space before we call
* the protocol layer (having also checked the address is ok).
*
* fd      - descriptor used to look up the socket.
* umyaddr - user-space pointer to the address to bind.
* addrlen - number of bytes of address supplied by the caller.
*
* Returns err: the result of the first step that fails, otherwise the
* result of the protocol's own bind operation.
*/
SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen)
{
struct socket *sock;
/* Kernel-side buffer that receives the copy of the caller's address. */
struct sockaddr_storage address;
int err, fput_needed;
/* Presumably err is filled in through &err when the lookup fails -- confirm in sockfd_lookup_light. */
sock = sockfd_lookup_light(fd, &err, &fput_needed);
if (sock) {
/* Copy addrlen bytes from user space; the (struct sockaddr *) cast only changes the pointer type. */
err = move_addr_to_kernel(umyaddr, addrlen, (struct sockaddr *)&address);
if (err >= 0) {
/* Security hook runs first; the protocol bind is called only if it returns 0. */
err = security_socket_bind(sock,
(struct sockaddr *)&address,
addrlen);
if (!err)
/* Dispatch to the protocol-specific bind through the socket's ops table. */
err = sock->ops->bind(sock,
(struct sockaddr *)
&address, addrlen);
}
/* Counterpart of sockfd_lookup_light; fput_needed was filled in by that call. */
fput_light(sock->file, fput_needed);
}
return err;
}
从上面的代码,看上去似乎是定义了一个系统调用。这里有一个关于SYSCALL_DEFINE3的不错的blog,提供了思路,可惜排版不好,我就自己找源码摘录下来。位于xxx/linux/kernels/mips-linux-2.6.31/include/linux/syscalls.h
。
Location: xxx/linux/kernels/mips-linux-2.6.31/include/linux/syscalls.h
/*
 * __SC_DECLn consumes n (type, name) pairs from a flat argument list and
 * emits them as a parameter list: "type name, type name, ...".
 * Each level emits one pair and hands the rest to the next lower level.
 */
#define __SC_DECL1(t1, a1) t1 a1
#define __SC_DECL2(t2, a2, ...) t2 a2, __SC_DECL1(__VA_ARGS__)
#define __SC_DECL3(t3, a3, ...) t3 a3, __SC_DECL2(__VA_ARGS__)
/* Prefix the syscall name with '_' and forward to SYSCALL_DEFINEx with an argument count of 3. */
#define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)
/*
 * SYSCALL_DEFINEx(x, sname, ...): x is the argument count, sname the
 * '_'-prefixed syscall name. With CONFIG_FTRACE_SYSCALLS it first emits
 * two string tables (types_<sname>, args_<sname>) and a SYSCALL_METADATA
 * entry, then delegates to __SYSCALL_DEFINEx.
 */
#ifdef CONFIG_FTRACE_SYSCALLS
#define SYSCALL_DEFINEx(x, sname, ...) \
static const char *types_##sname[] = { \
__SC_STR_TDECL##x(__VA_ARGS__) \
}; \
static const char *args_##sname[] = { \
__SC_STR_ADECL##x(__VA_ARGS__) \
}; \
SYSCALL_METADATA(sname, x); \
__SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
#else
/* Without CONFIG_FTRACE_SYSCALLS, SYSCALL_DEFINEx is a plain pass-through to __SYSCALL_DEFINEx. */
#define SYSCALL_DEFINEx(x, sname, ...) \
__SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
#endif
/*
 * __SYSCALL_DEFINEx(x, name, ...) produces the function header that the
 * syscall body (the braces following the macro invocation) attaches to.
 *
 * With CONFIG_HAVE_SYSCALL_WRAPPERS it declares sys<name>, defines a
 * SyS<name> wrapper whose parameters come from __SC_LONG<x>, has that
 * wrapper call the static inline SYSC<name> via __SC_CAST<x>, aliases
 * sys<name> to SyS<name>, and ends with the SYSC<name> header so the
 * body that follows becomes SYSC<name>.
 */
#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
#define SYSCALL_DEFINE(name) static inline long SYSC_##name
#define __SYSCALL_DEFINEx(x, name, ...) \
asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__)); \
static inline long SYSC##name(__SC_DECL##x(__VA_ARGS__)); \
asmlinkage long SyS##name(__SC_LONG##x(__VA_ARGS__)) \
{ \
__SC_TEST##x(__VA_ARGS__); \
return (long) SYSC##name(__SC_CAST##x(__VA_ARGS__)); \
} \
SYSCALL_ALIAS(sys##name, SyS##name); \
static inline long SYSC##name(__SC_DECL##x(__VA_ARGS__))
#else /* CONFIG_HAVE_SYSCALL_WRAPPERS */
/*
 * Without wrappers the macro expands directly to the header
 * "asmlinkage long sys<name>(<parameter list>)"; since name already
 * carries the leading '_', bind becomes sys_bind.
 */
#define SYSCALL_DEFINE(name) asmlinkage long sys_##name
#define __SYSCALL_DEFINEx(x, name, ...) \
asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__))
#endif /* CONFIG_HAVE_SYSCALL_WRAPPERS */
照着这个宏一层层剥离:
SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen)
SYSCALL_DEFINEx(3, _bind, __VA_ARGS__)
__SYSCALL_DEFINEx(3, _bind, __VA_ARGS__)
asmlinkage long sys_bind(__SC_DECL3(__VA_ARGS__))
其中 __VA_ARGS__
代表 int, fd, struct sockaddr __user *, umyaddr, int, addrlen
,用__SC_DECL3
逐一展开:
__SC_DECL3(int, fd, struct sockaddr __user *, umyaddr, int, addrlen) --->
int fd, __SC_DECL2(struct sockaddr __user *, umyaddr, int, addrlen) --->
int fd, struct sockaddr __user * umyaddr, __SC_DECL1(int, addrlen) --->
int fd, struct sockaddr __user * umyaddr, int addrlen
那么SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen)
最后表示就是:
asmlinkage long sys_bind(int fd, struct sockaddr __user * umyaddr, int addrlen);
这样解出来的函数其实并不是我们网络编程时候调用的函数bind
,这就很奇怪了,两者是如何联系起来的?这儿我们把疑惑放下,先假设两者是一致的。此时细看一下sys_bind
的实现,注意这几句:
struct sockaddr_storage address;
err = move_addr_to_kernel(umyaddr, addrlen, (struct sockaddr *)&address);
实现:
/*
 * Copy a socket address of ulen bytes from user space (uaddr) into the
 * kernel buffer kaddr.
 *
 * Returns -EINVAL when ulen is negative or larger than
 * sizeof(struct sockaddr_storage), 0 when ulen is 0 (nothing is copied),
 * -EFAULT when copy_from_user() reports a failure, and otherwise the
 * result of audit_sockaddr().
 */
int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr *kaddr)
{
/* The bound is the size of sockaddr_storage, not of struct sockaddr. */
if (ulen < 0 || ulen > sizeof(struct sockaddr_storage))
return -EINVAL;
if (ulen == 0)
return 0;
/* Exactly ulen bytes are copied, i.e. whatever length the caller passed to the syscall. */
if (copy_from_user(kaddr, uaddr, ulen))
return -EFAULT;
return audit_sockaddr(ulen, kaddr);
}
从用户空间拷贝到内核空间时候要检查addrlen
,如果ulen > sizeof(struct sockaddr_storage)
,则会返回-EINVAL。sockaddr_storage
的定义如下:
/* Structure large enough to hold any socket address (with the historical
exception of AF_UNIX). We reserve 128 bytes. */
/* Type of the member that forces the structure's alignment. */
#define __ss_aligntype unsigned long int
/* Total number of bytes reserved for the structure. */
#define _SS_SIZE 128
/* Padding size: the reserved total minus two objects of the alignment type. */
#define _SS_PADSIZE (_SS_SIZE - (2 * sizeof (__ss_aligntype)))
struct sockaddr_storage
{
__SOCKADDR_COMMON (ss_); /* Address family, etc. */
__ss_aligntype __ss_align; /* Force desired alignment. */
/* Filler bytes of size _SS_PADSIZE. */
char __ss_padding[_SS_PADSIZE];
};
重点就是:Structure large enough to hold any socket address
,只要是合法的socket address
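,就都不会超过它的大小(注释里提到AF_UNIX是历史上的例外,但在Linux上sizeof(struct sockaddr_un)为110字节,同样小于128)。至此我们可以得出结论,强转指针不会影响原来的结构,毕竟仅仅是传递指针而已:内核是按调用者传入的addrlen(这里是sizeof(srv_sun))来拷贝地址的,而不是按sizeof(struct sockaddr)拷贝。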
另外,《UNIX 网络编程 卷1: 套接字联网API》的3.3.2节中也给出了明确的表述:
疑惑2
现在再把前面一个疑惑捡回来,我们编程时候使用的bind
函数和sys_bind
有什么关系?直觉来说,网络编程时调用的bind
函数最后一定调用了sys_bind
函数。
网络编程常用接口的内核实现----sys_bind()
Linux 网络协议栈系列
这个系列文章中提到,socket()
调用的是glibc
里面的__socket()
实现。这里岔开一下,先看一下几个术语名词之间的联系、区别:
疑惑3
参考1-socket函数在哪个文件定义
参考2-GLIBC 官网
在socket()库函数到系统调用,再到内核中提到__socket()
函数的路径下并没有找到函数实现。猜测可能是glibc
版本变迁造成的迁移?
__attribute__ 之weak,alias属性中提到weak
与alias
属性,同时提到socket
函数在glibc
库中实现方式。
上面这段描述倒是与socket()库函数到系统调用,再到内核提到的socket
调用路径是一致的。说明早期glibc
库中关于socket
函数实现就是这个思路。
问题来了: 最近版本glibc
[2.19 与 2.22]中相应socket
函数如何实现的?
首先,我们将前面文章中提到的调用关系重复一下:
glibc-2.3.6
- socket()调用如下:
- socket()->__socket():glibc-2.3.6/sysdeps/generic/socket.c (weak_alias(name1, name2))
- __socket():glibc-2.3.6/sysdeps/unix/sysv/linux/i386/socket.S
- ENTER_KERNEL:
movl $SYS_ify(socketcall), %eax /* System call number in %eax. */
/* Use ## so `socket' is a separate token that might be #define'd. */
movl $P(SOCKOP_,socket), %ebx /* Subcode is first arg to syscall. */
lea 4(%esp), %ecx /* Address of args is 2nd arg. */
/* Do the system call trap. */
ENTER_KERNEL
weak_alias(name1,name2)
为标号name1定义一个弱别名name2。仅当name2没有在其他任何地方定义时,链接器才会用name1来解析对name2的引用。在文件中定义的标号name1也会同样处理。
glibc-2.19
- socket()->__socket():glibc-2.19/socket/socket.c
/* Create a new socket of type TYPE in domain DOMAIN, using
protocol PROTOCOL. If PROTOCOL is zero, one is chosen automatically.
Returns a file descriptor for the new socket, or -1 for errors. */
/* This version is a stub: it ignores its arguments, sets errno to
ENOSYS and always returns -1. */
int
__socket (domain, type, protocol)
int domain;
int type;
int protocol;
{
__set_errno (ENOSYS);
return -1;
}
/* Make socket a weak alias of __socket. */
weak_alias (__socket, socket)
/* Presumably marks socket as an unimplemented stub -- confirm against the stub_warning definition. */
stub_warning (socket)
- __socket():glibc-2.19/sysdeps/unix/sysv/linux/i386/socket.S
/* Save registers. */
movl %ebx, %edx
cfi_register (3, 2)
movl $SYS_ify(socketcall), %eax /* System call number in %eax. */
/* Use ## so `socket' is a separate token that might be #define'd. */
movl $P(SOCKOP_,socket), %ebx /* Subcode is first arg to syscall. */
lea 4(%esp), %ecx /* Address of args is 2nd arg. */
/* Do the system call trap. */
ENTER_KERNEL
这说明直到2.19,glibc
的socket
函数调用方式还是一致的。
glibc-2.22
- socket()->__socket():glibc-2.22/socket/socket.c
weak_alias
的函数依然在这里定义。
- __socket():glibc-2.22/sysdeps/unix/sysv/linux/socket.c
/*
 * Linux implementation of __socket: forwards its three arguments, in
 * order, to SOCKETCALL with the operation name "socket" and returns the
 * result.
 * NOTE(review): the parameters are named (fd, type, domain), unlike the
 * generic stub's (domain, type, protocol). They are forwarded
 * positionally, so only the names differ.
 */
int
__socket (int fd, int type, int domain)
{
return SOCKETCALL (socket, fd, type, domain);
}
libc_hidden_def (__socket)
/* Make socket a weak alias of __socket. */
weak_alias (__socket, socket)
- SOCKETCALL:glibc-2.22/sysdeps/unix/sysv/linux/socketcall.h
/*
 * Three-argument variant (the only one shown in this excerpt): packs the
 * arguments into a long int[3] compound literal and passes the operation
 * code plus that array to INLINE_SYSCALL (socketcall, 2, ...).
 */
#define __SOCKETCALL3(name, a1, a2, a3) \
INLINE_SYSCALL (socketcall, 2, name, \
((long int [3]) { (long int) (a1), (long int) (a2), (long int) (a3) }))
/* Evaluates to its ninth argument. */
#define __SOCKETCALL_NARGS_X(a,b,c,d,e,f,g,h,n,...) n
/*
 * Appends 7..0 to the argument list so that the ninth argument is the
 * number of arguments after the first one
 * (e.g. SOCKOP_socket, fd, type, domain -> 3).
 */
#define __SOCKETCALL_NARGS(...) \
__SOCKETCALL_NARGS_X (__VA_ARGS__,7,6,5,4,3,2,1,0,)
/* Two-level token paste so that both operands are macro-expanded before ## is applied. */
#define __SOCKETCALL_CONCAT_X(a,b) a##b
#define __SOCKETCALL_CONCAT(a,b) __SOCKETCALL_CONCAT_X (a, b)
/* Builds the macro name b<N> from the counted arguments and invokes it with all of them. */
#define __SOCKETCALL_DISP(b,...) \
__SOCKETCALL_CONCAT (b,__SOCKETCALL_NARGS(__VA_ARGS__))(__VA_ARGS__)
/* Dispatches to __SOCKETCALL<N>, where N is the number of arguments after the operation code. */
#define __SOCKETCALL(...) __SOCKETCALL_DISP (__SOCKETCALL, __VA_ARGS__)
/*
 * Entry point: prefixes the operation name with SOCKOP_ and evaluates, as
 * a statement expression, to the long int result of __SOCKETCALL.
 */
#define SOCKETCALL(name, args...) \
({ \
long int sc_ret = __SOCKETCALL (SOCKOP_##name, args); \
sc_ret; \
})
OK,这段鬼代码翻译一下就是
SOCKETCALL (socket, fd, type, domain); --->
__SOCKETCALL (SOCKOP_socket, fd, type, domain); --->
__SOCKETCALL_DISP (__SOCKETCALL,SOCKOP_socket, fd, type, domain); --->
__SOCKETCALL_CONCAT (__SOCKETCALL,__SOCKETCALL_NARGS(SOCKOP_socket, fd, type, domain))(SOCKOP_socket, fd, type, domain); --->
__SOCKETCALL_CONCAT_X (__SOCKETCALL, __SOCKETCALL_NARGS_X(SOCKOP_socket, fd, type, domain,7,6,5,4,3,2,1,0,))(SOCKOP_socket, fd, type, domain); --->
__SOCKETCALL_CONCAT_X (__SOCKETCALL, 3)(SOCKOP_socket, fd, type, domain); --->
__SOCKETCALL3(SOCKOP_socket, fd, type, domain); --->
INLINE_SYSCALL (socketcall, 2, SOCKOP_socket, ((long int [3]) { (long int) (fd), (long int) (type), (long int) (domain) }))
这INLINE_SYSCALL
又是什么鬼东西?在代码里面搜了搜,还比较多,其中:glibc-2.22/sysdeps/unix/sysdep.h里面的定义比较靠谱,毕竟是与上面的代码有共同的祖先目录。
/* Wrappers around system calls should normally inline the system call code.
But sometimes it is not possible or implemented and we use this code. */
/* Fallback, used only when INLINE_SYSCALL is not already defined: expands
to a call of __syscall_<name> with the given args; nr is not used. */
#ifndef INLINE_SYSCALL
#define INLINE_SYSCALL(name, nr, args...) __syscall_##name (args)
#endif
上面说:包裹系统调用的函数一般会内联系统调用的代码。从上面的宏定义看,我们的INLINE_SYSCALL
是解不开的。那么这些内联代码应该是在glibc-2.22/sysdeps/unix/sysv/linux/下的子目录里面,各个子目录是不同的硬件平台: