0
点赞
收藏
分享

微信扫一扫

一次LINUX网络协议栈的探根之旅

一次LINUX网络协议栈的探根之旅

公司的公共库中发现一段代码,简单化之,如下

...
struct sockaddr_un srv_sun;
int fd = socket(AF_UNIX, SOCK_STREAM, 0);
srv_sun.sun_family = AF_UNIX;
strcpy(srv_sun.sun_path,"/tmp/something_socket");
int srv_len = sizeof(srv_sun);
int ret = bind(fd, (struct sockaddr *)&srv_sun, srv_len);
...

我们知道 sockaddr 和 sockaddr_un 的结构不一样。按照UNP的写法:

struct sockaddr{
    uint8_t     sa_len;
    sa_family_t sa_family;
    char        sa_data[14];
};

struct sockaddr_un{
    uint8_t         sun_len;
    sa_family_t     sun_family; 
    char            sun_path[104];      
};

疑惑1

那么问题来了,strcpy后,srv_sun 的 sun_path 长度明显超过 sockaddr 的 sa_data 的长度,那么bind强制转换后是否有问题?
因此就想查一下bind函数的实现是什么样的,辗转反侧,最终没有找到直接的bind函数实现,但是找到了下面的东西。位于xxx/linux/kernels/mips-linux-2.6.31/net/socket.c

Location: xxx/linux/kernels/mips-linux-2.6.31/net/socket.c

/*
 *      Bind a name to a socket. Nothing much to do here since it's
 *      the protocol's responsibility to handle the local address.
 *
 *      We move the socket address to kernel space before we call
 *      the protocol layer (having also checked the address is ok).
 */

SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen)
{
        struct socket *sock;
        /* Kernel-side buffer that receives the caller's address. */
        struct sockaddr_storage address;
        int err, fput_needed;

        /*
         * Resolve fd to its socket. Presumably err is filled in when the
         * lookup fails, since it is returned untouched in that case --
         * confirm against sockfd_lookup_light().
         */
        sock = sockfd_lookup_light(fd, &err, &fput_needed);
        if (sock) {
                /*
                 * Copy addrlen bytes from user space into `address`. The
                 * (struct sockaddr *) cast only changes the pointer type;
                 * the amount copied is governed by addrlen.
                 */
                err = move_addr_to_kernel(umyaddr, addrlen, (struct sockaddr *)&address);
                if (err >= 0) {
                        /* Security hook runs first; a non-zero result skips the protocol bind. */
                        err = security_socket_bind(sock,
                                                   (struct sockaddr *)&address,
                                                   addrlen);
                        /* Hand off to the protocol-specific bind implementation. */
                        if (!err)
                                err = sock->ops->bind(sock,
                                                      (struct sockaddr *)
                                                      &address, addrlen);
                }
                /* NOTE(review): presumably drops the reference taken by sockfd_lookup_light() -- confirm. */
                fput_light(sock->file, fput_needed);
        }
        return err;
}

从上面的代码,看上去似乎是定义了一个系统调用。这里有一个关于SYSCALL_DEFINE3的不错的blog,提供了思路,可惜排版不好,我就自己找源码摘录下来。位于xxx/linux/kernels/mips-linux-2.6.31/include/linux/syscalls.h

Location: xxx/linux/kernels/mips-linux-2.6.31/include/linux/syscalls.h

#define __SC_DECL1(t1, a1)      t1 a1
#define __SC_DECL2(t2, a2, ...) t2 a2, __SC_DECL1(__VA_ARGS__)
#define __SC_DECL3(t3, a3, ...) t3 a3, __SC_DECL2(__VA_ARGS__)

#define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)

#ifdef CONFIG_FTRACE_SYSCALLS
    #define SYSCALL_DEFINEx(x, sname, ...)                          \
            static const char *types_##sname[] = {                  \
                    __SC_STR_TDECL##x(__VA_ARGS__)                  \
            };                                                      \
            static const char *args_##sname[] = {                   \
                    __SC_STR_ADECL##x(__VA_ARGS__)                  \
            };                                                      \
            SYSCALL_METADATA(sname, x);                             \
            __SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
#else
    #define SYSCALL_DEFINEx(x, sname, ...)                          \
            __SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
#endif


#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
    #define SYSCALL_DEFINE(name) static inline long SYSC_##name
    #define __SYSCALL_DEFINEx(x, name, ...)                                 \
            asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__));           \
            static inline long SYSC##name(__SC_DECL##x(__VA_ARGS__));       \
            asmlinkage long SyS##name(__SC_LONG##x(__VA_ARGS__))            \
            {                                                               \
                    __SC_TEST##x(__VA_ARGS__);                              \
                    return (long) SYSC##name(__SC_CAST##x(__VA_ARGS__));    \
            }                                                               \
            SYSCALL_ALIAS(sys##name, SyS##name);                            \
            static inline long SYSC##name(__SC_DECL##x(__VA_ARGS__))
#else /* CONFIG_HAVE_SYSCALL_WRAPPERS */
    #define SYSCALL_DEFINE(name) asmlinkage long sys_##name
    #define __SYSCALL_DEFINEx(x, name, ...)                                 \
            asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__))
#endif /* CONFIG_HAVE_SYSCALL_WRAPPERS */

照着这个宏一层层剥离:

SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen)
    SYSCALL_DEFINEx(3, _bind, __VA_ARGS__)
        __SYSCALL_DEFINEx(3, _bind, __VA_ARGS__)
            asmlinkage long sys_bind(__SC_DECL3(__VA_ARGS__))

其中 __VA_ARGS__ 代表 int, fd, struct sockaddr __user *, umyaddr, int, addrlen,用__SC_DECL3逐一展开:

__SC_DECL3(int, fd, struct sockaddr __user *, umyaddr, int, addrlen) --->
int fd, __SC_DECL2(struct sockaddr __user *, umyaddr, int, addrlen)  --->
int fd, struct sockaddr __user * umyaddr, __SC_DECL1(int, addrlen)   --->
int fd, struct sockaddr __user * umyaddr, int addrlen

那么SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen)最后表示就是:

asmlinkage long sys_bind(int fd, struct sockaddr __user * umyaddr, int addrlen);

这样解出来的函数其实并不是我们网络编程时候调用的函数bind,这就很奇怪了,两者是如何联系起来的?这儿我们把疑惑放下,先假设两者是一致的。此时细看一下sys_bind的实现,注意这几句:

struct sockaddr_storage address;
err = move_addr_to_kernel(umyaddr, addrlen, (struct sockaddr *)&address);

实现:
/*
 * Copy a socket address of ulen bytes from the user pointer uaddr into the
 * kernel buffer kaddr.
 *
 * Returns -EINVAL when ulen is negative or larger than
 * sizeof(struct sockaddr_storage), 0 when ulen is 0 (nothing is copied),
 * -EFAULT when copy_from_user() reports a failure, and otherwise whatever
 * audit_sockaddr() returns.
 */
int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr *kaddr)
{
        /* Bound the copy by the size of sockaddr_storage, not by the type of kaddr. */
        if (ulen < 0 || ulen > sizeof(struct sockaddr_storage))
                return -EINVAL;
        /* An empty address is accepted without touching kaddr. */
        if (ulen == 0)
                return 0;
        if (copy_from_user(kaddr, uaddr, ulen))
                return -EFAULT;
        return audit_sockaddr(ulen, kaddr);
}

从用户空间拷贝到内核空间时候要检查addrlen,如果ulen > sizeof(struct sockaddr_storage),则会报错。sockaddr_storage的定义如下:

/* Structure large enough to hold any socket address (with the historical
   exception of AF_UNIX).  We reserve 128 bytes.  */
#define __ss_aligntype  unsigned long int
#define _SS_SIZE        128
#define _SS_PADSIZE     (_SS_SIZE - (2 * sizeof (__ss_aligntype)))

struct sockaddr_storage
  {
    __SOCKADDR_COMMON (ss_);    /* Address family, etc.  */
    __ss_aligntype __ss_align;  /* Force desired alignment.  */
    char __ss_padding[_SS_PADSIZE];
  };

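重点就是:Structure large enough to hold any socket address,只要是合法的socket address,就都不会超过它的大小(注释里提到的AF_UNIX历史例外在Linux上不构成问题:Linux的sockaddr_un是2字节的sun_family加108字节的sun_path,共110字节,小于128)。至此我们可以得出结论,强转指针不会影响原来的结构,毕竟仅仅是传递指针而已;内核实际拷贝多少字节由第三个参数addrlen(这里是sizeof(srv_sun))决定,而不是由指针的类型决定。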

另外,《UNIX 网络编程 卷1: 套接字联网API》的3.3.2节中也给出了明确的表述:

疑惑2

现在再把前面一个疑惑捡回来,我们编程时候使用的bind函数和sys_bind有什么关系?直觉来说,网络编程时调用的bind函数最后一定调用了sys_bind函数。

网络编程常用接口的内核实现----sys_bind()
Linux 网络协议栈系列
这个系列文章中提到,socket()调用的是glibc里面的__socket()实现。这里岔开一下,先看一下几个术语名词之间的联系、区别:

疑惑3

参考1-socket函数在哪个文件定义
参考2-GLIBC 官网

在socket()库函数到系统调用,再到内核中提到__socket()函数的路径下并没有找到函数实现。猜测可能是glibc版本变迁造成的迁移?

__attribute__ 之weak,alias属性中提到 weak 和 alias 属性,同时提到socket函数在glibc库中实现方式。

上面这段描述倒是与socket()库函数到系统调用,再到内核提到的socket调用路径是一致的。说明早期glibc库中关于socket函数实现就是这个思路。

问题来了: 最近版本glibc[2.19 与 2.22]中相应socket函数如何实现的?

首先,我们将前面文章中提到的调用关系重复一下:

glibc-2.3.6

  1. socket()调用如下:
    1. socket()->__socket():glibc-2.3.6/sysdeps/generic/socket.c (weak_alias(name1, name2))
    2. __socket():glibc-2.3.6/sysdeps/unix/sysv/linux/i386/socket.S
  2. ENTER_KERNEL:
movl $SYS_ify(socketcall), %eax   /* System call number in %eax.  */  
/* Use ## so `socket' is a separate token that might be #define'd.  */  
movl $P(SOCKOP_,socket), %ebx   /* Subcode is first arg to syscall.  */ 
lea 4(%esp), %ecx       /* Address of args is 2nd arg.  */  
/* Do the system call trap.  */  
ENTER_KERNEL 

weak_alias(name1,name2)
为标号name1定义一个弱化名name2。仅当name2没有在任何地方定义时,连接器就会用name1解析name2相关的符号。在文件中定义的标号name1也会同样处理。

glibc-2.19

  • socket()->__socket():glibc-2.19/socket/socket.c
/* Create a new socket of type TYPE in domain DOMAIN, using
   protocol PROTOCOL.  If PROTOCOL is zero, one is chosen automatically.
   Returns a file descriptor for the new socket, or -1 for errors.  */
int
__socket (domain, type, protocol)
     int domain;
     int type;
     int protocol;
{
  __set_errno (ENOSYS);
  return -1;
}

weak_alias (__socket, socket)
stub_warning (socket)
  • __socket():glibc-2.19/sysdeps/unix/sysv/linux/i386/socket.S
    /* Save registers.  */
    movl %ebx, %edx
    cfi_register (3, 2)

    movl $SYS_ify(socketcall), %eax /* System call number in %eax.  */

    /* Use ## so `socket' is a separate token that might be #define'd.  */
    movl $P(SOCKOP_,socket), %ebx   /* Subcode is first arg to syscall.  */
    lea 4(%esp), %ecx       /* Address of args is 2nd arg.  */

        /* Do the system call trap.  */
    ENTER_KERNEL

这说明直到2.19,glibc 的 socket 函数调用方式还是一致的。

glibc-2.22

  1. socket()->__socket():glibc-2.22/socket/socket.c weak_alias的函数依然在这里定义。
  2. __socket():glibc-2.22/sysdeps/unix/sysv/linux/socket.c
/* Issue the `socket` operation through the SOCKETCALL macro, forwarding the
   three arguments positionally and returning the macro's result.
   NOTE(review): the parameter names (fd, type, domain) do not match the
   usual socket(domain, type, protocol) naming; only the order matters for
   the call, so this is presumably just naming -- confirm against upstream.  */
int
__socket (int fd, int type, int domain)
{
  return SOCKETCALL (socket, fd, type, domain);
}
libc_hidden_def (__socket)
/* Publish `socket` as a weak alias of __socket.  */
weak_alias (__socket, socket)
  3. SOCKETCALL:glibc-2.22/sysdeps/unix/sysv/linux/socketcall.h
#define __SOCKETCALL3(name, a1, a2, a3) \
  INLINE_SYSCALL (socketcall, 2, name, \
     ((long int [3]) { (long int) (a1), (long int) (a2), (long int) (a3) }))
#define __SOCKETCALL_NARGS_X(a,b,c,d,e,f,g,h,n,...) n
#define __SOCKETCALL_NARGS(...) \
  __SOCKETCALL_NARGS_X (__VA_ARGS__,7,6,5,4,3,2,1,0,)
#define __SOCKETCALL_CONCAT_X(a,b)     a##b
#define __SOCKETCALL_CONCAT(a,b)       __SOCKETCALL_CONCAT_X (a, b)
#define __SOCKETCALL_DISP(b,...) \
  __SOCKETCALL_CONCAT (b,__SOCKETCALL_NARGS(__VA_ARGS__))(__VA_ARGS__)
#define __SOCKETCALL(...) __SOCKETCALL_DISP (__SOCKETCALL, __VA_ARGS__)
#define SOCKETCALL(name, args...)                   \
  ({                                    \
    long int sc_ret = __SOCKETCALL (SOCKOP_##name, args);       \
    sc_ret;                             \
  })    

OK,这段鬼代码翻译一下就是

SOCKETCALL (socket, fd, type, domain);  --->
__SOCKETCALL (SOCKOP_socket, fd, type, domain); --->
__SOCKETCALL_DISP (__SOCKETCALL,SOCKOP_socket, fd, type, domain); --->
__SOCKETCALL_CONCAT (__SOCKETCALL,__SOCKETCALL_NARGS(SOCKOP_socket, fd, type, domain))(SOCKOP_socket, fd, type, domain); --->
__SOCKETCALL_CONCAT_X (__SOCKETCALL, __SOCKETCALL_NARGS_X(SOCKOP_socket, fd, type, domain,7,6,5,4,3,2,1,0,))(SOCKOP_socket, fd, type, domain); --->
__SOCKETCALL_CONCAT_X (__SOCKETCALL, 3)(SOCKOP_socket, fd, type, domain); --->
__SOCKETCALL3(SOCKOP_socket, fd, type, domain); --->
INLINE_SYSCALL (socketcall, 2, SOCKOP_socket, ((long int [3]) { (long int) (fd), (long int) (type), (long int) (domain) }))

INLINE_SYSCALL又是什么鬼东西?在代码里面搜了搜,还比较多,其中:glibc-2.22/sysdeps/unix/sysdep.h里面的定义比较靠谱,毕竟是与上面的代码有共同的祖先目录。

/* Wrappers around system calls should normally inline the system call code.
   But sometimes it is not possible or implemented and we use this code.  */
#ifndef INLINE_SYSCALL
#define INLINE_SYSCALL(name, nr, args...) __syscall_##name (args)
#endif

上面说:包裹系统调用的函数一般会内联系统调用的代码。从上面的宏定义看,我们的INLINE_SYSCALL是解不开的。那么这些内联代码应该是在glibc-2.22/sysdeps/unix/sysv/linux/下的子目录里面,各个子目录是不同的硬件平台:

QQ截图20151105174900.jpg-381kB

QQ截图20151105174900.jpg-381kB

以i386为例,看一下:glibc-2.22/sysdeps/unix/sysv/linux/i386/sysdep.h

#undef INLINE_SYSCALL
#define INLINE_SYSCALL(name, nr, args...) \
  ({                                          \
    unsigned int resultvar = INTERNAL_SYSCALL (name, , nr, args);         \
    if (__glibc_unlikely (INTERNAL_SYSCALL_ERROR_P (resultvar, )))        \
      {                                       \
    __set_errno (INTERNAL_SYSCALL_ERRNO (resultvar, ));           \
    resultvar = 0xffffffff;                           \
      }                                       \
    (int) resultvar; })
    
INLINE_SYSCALL (socketcall, 2, SOCKOP_socket, ((long int [3]) { (long int) (fd), (long int) (type), (long int) (domain) })) --->

 ({
    unsigned int resultvar = INTERNAL_SYSCALL (socketcall, , 2, SOCKOP_socket, ((long int [3]) { (long int) (fd), (long int) (type), (long int) (domain) }));
    if (__glibc_unlikely (INTERNAL_SYSCALL_ERROR_P (resultvar, )))       
    {                                     
        __set_errno (INTERNAL_SYSCALL_ERRNO (resultvar, ));      
        resultvar = 0xffffffff;                      
    }                       
    (int) resultvar; 
})

其中,

#define INTERNAL_SYSCALL(name, err, nr, args...) \
  ({                                          \
    register unsigned int resultvar;                          \
    INTERNAL_SYSCALL_MAIN_##nr (name, err, args);                 \
    (int) resultvar; })
#define INTERNAL_SYSCALL_MAIN_2(name, err, args...) \
    INTERNAL_SYSCALL_MAIN_INLINE(name, err, 2, args)    
#  define INTERNAL_SYSCALL_MAIN_INLINE(name, err, nr, args...) \
    EXTRAVAR_##nr                                 \
    asm volatile (                                \
    LOADARGS_##nr                                 \
    "movl %1, %%eax\n\t"                              \
    "call *%%gs:%P2\n\t"                              \
    RESTOREARGS_##nr                                  \
    : "=a" (resultvar)                                \
    : "i" (__NR_##name), "i" (offsetof (tcbhead_t, sysinfo))              \
      ASMFMT_##nr(args) : "memory", "cc")
      
 INTERNAL_SYSCALL (socketcall, , 2, SOCKOP_socket, ((long int [3]) { (long int) (fd), (long int) (type), (long int) (domain) })) --->
 
({                                        
    register unsigned int resultvar;                          
    INTERNAL_SYSCALL_MAIN_2 (socketcall, , SOCKOP_socket, ((long int [3]) { (long int) (fd), (long int) (type), (long int) (domain) }));
    (int) resultvar; 
})--->

({                                        
    register unsigned int resultvar;                          
    INTERNAL_SYSCALL_MAIN_INLINE (socketcall, , 2, SOCKOP_socket, ((long int [3]) { (long int) (fd), (long int) (type), (long int) (domain) }));
    (int) resultvar; 
})--->
卧槽,到汇编了,我搞不下去了。

尼玛,2.22版本太虐心了,我还是参考2.19吧。那个应该比较好分析一点。

分析glibc-2.19 socket函数

__socket():glibc-2.19/sysdeps/unix/sysv/linux/i386/socket.S

#define P(a, b) P2(a, b)
#define P2(a, b) a##b
...
    /* Save registers.  */
    movl %ebx, %edx
    cfi_register (3, 2)

    movl $SYS_ify(socketcall), %eax /* System call number in %eax.  */

    /* Use ## so `socket' is a separate token that might be #define'd.  */
    movl $P(SOCKOP_,socket), %ebx   /* Subcode is first arg to syscall.  */
    lea 4(%esp), %ecx       /* Address of args is 2nd arg.  */

        /* Do the system call trap.  */
    ENTER_KERNEL

其中,
SYS_ify:glibc-2.19/sysdeps/unix/sysv/linux/i386/sysdep.h

#undef SYS_ify
#define SYS_ify(syscall_name)   __NR_##syscall_name

因此,

/* Save registers.  */
    movl %ebx, %edx
    cfi_register (3, 2)

    movl $__NR_socketcall, %eax /* System call number in %eax.  */

    /* Use ## so `socket' is a separate token that might be #define'd.  */
    movl $SOCKOP_socket, %ebx   /* Subcode is first arg to syscall.  */
    lea 4(%esp), %ecx       /* Address of args is 2nd arg.  */

        /* Do the system call trap.  */
    ENTER_KERNEL

其中,
SOCKOP_socket : glibc-2.19/sysdeps/unix/sysv/linux/socketcall.h

#define SOCKOP_socket       1

__NR_socketcall的定义在glibc源码里找不到,从名字看,应该是一个数字(number):它是系统调用号,由内核头文件提供(i386上在asm/unistd.h中定义为102)。暂时不管,这里面最重要的就是ENTER_KERNEL:glibc-2.19/sysdeps/unix/sysv/linux/i386/sysdep.h

/* The original calling convention for system calls on Linux/i386 is
   to use int $0x80.  */
#ifdef I386_USE_SYSENTER
# ifdef SHARED
#  define ENTER_KERNEL call *%gs:SYSINFO_OFFSET
# else
#  define ENTER_KERNEL call *_dl_sysinfo
# endif
#else
# define ENTER_KERNEL int $0x80
#endif

Linux/i386平台上系统调用的原始调用惯例是使用int $0x80。稍微熟悉i386处理器的应该知道,这条汇编语句是i386的软中断指令。

在许多汇编中常有int $0x80,int $0x80究竟是个什么样的软中断?

Linux用int $0x80来实现系统调用异常。这一指令使用中断/异常向量号128(即16进制的80)将控制权转移给内核。为达到在使用系统调用时不必用机器指令编程,在标准的C语言库中为每一系统调用提供了一段短的子程序,完成机器代码的编程工作。事实上,机器代码段非常简短。它所要做的工作只是将送给系统调用的参数加载到CPU寄存器中,接着执行int $0x80指令。然后运行系统调用,系统调用的返回值将送入CPU的一个寄存器中,标准的库子程序取得这一返回值,并将它送回用户程序。
一般现在的核心有190个左右的系统调用,可以看/usr/src/linux/代码目录下的
init/main.c
arch/i386/kernel/traps.c
arch/i386/kernel/entry.S
arch/i386/kernel/irq.h
include/asm-i386/unistd.h

总结一下socket.S里面汇编代码的意思,参考链接:

  1. 将系统调用号__NR_socketcall存入eax寄存器
  2. 将系统调用子码SOCKOP_socket存入ebx寄存器,子码是syscall的第一个参数。
  3. 用lea计算esp+4这个地址(跳过栈顶的返回地址,正好指向socket()的第一个参数),存入ecx寄存器;这个参数数组的地址就是socketcall的第二个参数。
  4. 靠软中断(也称陷阱)进入内核,内核负责处理系统调用。

TO BE CONTINUED

再了解一下Linux网络协议栈架构:
Linux 网络栈剖析