博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
Linux内核-协议栈-从BSD Socket接口层到传输层1
阅读量:6534 次
发布时间:2019-06-24

本文共 17250 字,大约阅读时间需要 57 分钟。

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/feilengcui008/article/details/49530991

本文接上一篇,在上一篇中主要分析了了Linux内核协议栈涉及到的关键初始化函数,在这一篇文章中将分析协议栈的BSD socket和到传输层的流程。采取的方式是分析socket相关的主要系统调用。针对不同的系统调用,其到达的协议层深度可能不同,有的基本只到sock层就够了,但是有些可能需要会涉及到比如tcp的具体细节和更底层的细节。本文基本追溯到传输层的开始,再深入的细节后续文章分析。


1.准备

协议的基本分层:
(A代表socket的某个系统调用)
BSD socket system calls A => proto_ops->A => sock->A => tcp_prot => A

  • BSD socket层和具体协议族某个类型的联系是通过struct proto_ops,在include/linux/net.h中定义了不同协议族如af_inet,af_unix等的通用操作函数指针的结构体struct proto_ops,具体的定义有各个协议族的某个类型的子模块自己完成。比如ipv4/af_inet.c中定义的af_inet family的tcp/udp等相应的struct proto_ops。
  • 由于对于每个family的不同类型,其针对socket的某些需求可能不同,所以抽了一层struct sock出来,sock->sk_prot挂接到具体tcp/udp等传输层的struct proto上(具体定义在ipv4/tcp_ipv4.c,ipv4/udp.c)
  • 另外,由于内容比较多,这一篇主要分析socket,bind,listen,accept几个系统调用,下一篇会涉及connect,send,recv等的分析
//不同协议族的通用函数hooks//比如af_inet相关的定义在ipv4/af_inet.c中//除了创建socket为系统调用外,基本针对socket层的操作函数都在这里面struct proto_ops {    int     family;    struct module   *owner;    int     (*release)   (struct socket *sock);    int     (*bind)      (struct socket *sock,                      struct sockaddr *myaddr,                      int sockaddr_len);    int     (*connect)   (struct socket *sock,                      struct sockaddr *vaddr,                      int sockaddr_len, int flags);    int     (*socketpair)(struct socket *sock1,                      struct socket *sock2);    int     (*accept)    (struct socket *sock,                      struct socket *newsock, int flags);    int     (*getname)   (struct socket *sock,                      struct sockaddr *addr,                      int *sockaddr_len, int peer);    unsigned int    (*poll)      (struct file *file, struct socket *sock,                      struct poll_table_struct *wait);    int     (*ioctl)     (struct socket *sock, unsigned int cmd,                      unsigned long arg);#ifdef CONFIG_COMPAT    int     (*compat_ioctl) (struct socket *sock, unsigned int cmd,                      unsigned long arg);#endif    int     (*listen)    (struct socket *sock, int len);    int     (*shutdown)  (struct socket *sock, int flags);    int     (*setsockopt)(struct socket *sock, int level,                      int optname, char __user *optval, unsigned int optlen);/*省略部分*/};
//传输层的proto //作为sock->sk_prot与具体传输层的hooksstruct proto {    void            (*close)(struct sock *sk,                    long timeout);    int         (*connect)(struct sock *sk,                    struct sockaddr *uaddr,                    int addr_len);    int         (*disconnect)(struct sock *sk, int flags);    struct sock *       (*accept)(struct sock *sk, int flags, int *err);    int         (*ioctl)(struct sock *sk, int cmd,                     unsigned long arg);    int         (*init)(struct sock *sk);    void            (*destroy)(struct sock *sk);    void            (*shutdown)(struct sock *sk, int how);    int         (*setsockopt)(struct sock *sk, int level,                    int optname, char __user *optval,                    unsigned int optlen);    int         (*getsockopt)(struct sock *sk, int level,                    int optname, char __user *optval,                    int __user *option);#ifdef CONFIG_COMPAT    int         (*compat_setsockopt)(struct sock *sk,                    int level,                    int optname, char __user *optval,                    unsigned int optlen);    int         (*compat_getsockopt)(struct sock *sk,                    int level,                    int optname, char __user *optval,                    int __user *option);    int         (*compat_ioctl)(struct sock *sk,                    unsigned int cmd, unsigned long arg);#endif    int         (*sendmsg)(struct kiocb *iocb, struct sock *sk,                       struct msghdr *msg, size_t len);    int         (*recvmsg)(struct kiocb *iocb, struct sock *sk,                       struct msghdr *msg,                       size_t len, int noblock, int flags,                       int *addr_len);    int         (*sendpage)(struct sock *sk, struct page *page,                    int offset, size_t size, int flags);    int         (*bind)(struct sock *sk,                    struct sockaddr *uaddr, int addr_len);    /*省略部分*/};

同时附上其他几个关键结构体:

//bsd socket层//include/linux/net.hstruct socket {    socket_state        state;    kmemcheck_bitfield_begin(type);    short           type;    kmemcheck_bitfield_end(type);    unsigned long       flags;    struct socket_wq __rcu  *wq;    struct file     *file;    struct sock     *sk;    const struct proto_ops  *ops;};
//sock层struct sock { sock_common    __sk_common;#define sk_node         __sk_common.skc_node#define sk_nulls_node       __sk_common.skc_nulls_node#define sk_refcnt       __sk_common.skc_refcnt#define sk_tx_queue_mapping __sk_common.skc_tx_queue_mapping#define sk_dontcopy_begin   __sk_common.skc_dontcopy_begin#define sk_dontcopy_end     __sk_common.skc_dontcopy_end#define sk_hash         __sk_common.skc_hash#define sk_portpair     __sk_common.skc_portpair#define sk_num          __sk_common.skc_num#define sk_dport        __sk_common.skc_dport#define sk_addrpair     __sk_common.skc_addrpair#define sk_daddr        __sk_common.skc_daddr#define sk_rcv_saddr        __sk_common.skc_rcv_saddr#define sk_family       __sk_common.skc_family#define sk_state        __sk_common.skc_state#define sk_reuse        __sk_common.skc_reuse#define sk_reuseport        __sk_common.skc_reuseport#define sk_ipv6only     __sk_common.skc_ipv6only#define sk_bound_dev_if     __sk_common.skc_bound_dev_if#define sk_bind_node        __sk_common.skc_bind_node#define sk_prot         __sk_common.skc_prot#define sk_net          __sk_common.skc_net#define sk_v6_daddr     __sk_common.skc_v6_daddr#define sk_v6_rcv_saddr __sk_common.skc_v6_rcv_saddr    unsigned long       sk_flags;    struct dst_entry    *sk_rx_dst;    struct dst_entry __rcu  *sk_dst_cache;    spinlock_t      sk_dst_lock;    atomic_t        sk_wmem_alloc;    atomic_t        sk_omem_alloc;    int         sk_sndbuf;    struct sk_buff_head sk_write_queue;    /*省略部分*/    struct pid      *sk_peer_pid;    const struct cred   *sk_peer_cred;    long            sk_rcvtimeo;    long            sk_sndtimeo;    void            *sk_protinfo;    struct timer_list   sk_timer;    ktime_t         sk_stamp;    u16         sk_tsflags;    u32         sk_tskey;    struct socket       *sk_socket;    void            *sk_user_data;    struct page_frag    sk_frag;    struct sk_buff      *sk_send_head;    /*省略部分*/};

2.开始

主要追溯几个典型的socket相关的系统调用,如socket,bind,listen,accept等等

  • socket
//创建socket的系统调用SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol){    int retval;    struct socket *sock;    int flags;    /* Check the SOCK_* constants for consistency.  */    BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);    BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);    BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);    BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);    flags = type & ~SOCK_TYPE_MASK;    if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))        return -EINVAL;    type &= SOCK_TYPE_MASK;    if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))        flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;    //分配inode,返回inode中的一个成员作为sock    retval = sock_create(family, type, protocol, &sock);    if (retval < 0)        goto out;    //找个fd映射sock    //得到空fd    //分配伪dentry和file,并将socket file的operations与file挂接     retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));/*省略部分*/}
  • socketpair
//创建socketpair,注意af_inet协议族下没有pair,af_unix下有SYSCALL_DEFINE4(socketpair, int, family, int, type, int, protocol,        int __user *, usockvec){    struct socket *sock1, *sock2;    int fd1, fd2, err;    struct file *newfile1, *newfile2;    int flags;    flags = type & ~SOCK_TYPE_MASK;    if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))        return -EINVAL;    type &= SOCK_TYPE_MASK;    if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))        flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;    //创建socket1     err = sock_create(family, type, protocol, &sock1);    if (err < 0)        goto out;    //创建socket2    err = sock_create(family, type, protocol, &sock2);    if (err < 0)        goto out_release_1;    //调用socket operations的socketpair     //关于不同协议层的函数hook,公共结构体是struct proto_ops     //对于不同的family,比如af_inet协议族的定义在ipv4/af_inet.c    //    //对于af_inet没有socketpair     //对于af_unix有socketpair    err = sock1->ops->socketpair(sock1, sock2);    if (err < 0)        goto out_release_both;    //后面部分就很类似了,找到空fd,分配file,绑定到socket,将file    安装到当前进程    fd1 = get_unused_fd_flags(flags);    if (unlikely(fd1 < 0)) {        err = fd1;        goto out_release_both;    }    fd2 = get_unused_fd_flags(flags);    if (unlikely(fd2 < 0)) {        err = fd2;        goto out_put_unused_1;    }    newfile1 = sock_alloc_file(sock1, flags, NULL);    if (unlikely(IS_ERR(newfile1))) {        err = PTR_ERR(newfile1);        goto out_put_unused_both;    }    newfile2 = sock_alloc_file(sock2, flags, NULL);    if (IS_ERR(newfile2)) {        err = PTR_ERR(newfile2);        goto out_fput_1;    }    err = put_user(fd1, &usockvec[0]);    if (err)        goto out_fput_both;    err = put_user(fd2, &usockvec[1]);    if (err)        goto out_fput_both;    audit_fd_pair(fd1, fd2);    fd_install(fd1, newfile1);    fd_install(fd2, newfile2);    /* fd1 and fd2 may be already another descriptors.     * Not kernel problem.     */    return 0;
  • bind
SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen){    struct socket *sock;    struct sockaddr_storage address;    int err, fput_needed;    //根据fd查找file,进而查找socket指针sock    sock = sockfd_lookup_light(fd, &err, &fput_needed);    if (sock) {        //把用户态地址数据移到内核态        //调用copy_from_user         err = move_addr_to_kernel(umyaddr, addrlen, &address);        if (err >= 0) {            //security hook            err = security_socket_bind(sock,                           (struct sockaddr *)&address,                           addrlen);            if (!err)                //ok, 到具体family定义的proto_ops中的bind                 //比如对af_inet,主要是设置socket->sock->inet_sock的一些参数,比如接收地址,端口什么的                err = sock->ops->bind(sock,                              (struct sockaddr *)                              &address, addrlen);        }        fput_light(sock->file, fput_needed);    }    return err;}
  • listen
    listen所做的事情也比较简单,从系统调用的listen(fd, backlog)到proto_ops 的inet_listen与前面类似,这里分析下inet_listen中的核心函数inet_csk_listen_start(位于ipv4/inet_connection_sock.c中)。
int inet_csk_listen_start(struct sock *sk, const int nr_table_entries){    //获得网络层inte_sock     struct inet_sock *inet = inet_sk(sk);    //管理request connection的结构体      struct inet_connection_sock *icsk = inet_csk(sk);    //分配backlog个长度的accpet_queue的结构连接请求的队列    int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries);    if (rc != 0)        return rc;    sk->sk_max_ack_backlog = 0;    sk->sk_ack_backlog = 0;    inet_csk_delack_init(sk);    /* There is race window here: we announce ourselves listening,     * but this transition is still not validated by get_port().     * It is OK, because this socket enters to hash table only     * after validation is complete.     */    //切换状态到listening     sk->sk_state = TCP_LISTEN;    if (!sk->sk_prot->get_port(sk, inet->inet_num)) {        inet->inet_sport = htons(inet->inet_num);        //更新dst_entry表        sk_dst_reset(sk);        sk->sk_prot->hash(sk);        return 0;    }    sk->sk_state = TCP_CLOSE;    __reqsk_queue_destroy(&icsk->icsk_accept_queue);    return -EADDRINUSE;}
  • accept
    上面socket, socketpair, bind基本只涉及到BSD socket, sock层相关的,过程比较简单,而accept层在sock层和tcp层交互稍微复杂,下面详细分析
//socket.c//accept系统调用SYSCALL_DEFINE4(accept4, int, fd, struct sockaddr __user *, upeer_sockaddr,        int __user *, upeer_addrlen, int, flags){    /*省略部分*/    err = -ENFILE;    //for client socket     newsock = sock_alloc();    if (!newsock)        goto out_put;    newsock->type = sock->type;    newsock->ops = sock->ops;    /*     * We don't need try_module_get here, as the listening socket (sock)     * has the protocol module (sock->ops->owner) held.     */    __module_get(newsock->ops->owner);    //得到当前进程空fd,分给newsock file    newfd = get_unused_fd_flags(flags);    if (unlikely(newfd < 0)) {        err = newfd;        sock_release(newsock);        goto out_put;    }    //从flab分配空file结构    newfile = sock_alloc_file(newsock, flags, sock->sk->sk_prot_creator->name);    if (unlikely(IS_ERR(newfile))) {        err = PTR_ERR(newfile);        put_unused_fd(newfd);        sock_release(newsock);        goto out_put;    }    err = security_socket_accept(sock, newsock);    if (err)        goto out_fd;    //proto_ops中的accept     //accept从系统调用到具体协议族的某个type的struct proto_ops的accept如af_inet tcp的的accept,再到sock层的accept,然后sock层的accept实际上对应的是具体传输层的struct proto中的accpet,如tcp/udp的struct proto tcp_prot/udp_prot,然后放入newsock     err = sock->ops->accept(sock, newsock, sock->file->f_flags);    if (err < 0)        goto out_fd;    if (upeer_sockaddr) {        if (newsock->ops->getname(newsock, (struct sockaddr *)&address,                      &len, 2) < 0) {            err = -ECONNABORTED;            goto out_fd;        }        //拷贝client socket addr storage到userspace        err = move_addr_to_user(&address,                    len, upeer_sockaddr, upeer_addrlen);        if (err < 0)            goto out_fd;    }    fd_install(newfd, newfile);    err = newfd;    /*省略部分*/}
//ipv4/af_inet.c//inet family的tcp相关的proto_opsint inet_accept(struct socket *sock, struct socket *newsock, int flags){    struct sock *sk1 = sock->sk;    int err = -EINVAL;    //进入(网络)sock层,accept新sock     struct sock *sk2 = sk1->sk_prot->accept(sk1, flags, &err);    if (!sk2)        goto do_err;    //锁住sock,因为需要操作sock内的request_socket请求队列头    wait_queue_head_t等数据    lock_sock(sk2);    sock_rps_record_flow(sk2);    WARN_ON(!((1 << sk2->sk_state) &          (TCPF_ESTABLISHED | TCPF_SYN_RECV |          TCPF_CLOSE_WAIT | TCPF_CLOSE)));    sock_graft(sk2, newsock);    //设置client socket状态     newsock->state = SS_CONNECTED;    err = 0;    release_sock(sk2);do_err:    return err;}
//ipv4/tcp_ipv4.c//这里进入struct proto tcp_prot中的acceptstruct sock *inet_csk_accept(struct sock *sk, int flags, int *err){    struct inet_connection_sock *icsk = inet_csk(sk);    //icsk : inet_connection_sock 面向连接的客户端连接处理相关的信息    //接收队列    struct request_sock_queue *queue = &icsk->icsk_accept_queue;    struct sock *newsk;    struct request_sock *req;    int error;    //lock sock    lock_sock(sk);    //如果不是ACCPET状态转换过来,出错    error = -EINVAL;    if (sk->sk_state != TCP_LISTEN)        goto out_err;    //如果request_sock队列是空的, 利用等待队列挂起当前进程到等待队列,并且将等待队列放入sock中的请求队列头    if (reqsk_queue_empty(queue)) {         //如果非阻塞,0,否则为sk的接收时间        long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);        error = -EAGAIN;        if (!timeo)   //如果非阻塞而且接收队列是空,直接返回-EAGAIN            goto out_err;        //阻塞情况下,等待timeo时间的超时        //利用了等待队列,下面会详细注解         error = inet_csk_wait_for_connect(sk, timeo);        if (error)            goto out_err;    }    //不是空,移出一个连接请求     req = reqsk_queue_remove(queue);    //连接请求的sock    newsk = req->sk;    //减少backlog     sk_acceptq_removed(sk);    //fastopenq?    if (sk->sk_protocol == IPPROTO_TCP && queue->fastopenq != NULL) {        spin_lock_bh(&queue->fastopenq->lock);        if (tcp_rsk(req)->listener) {            /* We are still waiting for the final ACK from 3WHS             * so can't free req now. Instead, we set req->sk to             * NULL to signify that the child socket is taken             * so reqsk_fastopen_remove() will free the req             * when 3WHS finishes (or is aborted).             */            req->sk = NULL;            req = NULL;        }        spin_unlock_bh(&queue->fastopenq->lock);    }    //ok,清理,返回newsk    /*省略部分*/
//ipv4/inet_connection_sock.c//accept连接请求的核心函数static int inet_csk_wait_for_connect(struct sock *sk, long timeo){    struct inet_connection_sock *icsk = inet_csk(sk);    //定义一个等待队列wait_queue_t wait 进程是当前进程    DEFINE_WAIT(wait);    int err;    for (;;) {        //sk_leep(sk) : sock的wait_queue_head_t        //wait : wait_queue_t        //这里将current进程的wait_queue_t加入sk的wait_queue_head_t中,spin锁定         //wait_queue_head_t,设置current状态,然后spin解锁时可能重新schedule         prepare_to_wait_exclusive(sk_sleep(sk), &wait,                      TASK_INTERRUPTIBLE);        //被唤醒,解锁sock         release_sock(sk);        //如果请求队列为空,说明timeout了        if (reqsk_queue_empty(&icsk->icsk_accept_queue))            //schedule timeout            timeo = schedule_timeout(timeo);        //再锁住进行下次循环,准备再次进入TASK_INTERRUPTIBLE        lock_sock(sk);        err = 0;        //检查是否有连接到达, 如果有,break,唤醒等待队列         if (!reqsk_queue_empty(&icsk->icsk_accept_queue))            break;        err = -EINVAL;        //如果不是listening 状态转过来的, 除错-EINVAL          if (sk->sk_state != TCP_LISTEN)            break;        //检查interrupt错误        err = sock_intr_errno(timeo);        //如果当前进程收到信号了,break         if (signal_pending(current))            break;        //如果传入的timeo为0,则回到nonblock的状态, break         err = -EAGAIN;        if (!timeo)            break;    }    //ok, 有连接到达,设置state为running, 唤醒wait queue的第一个进程,移除wait_queue_t和wait_queue_head_t     finish_wait(sk_sleep(sk), &wait);    return err;}
你可能感兴趣的文章
你不得不知道的Visual Studio 2012(3)- 创建Windows应用程序
查看>>
Android操作系统2.0制作备份
查看>>
To XSS or not ? 杂谈
查看>>
TFTP服务器在Cisco设备上的应用(上传、下载IOS)
查看>>
获得文件和文件夹的所有权
查看>>
烂泥:学习mysql数据库主从同步复制原理
查看>>
Java相对路径读取文件
查看>>
PostgreSQL 商用版本EPAS(阿里云ppas) 自动(postgresql.conf)参数计算与适配功能
查看>>
烂泥:学习ssh之ssh隧道应用
查看>>
Android TableLayout 常用的属性介绍及演示
查看>>
Ajax跨域访问XML数据的另一种方式——使用YQL查询语句
查看>>
[原创]让您的服务器不再有被挂马的烦恼---文件安全卫士
查看>>
流水线和PC指针
查看>>
Fiddler设置抓取https请求
查看>>
div布局小技巧
查看>>
OCP 12c最新考试原题及答案(071-4)
查看>>
MHA故障切换和在线手工切换原理
查看>>
Python版本切换和Pip安装
查看>>
SilverLigth学习笔记--控制 Silverlight控件样式(转)
查看>>
poj3262
查看>>