本文共 17250 字,大约阅读时间需要 57 分钟。
本文接上一篇。上一篇主要分析了Linux内核协议栈涉及到的关键初始化函数,这一篇将分析协议栈从BSD socket层到传输层的流程,采取的方式是分析socket相关的主要系统调用。不同的系统调用到达的协议层深度可能不同:有的基本只到sock层就够了,有的则会涉及tcp等具体协议的细节乃至更底层的实现。本文基本只追溯到传输层的入口,更深入的细节留待后续文章分析。
1.准备
协议的基本分层: (A代表socket的某个系统调用) BSD socket system calls A => proto_ops->A => sock->A => tcp_prot => A
// Generic function hooks shared by the sockets of one protocol family.
// For example, the af_inet instances are defined in ipv4/af_inet.c.
// Apart from socket creation (a system call of its own), essentially all
// socket-layer operations are reached through this table.
struct proto_ops {
    int family;
    struct module *owner;
    int (*release) (struct socket *sock);
    int (*bind) (struct socket *sock, struct sockaddr *myaddr, int sockaddr_len);
    int (*connect) (struct socket *sock, struct sockaddr *vaddr, int sockaddr_len, int flags);
    int (*socketpair)(struct socket *sock1, struct socket *sock2);
    int (*accept) (struct socket *sock, struct socket *newsock, int flags);
    int (*getname) (struct socket *sock, struct sockaddr *addr, int *sockaddr_len, int peer);
    unsigned int (*poll) (struct file *file, struct socket *sock, struct poll_table_struct *wait);
    int (*ioctl) (struct socket *sock, unsigned int cmd, unsigned long arg);
#ifdef CONFIG_COMPAT
    int (*compat_ioctl) (struct socket *sock, unsigned int cmd, unsigned long arg);
#endif
    int (*listen) (struct socket *sock, int len);
    int (*shutdown) (struct socket *sock, int flags);
    int (*setsockopt)(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen);
    /* ... remaining members omitted in this excerpt ... */
};
//传输层的proto //作为sock->sk_prot与具体传输层的hooksstruct proto { void (*close)(struct sock *sk, long timeout); int (*connect)(struct sock *sk, struct sockaddr *uaddr, int addr_len); int (*disconnect)(struct sock *sk, int flags); struct sock * (*accept)(struct sock *sk, int flags, int *err); int (*ioctl)(struct sock *sk, int cmd, unsigned long arg); int (*init)(struct sock *sk); void (*destroy)(struct sock *sk); void (*shutdown)(struct sock *sk, int how); int (*setsockopt)(struct sock *sk, int level, int optname, char __user *optval, unsigned int optlen); int (*getsockopt)(struct sock *sk, int level, int optname, char __user *optval, int __user *option);#ifdef CONFIG_COMPAT int (*compat_setsockopt)(struct sock *sk, int level, int optname, char __user *optval, unsigned int optlen); int (*compat_getsockopt)(struct sock *sk, int level, int optname, char __user *optval, int __user *option); int (*compat_ioctl)(struct sock *sk, unsigned int cmd, unsigned long arg);#endif int (*sendmsg)(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len); int (*recvmsg)(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len, int noblock, int flags, int *addr_len); int (*sendpage)(struct sock *sk, struct page *page, int offset, size_t size, int flags); int (*bind)(struct sock *sk, struct sockaddr *uaddr, int addr_len); /*省略部分*/};
同时附上其他几个关键结构体:
//bsd socket层//include/linux/net.hstruct socket { socket_state state; kmemcheck_bitfield_begin(type); short type; kmemcheck_bitfield_end(type); unsigned long flags; struct socket_wq __rcu *wq; struct file *file; struct sock *sk; const struct proto_ops *ops;};
//sock层struct sock { sock_common __sk_common;#define sk_node __sk_common.skc_node#define sk_nulls_node __sk_common.skc_nulls_node#define sk_refcnt __sk_common.skc_refcnt#define sk_tx_queue_mapping __sk_common.skc_tx_queue_mapping#define sk_dontcopy_begin __sk_common.skc_dontcopy_begin#define sk_dontcopy_end __sk_common.skc_dontcopy_end#define sk_hash __sk_common.skc_hash#define sk_portpair __sk_common.skc_portpair#define sk_num __sk_common.skc_num#define sk_dport __sk_common.skc_dport#define sk_addrpair __sk_common.skc_addrpair#define sk_daddr __sk_common.skc_daddr#define sk_rcv_saddr __sk_common.skc_rcv_saddr#define sk_family __sk_common.skc_family#define sk_state __sk_common.skc_state#define sk_reuse __sk_common.skc_reuse#define sk_reuseport __sk_common.skc_reuseport#define sk_ipv6only __sk_common.skc_ipv6only#define sk_bound_dev_if __sk_common.skc_bound_dev_if#define sk_bind_node __sk_common.skc_bind_node#define sk_prot __sk_common.skc_prot#define sk_net __sk_common.skc_net#define sk_v6_daddr __sk_common.skc_v6_daddr#define sk_v6_rcv_saddr __sk_common.skc_v6_rcv_saddr unsigned long sk_flags; struct dst_entry *sk_rx_dst; struct dst_entry __rcu *sk_dst_cache; spinlock_t sk_dst_lock; atomic_t sk_wmem_alloc; atomic_t sk_omem_alloc; int sk_sndbuf; struct sk_buff_head sk_write_queue; /*省略部分*/ struct pid *sk_peer_pid; const struct cred *sk_peer_cred; long sk_rcvtimeo; long sk_sndtimeo; void *sk_protinfo; struct timer_list sk_timer; ktime_t sk_stamp; u16 sk_tsflags; u32 sk_tskey; struct socket *sk_socket; void *sk_user_data; struct page_frag sk_frag; struct sk_buff *sk_send_head; /*省略部分*/};
2.开始
主要追溯几个典型的socket相关的系统调用,如socket,bind,listen,accept等等
// System call that creates a socket.
// Validates the flag bits carried in `type`, creates the socket with
// sock_create() and then maps it to a file descriptor with sock_map_fd().
SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
{
    int retval;
    struct socket *sock;
    int flags;

    /* Check the SOCK_* constants for consistency. */
    BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
    BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);
    BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);
    BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);

    // Split `type` into its flag bits and the real socket type; any flag
    // other than SOCK_CLOEXEC / SOCK_NONBLOCK is rejected with -EINVAL.
    flags = type & ~SOCK_TYPE_MASK;
    if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
        return -EINVAL;
    type &= SOCK_TYPE_MASK;

    // Where the two constants differ, translate SOCK_NONBLOCK into O_NONBLOCK.
    if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
        flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;

    // Create the socket. Per the article: an inode is allocated and one of
    // its members is handed back as the socket (not visible in this excerpt).
    retval = sock_create(family, type, protocol, &sock);
    if (retval < 0)
        goto out;

    // Map the socket to an fd. Per the article: grab a free fd, allocate a
    // pseudo dentry and a file, and attach the socket file operations to it.
    retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));
    /* ... remainder (including the `out` label) omitted in this excerpt ... */
}
//创建socketpair,注意af_inet协议族下没有pair,af_unix下有SYSCALL_DEFINE4(socketpair, int, family, int, type, int, protocol, int __user *, usockvec){ struct socket *sock1, *sock2; int fd1, fd2, err; struct file *newfile1, *newfile2; int flags; flags = type & ~SOCK_TYPE_MASK; if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) return -EINVAL; type &= SOCK_TYPE_MASK; if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK; //创建socket1 err = sock_create(family, type, protocol, &sock1); if (err < 0) goto out; //创建socket2 err = sock_create(family, type, protocol, &sock2); if (err < 0) goto out_release_1; //调用socket operations的socketpair //关于不同协议层的函数hook,公共结构体是struct proto_ops //对于不同的family,比如af_inet协议族的定义在ipv4/af_inet.c // //对于af_inet没有socketpair //对于af_unix有socketpair err = sock1->ops->socketpair(sock1, sock2); if (err < 0) goto out_release_both; //后面部分就很类似了,找到空fd,分配file,绑定到socket,将file 安装到当前进程 fd1 = get_unused_fd_flags(flags); if (unlikely(fd1 < 0)) { err = fd1; goto out_release_both; } fd2 = get_unused_fd_flags(flags); if (unlikely(fd2 < 0)) { err = fd2; goto out_put_unused_1; } newfile1 = sock_alloc_file(sock1, flags, NULL); if (unlikely(IS_ERR(newfile1))) { err = PTR_ERR(newfile1); goto out_put_unused_both; } newfile2 = sock_alloc_file(sock2, flags, NULL); if (IS_ERR(newfile2)) { err = PTR_ERR(newfile2); goto out_fput_1; } err = put_user(fd1, &usockvec[0]); if (err) goto out_fput_both; err = put_user(fd2, &usockvec[1]); if (err) goto out_fput_both; audit_fd_pair(fd1, fd2); fd_install(fd1, newfile1); fd_install(fd2, newfile2); /* fd1 and fd2 may be already another descriptors. * Not kernel problem. */ return 0;
/*
 * bind() system call: resolves fd to its struct socket, copies the
 * user-supplied address into the kernel, runs the security hook and then
 * dispatches to the protocol family's proto_ops->bind.
 *
 * Returns err from the last step that ran. If the fd lookup fails, err is
 * presumably the value sockfd_lookup_light() stored through &err.
 */
SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen)
{
    struct socket *sock;
    struct sockaddr_storage address;
    int err, fput_needed;

    // Look up the file for fd and, through it, the socket pointer.
    sock = sockfd_lookup_light(fd, &err, &fput_needed);
    if (sock) {
        // Move the address data from user space into the kernel buffer
        // (per the article this ends up calling copy_from_user).
        err = move_addr_to_kernel(umyaddr, addrlen, &address);
        if (err >= 0) {
            // Security hook.
            err = security_socket_bind(sock, (struct sockaddr *)&address, addrlen);
            if (!err)
                // Hand over to the bind defined in this family's proto_ops.
                // Per the article, for af_inet this mainly fills in
                // socket->sock->inet_sock parameters such as the receive
                // address and the port.
                err = sock->ops->bind(sock, (struct sockaddr *) &address, addrlen);
        }
        // Release the file reference obtained by the lookup above.
        fput_light(sock->file, fput_needed);
    }
    return err;
}
/*
 * Puts sk into the listening state.
 *
 * Allocates the accept queue, resets the backlog counters, switches the
 * state to TCP_LISTEN and asks the protocol for the port. On success the
 * sock is hashed and 0 is returned. If get_port() fails, the state goes
 * back to TCP_CLOSE, the accept queue is destroyed and -EADDRINUSE is
 * returned. A non-zero result of reqsk_queue_alloc() is returned as is.
 */
int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
{
    // inet_sock view of sk.
    struct inet_sock *inet = inet_sk(sk);
    // Structure that manages the connection requests.
    struct inet_connection_sock *icsk = inet_csk(sk);
    // Allocate the accept queue (queue of connection requests), sized by
    // nr_table_entries.
    int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries);

    if (rc != 0)
        return rc;

    sk->sk_max_ack_backlog = 0;
    sk->sk_ack_backlog = 0;
    inet_csk_delack_init(sk);

    /* There is race window here: we announce ourselves listening,
     * but this transition is still not validated by get_port().
     * It is OK, because this socket enters to hash table only
     * after validation is complete.
     */
    // Switch the state to listening.
    sk->sk_state = TCP_LISTEN;
    if (!sk->sk_prot->get_port(sk, inet->inet_num)) {
        // Port obtained: record it as source port in network byte order.
        inet->inet_sport = htons(inet->inet_num);

        // Reset the cached dst entry of the sock.
        sk_dst_reset(sk);
        sk->sk_prot->hash(sk);

        return 0;
    }

    // get_port() failed: undo the state change and free the accept queue.
    sk->sk_state = TCP_CLOSE;
    __reqsk_queue_destroy(&icsk->icsk_accept_queue);
    return -EADDRINUSE;
}
//socket.c//accept系统调用SYSCALL_DEFINE4(accept4, int, fd, struct sockaddr __user *, upeer_sockaddr, int __user *, upeer_addrlen, int, flags){ /*省略部分*/ err = -ENFILE; //for client socket newsock = sock_alloc(); if (!newsock) goto out_put; newsock->type = sock->type; newsock->ops = sock->ops; /* * We don't need try_module_get here, as the listening socket (sock) * has the protocol module (sock->ops->owner) held. */ __module_get(newsock->ops->owner); //得到当前进程空fd,分给newsock file newfd = get_unused_fd_flags(flags); if (unlikely(newfd < 0)) { err = newfd; sock_release(newsock); goto out_put; } //从flab分配空file结构 newfile = sock_alloc_file(newsock, flags, sock->sk->sk_prot_creator->name); if (unlikely(IS_ERR(newfile))) { err = PTR_ERR(newfile); put_unused_fd(newfd); sock_release(newsock); goto out_put; } err = security_socket_accept(sock, newsock); if (err) goto out_fd; //proto_ops中的accept //accept从系统调用到具体协议族的某个type的struct proto_ops的accept如af_inet tcp的的accept,再到sock层的accept,然后sock层的accept实际上对应的是具体传输层的struct proto中的accpet,如tcp/udp的struct proto tcp_prot/udp_prot,然后放入newsock err = sock->ops->accept(sock, newsock, sock->file->f_flags); if (err < 0) goto out_fd; if (upeer_sockaddr) { if (newsock->ops->getname(newsock, (struct sockaddr *)&address, &len, 2) < 0) { err = -ECONNABORTED; goto out_fd; } //拷贝client socket addr storage到userspace err = move_addr_to_user(&address, len, upeer_sockaddr, upeer_addrlen); if (err < 0) goto out_fd; } fd_install(newfd, newfile); err = newfd; /*省略部分*/}
//ipv4/af_inet.c//inet family的tcp相关的proto_opsint inet_accept(struct socket *sock, struct socket *newsock, int flags){ struct sock *sk1 = sock->sk; int err = -EINVAL; //进入(网络)sock层,accept新sock struct sock *sk2 = sk1->sk_prot->accept(sk1, flags, &err); if (!sk2) goto do_err; //锁住sock,因为需要操作sock内的request_socket请求队列头 wait_queue_head_t等数据 lock_sock(sk2); sock_rps_record_flow(sk2); WARN_ON(!((1 << sk2->sk_state) & (TCPF_ESTABLISHED | TCPF_SYN_RECV | TCPF_CLOSE_WAIT | TCPF_CLOSE))); sock_graft(sk2, newsock); //设置client socket状态 newsock->state = SS_CONNECTED; err = 0; release_sock(sk2);do_err: return err;}
//ipv4/tcp_ipv4.c//这里进入struct proto tcp_prot中的acceptstruct sock *inet_csk_accept(struct sock *sk, int flags, int *err){ struct inet_connection_sock *icsk = inet_csk(sk); //icsk : inet_connection_sock 面向连接的客户端连接处理相关的信息 //接收队列 struct request_sock_queue *queue = &icsk->icsk_accept_queue; struct sock *newsk; struct request_sock *req; int error; //lock sock lock_sock(sk); //如果不是ACCPET状态转换过来,出错 error = -EINVAL; if (sk->sk_state != TCP_LISTEN) goto out_err; //如果request_sock队列是空的, 利用等待队列挂起当前进程到等待队列,并且将等待队列放入sock中的请求队列头 if (reqsk_queue_empty(queue)) { //如果非阻塞,0,否则为sk的接收时间 long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); error = -EAGAIN; if (!timeo) //如果非阻塞而且接收队列是空,直接返回-EAGAIN goto out_err; //阻塞情况下,等待timeo时间的超时 //利用了等待队列,下面会详细注解 error = inet_csk_wait_for_connect(sk, timeo); if (error) goto out_err; } //不是空,移出一个连接请求 req = reqsk_queue_remove(queue); //连接请求的sock newsk = req->sk; //减少backlog sk_acceptq_removed(sk); //fastopenq? if (sk->sk_protocol == IPPROTO_TCP && queue->fastopenq != NULL) { spin_lock_bh(&queue->fastopenq->lock); if (tcp_rsk(req)->listener) { /* We are still waiting for the final ACK from 3WHS * so can't free req now. Instead, we set req->sk to * NULL to signify that the child socket is taken * so reqsk_fastopen_remove() will free the req * when 3WHS finishes (or is aborted). */ req->sk = NULL; req = NULL; } spin_unlock_bh(&queue->fastopenq->lock); } //ok,清理,返回newsk /*省略部分*/
//ipv4/inet_connection_sock.c//accept连接请求的核心函数static int inet_csk_wait_for_connect(struct sock *sk, long timeo){ struct inet_connection_sock *icsk = inet_csk(sk); //定义一个等待队列wait_queue_t wait 进程是当前进程 DEFINE_WAIT(wait); int err; for (;;) { //sk_leep(sk) : sock的wait_queue_head_t //wait : wait_queue_t //这里将current进程的wait_queue_t加入sk的wait_queue_head_t中,spin锁定 //wait_queue_head_t,设置current状态,然后spin解锁时可能重新schedule prepare_to_wait_exclusive(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); //被唤醒,解锁sock release_sock(sk); //如果请求队列为空,说明timeout了 if (reqsk_queue_empty(&icsk->icsk_accept_queue)) //schedule timeout timeo = schedule_timeout(timeo); //再锁住进行下次循环,准备再次进入TASK_INTERRUPTIBLE lock_sock(sk); err = 0; //检查是否有连接到达, 如果有,break,唤醒等待队列 if (!reqsk_queue_empty(&icsk->icsk_accept_queue)) break; err = -EINVAL; //如果不是listening 状态转过来的, 除错-EINVAL if (sk->sk_state != TCP_LISTEN) break; //检查interrupt错误 err = sock_intr_errno(timeo); //如果当前进程收到信号了,break if (signal_pending(current)) break; //如果传入的timeo为0,则回到nonblock的状态, break err = -EAGAIN; if (!timeo) break; } //ok, 有连接到达,设置state为running, 唤醒wait queue的第一个进程,移除wait_queue_t和wait_queue_head_t finish_wait(sk_sleep(sk), &wait); return err;}